author | robot-piglet <robot-piglet@yandex-team.com> | 2024-10-10 10:23:59 +0300
---|---|---
committer | robot-piglet <robot-piglet@yandex-team.com> | 2024-10-10 10:34:20 +0300
commit | 64ced6422a2063c32dd0b137118dd0d9a7fdf5ef (patch) |
tree | 1477fb28803747c3dfdd47ccdd120f3a4c6af8f7 /contrib |
parent | 23e8fd57599306ec8e2604c93c75e661d1ab330f (diff) |
download | ydb-64ced6422a2063c32dd0b137118dd0d9a7fdf5ef.tar.gz |
Intermediate changes
commit_hash:afec429f28a760d2850a90d23b1a8f590aefa0d5
Diffstat (limited to 'contrib')
237 files changed, 19486 insertions, 5845 deletions
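The bulk of this import adds aarch64 NEON/SVE erasure-code kernels plus a runtime dispatcher (ec_aarch64_dispatcher.c picks the SVE, NEON, or base C implementation via getauxval/HWCAP), alongside new AVX2/AVX512 GFNI variants on x86_64. All of the gf_vect_* kernels in the diff below use the same split-nibble table-lookup trick for GF(2^8) multiplication: each coefficient gets a 16-byte low-nibble table and a 16-byte high-nibble table (the 32-byte strides and the paired q_gft*_lo/q_gft*_hi loads in the assembly), and the NEON/SVE `tbl` + `eor` sequence does 16 lookups and the field addition at once. The standalone C sketch below is illustrative only and not part of the patch; it assumes isa-l's usual GF(2^8) polynomial 0x11d and the lo/hi table layout described above, and shows the scalar equivalent of one tbl/tbl/eor step.

```c
/*
 * Illustrative sketch (not from the patch): scalar model of the
 * split-nibble GF(2^8) multiply used by the gf_vect_* kernels.
 * Assumptions: field polynomial 0x11d and a 32-byte per-coefficient
 * table split into a low-nibble half and a high-nibble half.
 */
#include <stdio.h>

/* plain GF(2^8) multiply, reduction polynomial x^8+x^4+x^3+x^2+1 (0x11d) */
static unsigned char gf_mul8(unsigned char a, unsigned char b)
{
	unsigned char p = 0;
	while (b) {
		if (b & 1)
			p ^= a;
		a = (unsigned char)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
		b >>= 1;
	}
	return p;
}

/* build the 16-entry low/high nibble product tables for one coefficient c */
static void build_tbl(unsigned char c, unsigned char lo[16], unsigned char hi[16])
{
	for (int i = 0; i < 16; i++) {
		lo[i] = gf_mul8(c, (unsigned char)i);         /* c * 0x0i */
		hi[i] = gf_mul8(c, (unsigned char)(i << 4));  /* c * 0xi0 */
	}
}

/* scalar equivalent of: tbl (lo table), tbl (hi table), eor */
static unsigned char gf_mul_split(unsigned char x,
				  const unsigned char lo[16],
				  const unsigned char hi[16])
{
	return lo[x & 0x0f] ^ hi[x >> 4];
}

int main(void)
{
	unsigned char lo[16], hi[16];
	build_tbl(0x37, lo, hi);
	/* x = (x & 0x0f) xor (x & 0xf0), and multiplication distributes over xor,
	 * so the two table lookups must agree with the direct multiply. */
	for (int x = 0; x < 256; x++) {
		if (gf_mul_split((unsigned char)x, lo, hi) != gf_mul8(0x37, (unsigned char)x)) {
			printf("mismatch at %d\n", x);
			return 1;
		}
	}
	printf("split-table lookup matches direct GF(2^8) multiply\n");
	return 0;
}
```

The vector kernels simply run this per-byte lookup 16 (NEON) or VL (SVE) lanes at a time and accumulate across the k source vectors with `eor`, which is why the dot-product loops advance the table pointers by 32 bytes per source vector.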
diff --git a/contrib/libs/isa-l/erasure_code/Makefile.am b/contrib/libs/isa-l/erasure_code/Makefile.am index bad2aae2f3..8f334462ac 100644 --- a/contrib/libs/isa-l/erasure_code/Makefile.am +++ b/contrib/libs/isa-l/erasure_code/Makefile.am @@ -1,5 +1,5 @@ ######################################################################## -# Copyright(c) 2011-2017 Intel Corporation All rights reserved. +# Copyright(c) 2011-2019 Intel Corporation All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -27,11 +27,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ######################################################################## +include erasure_code/aarch64/Makefile.am + +include erasure_code/ppc64le/Makefile.am + lsrc += erasure_code/ec_base.c lsrc_base_aliases += erasure_code/ec_base_aliases.c -lsrc_aarch64 += erasure_code/ec_base_aliases.c - lsrc_x86_64 += \ erasure_code/ec_highlevel_func.c \ erasure_code/gf_vect_mul_sse.asm \ @@ -76,14 +78,38 @@ lsrc_x86_64 += \ #if HAVE_AVX512 lsrc_x86_64 += \ + erasure_code/gf_vect_mad_avx2_gfni.asm \ + erasure_code/gf_2vect_mad_avx2_gfni.asm \ + erasure_code/gf_3vect_mad_avx2_gfni.asm \ + erasure_code/gf_4vect_mad_avx2_gfni.asm \ + erasure_code/gf_5vect_mad_avx2_gfni.asm \ erasure_code/gf_vect_dot_prod_avx512.asm \ erasure_code/gf_2vect_dot_prod_avx512.asm \ erasure_code/gf_3vect_dot_prod_avx512.asm \ erasure_code/gf_4vect_dot_prod_avx512.asm \ + erasure_code/gf_5vect_dot_prod_avx512.asm \ + erasure_code/gf_6vect_dot_prod_avx512.asm \ + erasure_code/gf_vect_dot_prod_avx512_gfni.asm \ + erasure_code/gf_vect_dot_prod_avx2_gfni.asm \ + erasure_code/gf_2vect_dot_prod_avx2_gfni.asm \ + erasure_code/gf_3vect_dot_prod_avx2_gfni.asm \ + erasure_code/gf_2vect_dot_prod_avx512_gfni.asm \ + erasure_code/gf_3vect_dot_prod_avx512_gfni.asm \ + erasure_code/gf_4vect_dot_prod_avx512_gfni.asm \ + erasure_code/gf_5vect_dot_prod_avx512_gfni.asm \ + erasure_code/gf_6vect_dot_prod_avx512_gfni.asm \ erasure_code/gf_vect_mad_avx512.asm \ erasure_code/gf_2vect_mad_avx512.asm \ erasure_code/gf_3vect_mad_avx512.asm \ - erasure_code/gf_4vect_mad_avx512.asm + erasure_code/gf_4vect_mad_avx512.asm \ + erasure_code/gf_5vect_mad_avx512.asm \ + erasure_code/gf_6vect_mad_avx512.asm \ + erasure_code/gf_vect_mad_avx512_gfni.asm \ + erasure_code/gf_2vect_mad_avx512_gfni.asm \ + erasure_code/gf_3vect_mad_avx512_gfni.asm \ + erasure_code/gf_4vect_mad_avx512_gfni.asm \ + erasure_code/gf_5vect_mad_avx512_gfni.asm \ + erasure_code/gf_6vect_mad_avx512_gfni.asm lsrc_x86_32 += \ erasure_code/ec_highlevel_func.c \ @@ -143,19 +169,4 @@ perf_tests += erasure_code/gf_vect_mul_perf \ other_tests += erasure_code/gen_rs_matrix_limits -other_tests_x86_64 += \ - erasure_code/gf_2vect_dot_prod_sse_test \ - erasure_code/gf_3vect_dot_prod_sse_test \ - erasure_code/gf_4vect_dot_prod_sse_test \ - erasure_code/gf_5vect_dot_prod_sse_test \ - erasure_code/gf_6vect_dot_prod_sse_test - -other_tests_x86_32 += \ - erasure_code/gf_2vect_dot_prod_sse_test \ - erasure_code/gf_3vect_dot_prod_sse_test \ - erasure_code/gf_4vect_dot_prod_sse_test \ - erasure_code/gf_5vect_dot_prod_sse_test \ - erasure_code/gf_6vect_dot_prod_sse_test - -other_src += include/test.h \ - include/types.h +other_src += include/test.h diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt index 
8f218b47cb..8f218b47cb 100644 --- a/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt +++ b/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt diff --git a/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am b/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am new file mode 100644 index 0000000000..47bbf12d2b --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am @@ -0,0 +1,60 @@ +################################################################## +# Copyright (c) 2019 Huawei Technologies Co., Ltd. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Huawei Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## + +lsrc_aarch64 += \ + erasure_code/aarch64/ec_aarch64_highlevel_func.c \ + erasure_code/aarch64/ec_aarch64_dispatcher.c \ + erasure_code/aarch64/gf_vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_2vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_3vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_4vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_5vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_vect_mad_neon.S \ + erasure_code/aarch64/gf_2vect_mad_neon.S \ + erasure_code/aarch64/gf_3vect_mad_neon.S \ + erasure_code/aarch64/gf_4vect_mad_neon.S \ + erasure_code/aarch64/gf_5vect_mad_neon.S \ + erasure_code/aarch64/gf_6vect_mad_neon.S \ + erasure_code/aarch64/gf_vect_mul_neon.S \ + erasure_code/aarch64/gf_vect_mad_sve.S \ + erasure_code/aarch64/gf_2vect_mad_sve.S \ + erasure_code/aarch64/gf_3vect_mad_sve.S \ + erasure_code/aarch64/gf_4vect_mad_sve.S \ + erasure_code/aarch64/gf_5vect_mad_sve.S \ + erasure_code/aarch64/gf_6vect_mad_sve.S \ + erasure_code/aarch64/gf_vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_2vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_3vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_4vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_5vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_6vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_7vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_8vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_vect_mul_sve.S \ + erasure_code/aarch64/ec_multibinary_arm.S diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c new file mode 100644 index 0000000000..0a11604076 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c @@ -0,0 +1,124 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <aarch64_multibinary.h> + +DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(gf_vect_dot_prod_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(gf_vect_dot_prod_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(gf_vect_dot_prod_sve); + return PROVIDER_INFO(gf_vect_dot_prod_neon); +#endif + return PROVIDER_BASIC(gf_vect_dot_prod); + +} + +DEFINE_INTERFACE_DISPATCHER(gf_vect_mad) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(gf_vect_mad_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(gf_vect_mad_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(gf_vect_mad_sve); + return PROVIDER_INFO(gf_vect_mad_neon); +#endif + return PROVIDER_BASIC(gf_vect_mad); + +} + +DEFINE_INTERFACE_DISPATCHER(ec_encode_data) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(ec_encode_data_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(ec_encode_data_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(ec_encode_data_sve); + return PROVIDER_INFO(ec_encode_data_neon); +#endif + return PROVIDER_BASIC(ec_encode_data); + +} + +DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(ec_encode_data_update_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(ec_encode_data_update_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(ec_encode_data_update_sve); + return PROVIDER_INFO(ec_encode_data_update_neon); +#endif + return PROVIDER_BASIC(ec_encode_data_update); + +} + +DEFINE_INTERFACE_DISPATCHER(gf_vect_mul) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(gf_vect_mul_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(gf_vect_mul_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(gf_vect_mul_sve); + return PROVIDER_INFO(gf_vect_mul_neon); +#endif + return PROVIDER_BASIC(gf_vect_mul); + +} + +DEFINE_INTERFACE_DISPATCHER(ec_init_tables) +{ + return PROVIDER_BASIC(ec_init_tables); +} diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c new file mode 100644 index 0000000000..e001fd72a0 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c @@ -0,0 +1,264 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "erasure_code.h" + +/*external function*/ +extern void gf_vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); +extern void gf_2vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_3vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_4vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_5vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest); +extern void gf_2vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_3vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_4vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_5vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_6vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + +void ec_encode_data_neon(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, + unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_base(len, k, rows, g_tbls, data, coding); + return; + } + + while (rows > 5) { + gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding); + g_tbls += 5 * k * 32; + coding += 5; + rows -= 5; + } + switch (rows) { + case 5: + gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_neon(len, k, g_tbls, data, coding); + break; + case 3: + gf_3vect_dot_prod_neon(len, k, g_tbls, data, coding); + break; + case 2: + gf_2vect_dot_prod_neon(len, k, g_tbls, data, coding); + break; + case 1: + gf_vect_dot_prod_neon(len, k, g_tbls, data, *coding); + break; + case 0: + break; + default: + break; + } +} + +void ec_encode_data_update_neon(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding); + return; + } + while (rows > 6) { + 
gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 5: + gf_5vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_neon(len, k, vec_i, g_tbls, data, *coding); + break; + case 0: + break; + } +} + +/* SVE */ +extern void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); +extern void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest); +extern void gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + +void ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, + unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_base(len, k, rows, g_tbls, data, coding); + return; + } + + while (rows > 11) { + gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + + switch (rows) { + case 11: + /* 7 + 4 */ + gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 7 * k * 32; + coding += 7; + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 10: + /* 6 + 4 */ + gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 9: + /* 5 + 4 */ + gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 5 * k * 32; + coding += 5; + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 8: + /* 4 + 4 */ + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 7: + gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + 
case 6: + gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 5: + gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 3: + gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 2: + gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 1: + gf_vect_dot_prod_sve(len, k, g_tbls, data, *coding); + break; + default: + break; + } +} + +void ec_encode_data_update_sve(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding); + return; + } + while (rows > 6) { + gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 5: + gf_5vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_sve(len, k, vec_i, g_tbls, data, *coding); + break; + default: + break; + } +} diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S b/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S new file mode 100644 index 0000000000..c276e63780 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S @@ -0,0 +1,37 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "aarch64_multibinary.h" + +mbin_interface ec_encode_data +mbin_interface gf_vect_mul +mbin_interface gf_vect_dot_prod +mbin_interface gf_vect_mad +mbin_interface ec_encode_data_update +mbin_interface ec_init_tables diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S new file mode 100644 index 0000000000..4ff7e7ce16 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S @@ -0,0 +1,402 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_2vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_2vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_tbl1 .req x9 +x_tbl2 .req x10 +x_dest1 .req x11 +x_dest2 .req x12 + +/* vectors */ +v_gft1_lo .req v0 +v_gft1_hi .req v1 +v_gft2_lo .req v2 +v_gft2_hi .req v3 +q_gft1_lo .req q0 +q_gft1_hi .req q1 +q_gft2_lo .req q2 +q_gft2_hi .req q3 + +v_mask0f .req v4 +q_mask0f .req q4 + +v_tmp1_lo .req v5 +v_tmp1_hi .req v6 +v_tmp1 .req v7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +v_data_4 .req v12 +v_data_5 .req v13 +v_data_6 .req v14 +v_data_7 .req v15 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 +q_data_4 .req q12 +q_data_5 .req q13 +q_data_6 .req q14 +q_data_7 .req q15 + +v_p1_0 .req v16 +v_p1_1 .req v17 +v_p1_2 .req v18 +v_p1_3 .req v19 +v_p1_4 .req v20 +v_p1_5 .req v21 +v_p1_6 .req v22 +v_p1_7 .req v23 +v_p2_0 .req v24 +v_p2_1 .req v25 +v_p2_2 .req v26 +v_p2_3 .req v27 +v_p2_4 .req v28 +v_p2_5 .req v29 +v_p2_6 .req v30 +v_p2_7 .req v31 + +q_p1_0 .req q16 +q_p1_1 .req q17 +q_p1_2 .req q18 +q_p1_3 .req q19 +q_p1_4 .req q20 +q_p1_5 .req q21 +q_p1_6 .req q22 +q_p1_7 .req q23 +q_p2_0 .req q24 +q_p2_1 .req q25 +q_p2_2 .req q26 +q_p2_3 .req q27 +q_p2_4 .req q28 +q_p2_5 .req q29 +q_p2_6 .req q30 +q_p2_7 .req q31 + +v_p1 .req v_p1_0 +q_p1 .req q_p1_0 +v_p2 .req v_p2_0 +q_p2 .req q_p2_0 +v_data .req v_p1_1 +q_data .req q_p1_1 +v_data_lo .req v_p1_2 +v_data_hi .req v_p1_3 + +cdecl(gf_2vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #128 + +.Lloop128: + movi v_p1_0.16b, #0 + movi v_p1_1.16b, #0 + movi v_p1_2.16b, #0 + movi v_p1_3.16b, #0 + movi v_p1_4.16b, #0 + movi v_p1_5.16b, #0 + movi v_p1_6.16b, #0 + movi v_p1_7.16b, #0 + + movi v_p2_0.16b, #0 + movi v_p2_1.16b, #0 + movi v_p2_2.16b, #0 + movi v_p2_3.16b, #0 + movi v_p2_4.16b, #0 + movi v_p2_5.16b, #0 + movi v_p2_6.16b, #0 + movi v_p2_7.16b, #0 + + mov x_tbl1, x_tbl + add x_tbl2, x_tbl, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop128_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + add x_ptr, x_ptr, x_pos + + ldp q_data_0, q_data_1, [x_ptr], #32 + ldp q_data_2, q_data_3, [x_ptr], #32 + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_data_4, q_data_5, [x_ptr], #32 + ldp q_data_6, q_data_7, [x_ptr], #32 + prfm pldl1strm, [x_ptr] + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + + /* data_0 */ + and v_tmp1.16b, v_data_0.16b, v_mask0f.16b + ushr v_data_0.16b, v_data_0.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, 
v_data_0.16b + eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b + + /* data_1 */ + and v_tmp1.16b, v_data_1.16b, v_mask0f.16b + ushr v_data_1.16b, v_data_1.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b + eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b + eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b + + /* data_2 */ + and v_tmp1.16b, v_data_2.16b, v_mask0f.16b + ushr v_data_2.16b, v_data_2.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b + eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b + eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b + + /* data_3 */ + and v_tmp1.16b, v_data_3.16b, v_mask0f.16b + ushr v_data_3.16b, v_data_3.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b + eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b + eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b + + /* data_4 */ + and v_tmp1.16b, v_data_4.16b, v_mask0f.16b + ushr v_data_4.16b, v_data_4.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b + eor v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b + eor v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b + eor v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b + eor v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b + + /* data_5 */ + and v_tmp1.16b, v_data_5.16b, v_mask0f.16b + ushr v_data_5.16b, v_data_5.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b + eor v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b + eor v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b + eor v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b + eor v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b + + /* data_6 */ + and v_tmp1.16b, v_data_6.16b, v_mask0f.16b + ushr v_data_6.16b, v_data_6.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b + eor v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b + eor v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b + eor v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b + eor v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b + + /* data_7 */ + and v_tmp1.16b, v_data_7.16b, v_mask0f.16b + ushr v_data_7.16b, v_data_7.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b + eor v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b + eor v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b + eor v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b + eor v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop128_vects + +.Lloop128_vects_end: + add x_ptr, x_dest1, 
x_pos + stp q_p1_0, q_p1_1, [x_ptr], #32 + stp q_p1_2, q_p1_3, [x_ptr], #32 + stp q_p1_4, q_p1_5, [x_ptr], #32 + stp q_p1_6, q_p1_7, [x_ptr] + + add x_ptr, x_dest2, x_pos + stp q_p2_0, q_p2_1, [x_ptr], #32 + stp q_p2_2, q_p2_3, [x_ptr], #32 + stp q_p2_4, q_p2_5, [x_ptr], #32 + stp q_p2_6, q_p2_7, [x_ptr] + + add x_pos, x_pos, #128 + cmp x_pos, x_len + ble .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #128 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p1.16b, #0 + movi v_p2.16b, #0 + mov x_tbl1, x_tbl + add x_tbl2, x_tbl, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + ldr q_data, [x_ptr, x_pos] + add x_vec_i, x_vec_i, #8 + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + eor v_p1.16b, v_tmp1_lo.16b, v_p1.16b + eor v_p1.16b, v_p1.16b, v_tmp1_hi.16b + + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + eor v_p2.16b, v_tmp1_lo.16b, v_p2.16b + eor v_p2.16b, v_p2.16b, v_tmp1_hi.16b + + cmp x_vec_i, x_vec + bne .Lloop16_vects + +.Lloop16_vects_end: + str q_p1, [x_dest1, x_pos] + str q_p2, [x_dest2, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S new file mode 100644 index 0000000000..99b5f15cfb --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S @@ -0,0 +1,168 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_2vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_2vect_dot_prod_sve, %function +#endif +/* void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_dest1 .req x10 +x_dest2 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_dest2 .req z27 + +cdecl(gf_2vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S new file mode 100644 index 0000000000..453524a221 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S @@ -0,0 +1,411 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_2vect_mad_neon) +#ifndef __APPLE__ +.type gf_2vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_tmp .req x9 +x_tbl1 .req x10 +x_tbl2 .req x11 +x_const .req x12 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +v_data_4 .req v12 +v_data_5 .req v13 +v_data_6 .req v14 +v_data_7 .req v15 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 +q_data_4 .req q12 +q_data_5 .req q13 +q_data_6 .req q14 +q_data_7 .req q15 + +v_data_0_lo .req v16 +v_data_1_lo .req v17 +v_data_2_lo .req v18 +v_data_3_lo .req v19 +v_data_4_lo .req v20 +v_data_5_lo .req v21 +v_data_6_lo .req v22 +v_data_7_lo .req v23 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 +v_data_4_hi .req v_data_4 +v_data_5_hi .req v_data_5 +v_data_6_hi .req v_data_6 +v_data_7_hi .req v_data_7 + +v_d0 .req v24 +v_d1 .req v25 +v_d2 .req v26 +v_d3 .req v27 +v_d4 .req v28 +v_d5 .req v29 +v_d6 .req v30 +v_d7 .req v31 +q_d0 .req q24 +q_d1 .req q25 +q_d2 .req q26 +q_d3 .req q27 +q_d4 .req q28 +q_d5 .req q29 +q_d6 .req q30 +q_d7 .req q31 + +v_data .req v16 +q_data .req q16 +v_data_lo .req v17 +v_data_hi .req v18 + + +cdecl(gf_2vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_src_end, x_src, x_len + + ldr x_dest1, [x_dest] + ldr x_dest2, [x_dest, #8] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #128 + +.Lloop128: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + ldr q_data_4, [x_src, #16*4] + ldr q_data_5, [x_src, #16*5] + ldr q_data_6, [x_src, #16*6] + ldr q_data_7, [x_src, #16*7] + + ldr q_d0, [x_dest1, #16*0] + ldr q_d1, [x_dest1, #16*1] + ldr q_d2, [x_dest1, #16*2] + ldr q_d3, [x_dest1, #16*3] + ldr q_d4, [x_dest1, #16*4] + ldr q_d5, [x_dest1, #16*5] + ldr q_d6, [x_dest1, #16*6] + ldr q_d7, [x_dest1, #16*7] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b + and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b + and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b + and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, 
v_data_3.16b, #4 + ushr v_data_4_hi.16b, v_data_4.16b, #4 + ushr v_data_5_hi.16b, v_data_5.16b, #4 + ushr v_data_6_hi.16b, v_data_6.16b, #4 + ushr v_data_7_hi.16b, v_data_7.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + eor v_d0.16b, v_tmp_lo.16b, v_d0.16b + eor v_d0.16b, v_d0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1.16b, v_tmp_lo.16b, v_d1.16b + eor v_d1.16b, v_d1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d2.16b, v_tmp_lo.16b, v_d2.16b + eor v_d2.16b, v_d2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d3.16b, v_tmp_lo.16b, v_d3.16b + eor v_d3.16b, v_d3.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b + eor v_d4.16b, v_tmp_lo.16b, v_d4.16b + eor v_d4.16b, v_d4.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b + eor v_d5.16b, v_tmp_lo.16b, v_d5.16b + eor v_d5.16b, v_d5.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b + eor v_d6.16b, v_tmp_lo.16b, v_d6.16b + eor v_d6.16b, v_d6.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b + eor v_d7.16b, v_tmp_lo.16b, v_d7.16b + eor v_d7.16b, v_d7.16b, v_tmp_hi.16b + + str q_d0, [x_dest1, #16*0] + str q_d1, [x_dest1, #16*1] + str q_d2, [x_dest1, #16*2] + str q_d3, [x_dest1, #16*3] + str q_d4, [x_dest1, #16*4] + str q_d5, [x_dest1, #16*5] + str q_d6, [x_dest1, #16*6] + str q_d7, [x_dest1, #16*7] + + ldr q_d0, [x_dest2, #16*0] + ldr q_d1, [x_dest2, #16*1] + ldr q_d2, [x_dest2, #16*2] + ldr q_d3, [x_dest2, #16*3] + ldr q_d4, [x_dest2, #16*4] + ldr q_d5, [x_dest2, #16*5] + ldr q_d6, [x_dest2, #16*6] + ldr q_d7, [x_dest2, #16*7] + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d0.16b, v_tmp_lo.16b, v_d0.16b + eor v_d0.16b, v_d0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d1.16b, v_tmp_lo.16b, v_d1.16b + eor v_d1.16b, v_d1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2.16b, v_tmp_lo.16b, v_d2.16b + eor v_d2.16b, v_d2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d3.16b, v_tmp_lo.16b, v_d3.16b + eor v_d3.16b, v_d3.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b + eor v_d4.16b, v_tmp_lo.16b, v_d4.16b + eor v_d4.16b, v_d4.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b + eor v_d5.16b, v_tmp_lo.16b, v_d5.16b + eor v_d5.16b, v_d5.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b + eor v_d6.16b, v_tmp_lo.16b, v_d6.16b + eor v_d6.16b, v_d6.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b + eor 
v_d7.16b, v_tmp_lo.16b, v_d7.16b + eor v_d7.16b, v_d7.16b, v_tmp_hi.16b + + str q_d0, [x_dest2, #16*0] + str q_d1, [x_dest2, #16*1] + str q_d2, [x_dest2, #16*2] + str q_d3, [x_dest2, #16*3] + str q_d4, [x_dest2, #16*4] + str q_d5, [x_dest2, #16*5] + str q_d6, [x_dest2, #16*6] + str q_d7, [x_dest2, #16*7] + + add x_src, x_src, #128 + add x_dest1, x_dest1, #128 + add x_dest2, x_dest2, #128 + cmp x_src, x_src_end + bls .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #128 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d0, [x_dest1] + ldr q_d1, [x_dest2] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d0.16b, v_tmp_lo.16b, v_d0.16b + eor v_d0.16b, v_d0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d1.16b, v_tmp_lo.16b, v_d1.16b + eor v_d1.16b, v_d1.16b, v_tmp_hi.16b + + str q_d0, [x_dest1] + str q_d1, [x_dest2] + + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_src, x_src, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d0, [x_dest1] + ldr q_d1, [x_dest2] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d0.16b, v_d0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1.16b, v_d1.16b, v_tmp_hi.16b + + str q_d0, [x_dest1] + str q_d1, [x_dest2] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S new file mode 100644 index 0000000000..f0ddf01187 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S @@ -0,0 +1,152 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_2vect_mad_sve) +#ifndef __APPLE__ +.type gf_2vect_mad_sve, %function +#endif + +/* gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_dest2 .req z27 + +cdecl(gf_2vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* prefetch dest data */ + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S new file mode 100644 index 0000000000..cff34fc3dd --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S @@ -0,0 +1,361 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_3vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_3vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_dest1 .req x9 +x_tbl1 .req x10 +x_dest2 .req x11 +x_tbl2 .req x12 +x_dest3 .req x13 +x_tbl3 .req x14 + +/* vectors */ +v_gft1_lo .req v0 +v_gft1_hi .req v1 +v_gft2_lo .req v2 +v_gft2_hi .req v3 +v_gft3_lo .req v4 +v_gft3_hi .req v5 +q_gft1_lo .req q0 +q_gft1_hi .req q1 +q_gft2_lo .req q2 +q_gft2_hi .req q3 +q_gft3_lo .req q4 +q_gft3_hi .req q5 + +v_mask0f .req v6 +q_mask0f .req q6 +v_tmp1 .req v7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_tmp1_lo .req v12 +v_tmp1_hi .req v13 + +v_p1_0 .req v20 +v_p1_1 .req v21 +v_p1_2 .req v22 +v_p1_3 .req v23 +v_p2_0 .req v24 +v_p2_1 .req v25 +v_p2_2 .req v26 +v_p2_3 .req v27 +v_p3_0 .req v28 +v_p3_1 .req v29 +v_p3_2 .req v30 +v_p3_3 .req v31 + +q_p1_0 .req q20 +q_p1_1 .req q21 +q_p1_2 .req q22 +q_p1_3 .req q23 +q_p2_0 .req q24 +q_p2_1 .req q25 +q_p2_2 .req q26 +q_p2_3 .req q27 +q_p3_0 .req q28 +q_p3_1 .req q29 +q_p3_2 .req q30 +q_p3_3 .req q31 + +v_data .req v_p1_1 +q_data .req q_p1_1 +v_data_lo .req v_p1_2 +v_data_hi .req v_p1_3 + + +cdecl(gf_3vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #64 + +.Lloop64: + movi v_p1_0.16b, #0 + movi v_p1_1.16b, #0 + movi v_p1_2.16b, #0 + movi v_p1_3.16b, #0 + movi v_p2_0.16b, #0 + movi v_p2_1.16b, #0 + movi v_p2_2.16b, #0 + movi v_p2_3.16b, #0 + movi v_p3_0.16b, #0 + movi v_p3_1.16b, #0 + movi v_p3_2.16b, #0 + movi v_p3_3.16b, #0 + + mov x_tbl1, x_tbl + add x_tbl2, x_tbl1, x_vec, lsl #2 + add x_tbl3, x_tbl2, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop64_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + add x_ptr, x_ptr, x_pos + + ldr q_data_0, [x_ptr], #16 + ldr q_data_1, [x_ptr], #16 + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + + ldr q_data_2, [x_ptr], #16 + ldr q_data_3, [x_ptr], #16 + prfm pldl1strm, [x_ptr] + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + prfm pldl1keep, [x_tbl3] + + /* data_0 */ + and v_tmp1.16b, v_data_0.16b, v_mask0f.16b + ushr v_data_0.16b, v_data_0.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b + eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b + eor v_p3_0.16b, v_tmp1_lo.16b, 
v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b + + /* data_1 */ + and v_tmp1.16b, v_data_1.16b, v_mask0f.16b + ushr v_data_1.16b, v_data_1.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b + eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b + eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b + eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b + eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b + + /* data_2 */ + and v_tmp1.16b, v_data_2.16b, v_mask0f.16b + ushr v_data_2.16b, v_data_2.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b + eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b + eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b + eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b + eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b + + /* data_3 */ + and v_tmp1.16b, v_data_3.16b, v_mask0f.16b + ushr v_data_3.16b, v_data_3.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b + eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b + eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b + eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b + eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop64_vects + +.Lloop64_vects_end: + add x_ptr, x_dest1, x_pos + stp q_p1_0, q_p1_1, [x_ptr], #32 + stp q_p1_2, q_p1_3, [x_ptr] + + add x_ptr, x_dest2, x_pos + stp q_p2_0, q_p2_1, [x_ptr], #32 + stp q_p2_2, q_p2_3, [x_ptr] + + add x_ptr, x_dest3, x_pos + stp q_p3_0, q_p3_1, [x_ptr], #32 + stp q_p3_2, q_p3_3, [x_ptr] + + add x_pos, x_pos, #64 + cmp x_pos, x_len + ble .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #64 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p1_0.16b, #0 + movi v_p2_0.16b, #0 + movi v_p3_0.16b, #0 + mov x_tbl1, x_tbl + add x_tbl2, x_tbl1, x_vec, lsl #2 + add x_tbl3, x_tbl2, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + ldr q_data, [x_ptr, x_pos] + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl 
v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + + eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b + eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b + eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b + + cmp x_vec_i, x_vec + bne .Lloop16_vects + +.Lloop16_vects_end: + str q_p1_0, [x_dest1, x_pos] + str q_p2_0, [x_dest2, x_pos] + str q_p3_0, [x_dest3, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S new file mode 100644 index 0000000000..8f6414ee52 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S @@ -0,0 +1,189 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_3vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_3vect_dot_prod_sve, %function +#endif +/* void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_dest1 .req x11 +x_dest2 .req x12 +x_dest3 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_dest2 .req z27 +z_dest3 .req z28 + +cdecl(gf_3vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldr x_dest3, [x_dest, #8*2] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + prfb pldl2keep, p0, [x_tbl3] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S new file mode 100644 index 0000000000..fcfeec1e23 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S @@ -0,0 +1,391 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_3vect_mad_neon) +#ifndef __APPLE__ +.type gf_3vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_dest3 .req x_dest +x_tmp .req x10 +x_tbl1 .req x11 +x_tbl2 .req x12 +x_tbl3 .req x13 +x_const .req x14 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +v_gft3_lo .req v16 +v_gft3_hi .req v17 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 +q_gft3_lo .req q16 +q_gft3_hi .req q17 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_d1_0 .req v20 +v_d1_1 .req v21 +v_d1_2 .req v22 +v_d1_3 .req v23 +v_d2_0 .req v24 +v_d2_1 .req v25 +v_d2_2 .req v26 +v_d2_3 .req v27 +v_d3_0 .req v28 +v_d3_1 .req v29 +v_d3_2 .req v30 +v_d3_3 .req v31 +q_d1_0 .req q20 +q_d1_1 .req q21 +q_d1_2 .req q22 +q_d1_3 .req q23 +q_d2_0 .req q24 +q_d2_1 .req q25 +q_d2_2 .req q26 +q_d2_3 .req q27 +q_d3_0 .req q28 +q_d3_1 .req q29 +q_d3_2 .req q30 +q_d3_3 .req q31 + +v_data .req v21 +q_data .req q21 +v_data_lo .req v22 +v_data_hi .req v23 + +cdecl(gf_3vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_tbl3, x_tbl2, x_vec + add x_src_end, x_src, x_len + ldr x_dest1, [x_dest] + ldr x_dest2, [x_dest, #8] + ldr x_dest3, [x_dest, #16] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #64 + +.Lloop64: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + add x_src, x_src, #64 + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + + ldr q_d2_0, [x_dest2, #16*0] + ldr q_d2_1, [x_dest2, #16*1] + ldr q_d2_2, [x_dest2, #16*2] + ldr q_d2_3, [x_dest2, #16*3] + + ldr q_d3_0, [x_dest3, #16*0] + ldr q_d3_1, [x_dest3, #16*1] + ldr q_d3_2, [x_dest3, #16*2] + ldr q_d3_3, [x_dest3, #16*3] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, 
v_data_0_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b + eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b + eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b + eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b + eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b + eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b + eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b + eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b + eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b + eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + add x_dest1, x_dest1, #64 + + str q_d2_0, [x_dest2, #16*0] + str q_d2_1, [x_dest2, #16*1] + str q_d2_2, [x_dest2, #16*2] + str q_d2_3, [x_dest2, #16*3] + add x_dest2, x_dest2, #64 + + str q_d3_0, [x_dest3, #16*0] + str q_d3_1, [x_dest3, #16*1] + str q_d3_2, [x_dest3, #16*2] + str q_d3_3, [x_dest3, #16*3] + add x_dest3, x_dest3, #64 + + cmp x_src, x_src_end + bls .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #64 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b 
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + add x_src, x_src, #16 + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_dest3, x_dest3, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + sub x_dest3, x_dest3, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S new file mode 100644 index 0000000000..9e0ca5c4b3 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S @@ -0,0 +1,175 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_3vect_mad_sve) +#ifndef __APPLE__ +.type gf_3vect_mad_sve, %function +#endif + +/* gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_dest2 .req z27 +z_dest3 .req z28 + +cdecl(gf_3vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* dest data prefetch */ + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + prfb pldl2strm, p0, [x_dest3, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S new file mode 100644 index 0000000000..6204102f68 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S @@ -0,0 +1,425 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_4vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_4vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_dest1 .req x9 +x_tbl1 .req x10 +x_dest2 .req x11 +x_tbl2 .req x12 +x_dest3 .req x13 +x_tbl3 .req x14 +x_dest4 .req x_dest +x_tbl4 .req x15 + +/* vectors */ +v_mask0f .req v0 +q_mask0f .req q0 +v_tmp1_lo .req v1 +v_tmp1_hi .req v2 +v_tmp1 .req v3 +q_tmp1 .req q3 + +v_p1_0 .req v4 +v_p2_0 .req v5 +v_p3_0 .req v6 +v_p4_0 .req v7 + +q_p1_0 .req q4 +q_p2_0 .req q5 +q_p3_0 .req q6 +q_p4_0 .req q7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_p1_3 .req v12 +v_p2_3 .req v13 +v_p3_3 .req v14 +v_p4_3 .req v15 +q_p1_3 .req q12 +q_p2_3 .req q13 +q_p3_3 .req q14 +q_p4_3 .req q15 + +v_gft1_lo .req v16 +v_gft1_hi .req v17 +v_gft2_lo .req v18 +v_gft2_hi .req v19 +v_gft3_lo .req v20 +v_gft3_hi .req v21 +v_gft4_lo .req v22 +v_gft4_hi .req v23 +q_gft1_lo .req q16 +q_gft1_hi .req q17 +q_gft2_lo .req q18 +q_gft2_hi .req q19 +q_gft3_lo .req q20 +q_gft3_hi .req q21 +q_gft4_lo .req q22 +q_gft4_hi .req q23 + +v_p1_1 .req v24 +v_p1_2 .req v25 +v_p2_1 .req v26 +v_p2_2 .req v27 +v_p3_1 .req v28 +v_p3_2 .req v29 +v_p4_1 .req v30 +v_p4_2 .req v31 + +q_p1_1 .req q24 +q_p1_2 .req q25 +q_p2_1 .req q26 +q_p2_2 .req q27 +q_p3_1 .req q28 +q_p3_2 .req q29 +q_p4_1 .req q30 +q_p4_2 .req q31 + +v_data .req v_tmp1 +q_data .req q_tmp1 +v_data_lo .req v_tmp1_lo +v_data_hi .req v_tmp1_hi + +cdecl(gf_4vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #64 + +.Lloop64: + movi v_p1_0.16b, #0 + movi v_p1_1.16b, #0 + movi v_p1_2.16b, #0 + movi v_p1_3.16b, #0 + movi v_p2_0.16b, #0 + movi v_p2_1.16b, #0 + movi v_p2_2.16b, #0 + movi v_p2_3.16b, #0 + movi v_p3_0.16b, #0 + movi v_p3_1.16b, #0 + movi v_p3_2.16b, #0 + movi v_p3_3.16b, #0 + movi v_p4_0.16b, #0 + movi v_p4_1.16b, #0 + movi v_p4_2.16b, #0 + movi v_p4_3.16b, #0 + + mov x_tbl1, x_tbl + add x_tbl2, x_tbl1, x_vec, lsl #2 + add x_tbl3, x_tbl2, x_vec, lsl #2 + add x_tbl4, x_tbl3, x_vec, lsl #2 + mov x_vec_i, #0 + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + prfm pldl1keep, [x_tbl3] + prfm pldl1keep, [x_tbl4] + +.Lloop64_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + add x_ptr, x_ptr, x_pos + + ldr q_data_0, [x_ptr], #16 + ldr q_data_1, [x_ptr], #16 + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + ldr q_data_2, [x_ptr], #16 + ldr q_data_3, [x_ptr], #16 + + prfm pldl1strm, [x_ptr] + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + prfm pldl1keep, [x_tbl3] + prfm pldl1keep, [x_tbl4] + + /* data_0 
*/ + and v_tmp1.16b, v_data_0.16b, v_mask0f.16b + ushr v_data_0.16b, v_data_0.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b + eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b + eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b + eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b + + /* data_1 */ + and v_tmp1.16b, v_data_1.16b, v_mask0f.16b + ushr v_data_1.16b, v_data_1.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b + eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b + eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b + eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b + eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b + eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b + eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b + + /* data_2 */ + and v_tmp1.16b, v_data_2.16b, v_mask0f.16b + ushr v_data_2.16b, v_data_2.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b + eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b + eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b + eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b + eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b + eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b + eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b + + /* data_3 */ + and v_tmp1.16b, v_data_3.16b, v_mask0f.16b + ushr v_data_3.16b, v_data_3.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b + eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b + eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b + eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b + eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b + eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b + eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop64_vects + +.Lloop64_vects_end: + add x_ptr, x_dest1, x_pos + stp q_p1_0, q_p1_1, [x_ptr], 
#32 + stp q_p1_2, q_p1_3, [x_ptr] + + add x_ptr, x_dest2, x_pos + stp q_p2_0, q_p2_1, [x_ptr], #32 + stp q_p2_2, q_p2_3, [x_ptr] + + add x_ptr, x_dest3, x_pos + stp q_p3_0, q_p3_1, [x_ptr], #32 + stp q_p3_2, q_p3_3, [x_ptr] + + add x_ptr, x_dest4, x_pos + stp q_p4_0, q_p4_1, [x_ptr], #32 + stp q_p4_2, q_p4_3, [x_ptr] + + add x_pos, x_pos, #64 + cmp x_pos, x_len + ble .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #64 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p1_0.16b, #0 + movi v_p2_0.16b, #0 + movi v_p3_0.16b, #0 + movi v_p4_0.16b, #0 + mov x_tbl1, x_tbl + add x_tbl2, x_tbl1, x_vec, lsl #2 + add x_tbl3, x_tbl2, x_vec, lsl #2 + add x_tbl4, x_tbl3, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + ldr q_data, [x_ptr, x_pos] + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + prfm pldl1keep, [x_tbl3] + prfm pldl1keep, [x_tbl4] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + + eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b + eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b + eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b + eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b + + cmp x_vec_i, x_vec + bne .Lloop16_vects + +.Lloop16_vects_end: + str q_p1_0, [x_dest1, x_pos] + str q_p2_0, [x_dest2, x_pos] + str q_p3_0, [x_dest3, x_pos] + str q_p4_0, [x_dest4, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S new file mode 100644 index 0000000000..eb354279f8 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S @@ -0,0 +1,208 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_4vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_4vect_dot_prod_sve, %function +#endif +/* void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_dest1 .req x12 +x_dest2 .req x13 +x_dest3 .req x14 +x_dest4 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 + +cdecl(gf_4vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + +/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S new file mode 100644 index 0000000000..ebf82e7ffe --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S @@ -0,0 +1,464 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_4vect_mad_neon) +#ifndef __APPLE__ +.type gf_4vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_dest3 .req x9 +x_dest4 .req x_dest +x_tmp .req x10 +x_tbl1 .req x11 +x_tbl2 .req x12 +x_tbl3 .req x13 +x_tbl4 .req x14 +x_const .req x15 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +v_gft3_lo .req v16 +v_gft3_hi .req v17 +v_gft4_lo .req v18 +v_gft4_hi .req v19 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 +q_gft3_lo .req q16 +q_gft3_hi .req q17 +q_gft4_lo .req q18 +q_gft4_hi .req q19 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_d1_0 .req v20 +v_d1_1 .req v21 +v_d1_2 .req v22 +v_d1_3 .req v23 +v_d2_0 .req v24 +v_d2_1 .req v25 +v_d2_2 .req v26 +v_d2_3 .req v27 +v_d3_0 .req v28 +v_d3_1 .req v29 +v_d3_2 .req v30 +v_d3_3 .req v31 +q_d1_0 .req q20 +q_d1_1 .req q21 +q_d1_2 .req q22 +q_d1_3 .req q23 +q_d2_0 .req q24 +q_d2_1 .req q25 +q_d2_2 .req q26 +q_d2_3 .req q27 +q_d3_0 .req q28 +q_d3_1 .req q29 +q_d3_2 .req q30 +q_d3_3 .req q31 + +v_d4_0 .req v_d1_0 +v_d4_1 .req v_d1_1 +v_d4_2 .req v_d1_2 +v_d4_3 .req v_d1_3 +q_d4_0 .req q_d1_0 +q_d4_1 .req q_d1_1 +q_d4_2 .req q_d1_2 +q_d4_3 .req q_d1_3 + +v_data .req v21 +q_data .req q21 +v_data_lo .req v22 +v_data_hi .req v23 + +cdecl(gf_4vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_tbl3, x_tbl2, x_vec + add x_tbl4, x_tbl3, x_vec + add x_src_end, x_src, x_len + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + ldr q_gft4_lo, [x_tbl4] + ldr q_gft4_hi, [x_tbl4, #16] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #64 + +.Lloop64: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, 
#16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + add x_src, x_src, #64 + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + + ldr q_d2_0, [x_dest2, #16*0] + ldr q_d2_1, [x_dest2, #16*1] + ldr q_d2_2, [x_dest2, #16*2] + ldr q_d2_3, [x_dest2, #16*3] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + /* dest1 */ + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b + + /* dest2 */ + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b + eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b + eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b + eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + add x_dest1, x_dest1, #64 + + str q_d2_0, [x_dest2, #16*0] + str q_d2_1, [x_dest2, #16*1] + str q_d2_2, [x_dest2, #16*2] + str q_d2_3, [x_dest2, #16*3] + add x_dest2, x_dest2, #64 + + ldr q_d3_0, [x_dest3, #16*0] + ldr q_d3_1, [x_dest3, #16*1] + ldr q_d3_2, [x_dest3, #16*2] + ldr q_d3_3, [x_dest3, #16*3] + + ldr q_d4_0, [x_dest4, #16*0] + ldr q_d4_1, [x_dest4, #16*1] + ldr q_d4_2, [x_dest4, #16*2] + ldr q_d4_3, [x_dest4, #16*3] + + /* dest3 */ + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b + eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b + eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b + eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b + eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, 
v_data_3_hi.16b + eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b + eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b + + /* dest4 */ + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b + eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b + eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b + eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b + eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b + eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b + eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b + + str q_d3_0, [x_dest3, #16*0] + str q_d3_1, [x_dest3, #16*1] + str q_d3_2, [x_dest3, #16*2] + str q_d3_3, [x_dest3, #16*3] + add x_dest3, x_dest3, #64 + + str q_d4_0, [x_dest4, #16*0] + str q_d4_1, [x_dest4, #16*1] + str q_d4_2, [x_dest4, #16*2] + str q_d4_3, [x_dest4, #16*3] + add x_dest4, x_dest4, #64 + + cmp x_src, x_src_end + bls .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #64 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_d4_0, [x_dest4] + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + str q_d3_0, [x_dest3] + str q_d4_0, [x_dest4] + + add x_src, x_src, #16 + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_dest3, x_dest3, #16 + add x_dest4, x_dest4, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + sub x_dest3, x_dest3, x_tmp + sub x_dest4, x_dest4, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, 
v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_d4_0, [x_dest4] + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + str q_d3_0, [x_dest3] + str q_d4_0, [x_dest4] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S new file mode 100644 index 0000000000..89ec89f5c6 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S @@ -0,0 +1,194 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_4vect_mad_sve) +#ifndef __APPLE__ +.type gf_4vect_mad_sve, %function +#endif + +/* gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 + +cdecl(gf_4vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S new file mode 100644 index 0000000000..13166665d6 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S @@ -0,0 +1,484 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_5vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_5vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_dest1 .req x9 +x_dest2 .req x10 +x_dest3 .req x11 +x_dest4 .req x12 +x_dest5 .req x13 + +/* vectors */ +v_tmp1 .req v0 +q_tmp1 .req q0 +v_tmp2 .req v1 +q_tmp2 .req q1 + +v_mask0f .req v_tmp1 +q_mask0f .req q_tmp1 +v_tmp_lo .req v_tmp1 +v_tmp_hi .req v_tmp2 + +v_gft_lo .req v2 +v_gft_hi .req v3 +q_gft_lo .req q2 +q_gft_hi .req q3 + +v_p1_0 .req v4 +v_p2_0 .req v5 +v_p3_0 .req v6 +v_p4_0 .req v7 + +q_p1_0 .req q4 +q_p2_0 .req q5 +q_p3_0 .req q6 +q_p4_0 .req q7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_p5_0 .req v16 +v_p1_1 .req v17 +v_p2_1 .req v18 +v_p3_1 .req v19 +v_p4_1 .req v20 +v_p5_1 .req v21 +v_p1_2 .req v22 +v_p2_2 .req v23 +v_p3_2 .req v24 +v_p4_2 .req v25 +v_p5_2 .req v26 +v_p1_3 .req v27 +v_p2_3 .req v28 +v_p3_3 .req v29 +v_p4_3 .req v30 +v_p5_3 .req v31 + +q_p5_0 .req q16 +q_p1_1 .req q17 +q_p2_1 .req q18 +q_p3_1 .req q19 +q_p4_1 .req q20 +q_p5_1 .req q21 +q_p1_2 .req q22 +q_p2_2 .req q23 +q_p3_2 .req q24 +q_p4_2 .req q25 +q_p5_2 .req q26 +q_p1_3 .req q27 +q_p2_3 .req q28 +q_p3_3 .req q29 +q_p4_3 .req q30 +q_p5_3 .req q31 + +v_data .req v_p1_1 +q_data .req q_p1_1 +v_data_lo .req v_p2_1 +v_data_hi .req v_p3_1 + +v_gft1_lo .req v_p4_1 +v_gft1_hi .req v_p5_1 +v_gft2_lo .req v_p1_2 +v_gft2_hi .req v_p2_2 +v_gft3_lo .req v_p3_2 +v_gft3_hi .req v_p4_2 +v_gft4_lo .req v_p5_2 +v_gft4_hi .req v_p1_3 +v_gft5_lo .req v_p2_3 +v_gft5_hi .req v_p3_3 +q_gft1_lo .req q_p4_1 +q_gft1_hi .req q_p5_1 +q_gft2_lo .req q_p1_2 +q_gft2_hi .req q_p2_2 +q_gft3_lo .req q_p3_2 +q_gft3_hi .req q_p4_2 +q_gft4_lo .req q_p5_2 +q_gft4_hi .req q_p1_3 +q_gft5_lo .req q_p2_3 +q_gft5_hi .req q_p3_3 + + +cdecl(gf_5vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + ldr x_dest5, [x_dest, #8*4] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #64 + +.Lloop64: + movi v_p1_0.16b, #0 + movi v_p1_1.16b, #0 + movi v_p1_2.16b, #0 + movi v_p1_3.16b, #0 + movi v_p2_0.16b, #0 + movi v_p2_1.16b, #0 + movi v_p2_2.16b, #0 + movi v_p2_3.16b, #0 + movi v_p3_0.16b, #0 + movi v_p3_1.16b, #0 + movi v_p3_2.16b, #0 + movi v_p3_3.16b, #0 + movi v_p4_0.16b, #0 + movi v_p4_1.16b, #0 + movi v_p4_2.16b, #0 + movi v_p4_3.16b, #0 + movi v_p5_0.16b, #0 + movi v_p5_1.16b, #0 + movi v_p5_2.16b, #0 + movi v_p5_3.16b, #0 + mov x_vec_i, #0 + +.Lloop64_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_ptr, x_ptr, x_pos + + ldr q_data_0, [x_ptr], #16 + ldr q_data_1, [x_ptr], #16 + ldr 
q_data_2, [x_ptr], #16 + ldr q_data_3, [x_ptr], #16 + prfm pldl2keep, [x_ptr] + + movi v_mask0f.16b, #0x0f + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + /* v_p1_x */ + add x_tmp, x_tbl, x_vec_i, lsl #2 + add x_vec_i, x_vec_i, #8 + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + add x_tmp, x_tmp, x_vec, lsl #2 + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b + eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b + eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b + eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b + + /* v_p2_x */ + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + add x_tmp, x_tmp, x_vec, lsl #2 + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b + eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b + eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b + eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b + + /* v_p3_x */ + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + add x_tmp, x_tmp, x_vec, lsl #2 + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b + eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b + eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b + eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b + + /* v_p4_x */ + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + add x_tmp, x_tmp, x_vec, lsl #2 + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl 
v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b + eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b + eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b + eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b + + /* v_p5_x */ + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b + eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b + eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b + eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b + eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop64_vects + +.Lloop64_vects_end: + add x_ptr, x_dest1, x_pos + stp q_p1_0, q_p1_1, [x_ptr], #32 + stp q_p1_2, q_p1_3, [x_ptr] + + add x_ptr, x_dest2, x_pos + stp q_p2_0, q_p2_1, [x_ptr], #32 + stp q_p2_2, q_p2_3, [x_ptr] + + add x_ptr, x_dest3, x_pos + stp q_p3_0, q_p3_1, [x_ptr], #32 + stp q_p3_2, q_p3_3, [x_ptr] + + add x_ptr, x_dest4, x_pos + stp q_p4_0, q_p4_1, [x_ptr], #32 + stp q_p4_2, q_p4_3, [x_ptr] + + add x_ptr, x_dest5, x_pos + stp q_p5_0, q_p5_1, [x_ptr], #32 + stp q_p5_2, q_p5_3, [x_ptr] + + add x_pos, x_pos, #64 + cmp x_pos, x_len + ble .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #64 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p1_0.16b, #0 + movi v_p2_0.16b, #0 + movi v_p3_0.16b, #0 + movi v_p4_0.16b, #0 + movi v_p5_0.16b, #0 + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + ldr q_data, [x_ptr, x_pos] + + movi v_mask0f.16b, #0x0f + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + add x_tmp, x_tbl, x_vec_i, lsl #2 + add x_vec_i, x_vec_i, #8 + ldp q_gft1_lo, q_gft1_hi, [x_tmp] + add x_tmp, x_tmp, x_vec, lsl #2 + ldp q_gft2_lo, q_gft2_hi, [x_tmp] + add x_tmp, x_tmp, x_vec, lsl #2 + ldp q_gft3_lo, q_gft3_hi, [x_tmp] + add x_tmp, x_tmp, x_vec, lsl #2 + ldp q_gft4_lo, q_gft4_hi, [x_tmp] + add x_tmp, x_tmp, x_vec, lsl #2 + ldp q_gft5_lo, q_gft5_hi, [x_tmp] + + tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + tbl v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + + eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b + 
eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b + eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b + eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b + eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b + eor v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b + eor v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b + + cmp x_vec_i, x_vec + bne .Lloop16_vects + +.Lloop16_vects_end: + str q_p1_0, [x_dest1, x_pos] + str q_p2_0, [x_dest2, x_pos] + str q_p3_0, [x_dest3, x_pos] + str q_p4_0, [x_dest4, x_pos] + str q_p5_0, [x_dest5, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S new file mode 100644 index 0000000000..bb7cd0184e --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S @@ -0,0 +1,237 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_5vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_5vect_dot_prod_sve, %function +#endif +/* void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_dest1 .req x13 +x_dest2 .req x14 +x_dest4 .req x15 +x_dest5 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest3 .req x19 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 + +cdecl(gf_5vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #16 /* alignment */ + str x19, [sp] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldr x_dest5, [x_dest, #8*4] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + prfb pldl2keep, p0, [x_tbl5] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_dest5.d, z_gft5_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x19, [sp] + add sp, sp, #16 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S new file mode 100644 index 0000000000..473e4c5774 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S @@ -0,0 +1,544 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_5vect_mad_neon) +#ifndef __APPLE__ +.type gf_5vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_dest3 .req x9 +x_dest4 .req x10 +x_dest5 .req x_dest +x_tmp .req x11 +x_tbl1 .req x12 +x_tbl2 .req x13 +x_tbl3 .req x14 +x_tbl4 .req x15 +x_tbl5 .req x16 +x_const .req x17 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +v_gft3_lo .req v16 +v_gft3_hi .req v17 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 +q_gft3_lo .req q16 +q_gft3_hi .req q17 + +v_gft4_lo .req v18 +v_gft4_hi .req v19 +q_gft4_lo .req q18 +q_gft4_hi .req q19 +v_gft5_lo .req v_gft2_lo +v_gft5_hi .req v_gft2_hi +q_gft5_lo .req q_gft2_lo +q_gft5_hi .req q_gft2_hi + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_d1_0 .req v20 +v_d1_1 .req v21 +v_d1_2 .req v22 +v_d1_3 .req v23 +v_d2_0 .req v24 +v_d2_1 .req v25 +v_d2_2 .req v26 +v_d2_3 .req v27 +v_d3_0 .req v28 +v_d3_1 .req v29 +v_d3_2 .req v30 +v_d3_3 .req v31 +q_d1_0 .req q20 +q_d1_1 .req q21 +q_d1_2 .req q22 +q_d1_3 .req q23 +q_d2_0 .req q24 +q_d2_1 .req q25 +q_d2_2 .req q26 +q_d2_3 .req q27 +q_d3_0 .req q28 +q_d3_1 .req q29 +q_d3_2 .req q30 +q_d3_3 .req q31 + +v_d4_0 .req v_d1_0 +v_d4_1 .req v_d1_1 +v_d4_2 .req v_d1_2 +v_d4_3 .req v_d1_3 +q_d4_0 .req q_d1_0 +q_d4_1 .req q_d1_1 +q_d4_2 .req q_d1_2 +q_d4_3 .req q_d1_3 +v_d5_0 .req v_d2_0 +v_d5_1 .req v_d2_1 +v_d5_2 .req v_d2_2 +v_d5_3 .req v_d2_3 +q_d5_0 .req q_d2_0 +q_d5_1 .req q_d2_1 +q_d5_2 .req q_d2_2 +q_d5_3 .req q_d2_3 + +v_data .req v21 +q_data .req q21 +v_data_lo .req v22 +v_data_hi .req v23 + +cdecl(gf_5vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_tbl3, x_tbl2, x_vec + add x_tbl4, x_tbl3, x_vec + add x_tbl5, x_tbl4, x_vec + add x_src_end, x_src, x_len + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + ldr x_dest5, [x_dest, #8*4] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + ldr q_gft4_lo, [x_tbl4] + ldr q_gft4_hi, [x_tbl4, #16] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #64 + +.Lloop64: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + add x_src, x_src, #64 + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + + ldr 
q_d2_0, [x_dest2, #16*0] + ldr q_d2_1, [x_dest2, #16*1] + ldr q_d2_2, [x_dest2, #16*2] + ldr q_d2_3, [x_dest2, #16*3] + + ldr q_d3_0, [x_dest3, #16*0] + ldr q_d3_1, [x_dest3, #16*1] + ldr q_d3_2, [x_dest3, #16*2] + ldr q_d3_3, [x_dest3, #16*3] + + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + /* dest1 */ + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b + + /* dest2 */ + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b + eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b + eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b + eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b + + /* dest3 */ + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b + eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b + eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b + eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b + eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b + eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b + eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + add x_dest1, x_dest1, #64 + + str q_d2_0, [x_dest2, #16*0] + str q_d2_1, [x_dest2, #16*1] + str q_d2_2, [x_dest2, #16*2] + str q_d2_3, [x_dest2, #16*3] + add x_dest2, x_dest2, #64 + + str q_d3_0, [x_dest3, #16*0] + str q_d3_1, [x_dest3, #16*1] + str q_d3_2, [x_dest3, #16*2] + str q_d3_3, [x_dest3, #16*3] + add x_dest3, x_dest3, #64 + + ldr q_d4_0, [x_dest4, #16*0] + ldr q_d4_1, 
[x_dest4, #16*1] + ldr q_d4_2, [x_dest4, #16*2] + ldr q_d4_3, [x_dest4, #16*3] + + ldr q_d5_0, [x_dest5, #16*0] + ldr q_d5_1, [x_dest5, #16*1] + ldr q_d5_2, [x_dest5, #16*2] + ldr q_d5_3, [x_dest5, #16*3] + + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + + /* dest4 */ + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b + eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b + eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b + eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b + eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b + eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b + eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b + + /* dest5 */ + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b + eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b + eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b + eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b + eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b + eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b + eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b + eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4, #16*0] + str q_d4_1, [x_dest4, #16*1] + str q_d4_2, [x_dest4, #16*2] + str q_d4_3, [x_dest4, #16*3] + add x_dest4, x_dest4, #64 + + str q_d5_0, [x_dest5, #16*0] + str q_d5_1, [x_dest5, #16*1] + str q_d5_2, [x_dest5, #16*2] + str q_d5_3, [x_dest5, #16*3] + add x_dest5, x_dest5, #64 + + cmp x_src, x_src_end + bls .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #64 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + ldr q_d4_0, [x_dest4] + ldr q_d5_0, [x_dest5] + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, 
{v_gft4_hi.16b}, v_data_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4] + str q_d5_0, [x_dest5] + + add x_src, x_src, #16 + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_dest3, x_dest3, #16 + add x_dest4, x_dest4, #16 + add x_dest5, x_dest5, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + sub x_dest3, x_dest3, x_tmp + sub x_dest4, x_dest4, x_tmp + sub x_dest5, x_dest5, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + ldr q_d4_0, [x_dest4] + ldr q_d5_0, [x_dest5] + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4] + str q_d5_0, [x_dest5] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S new file mode 100644 index 0000000000..ab374d365a --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S @@ -0,0 +1,218 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_5vect_mad_sve) +#ifndef __APPLE__ +.type gf_5vect_mad_sve, %function +#endif + +/* gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest5 .req x10 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 + +cdecl(gf_5vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + /* load table 5 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + 
ldp q_gft5_lo, q_gft5_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + prfb pldl2strm, p0, [x_dest5, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + ld1b z_dest5.b, p0/z, [x_dest5, x_pos] + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + /* store dest data, governed by p0 */ + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + + /* dest5 */ + tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_tmp_lo.d, z_dest5.d + eor z_dest5.d, z_tmp_hi.d, z_dest5.d + + /* store dest data, governed by p0 */ + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S new file mode 100644 index 0000000000..acc98953b3 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S @@ -0,0 +1,258 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_6vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_6vect_dot_prod_sve, %function +#endif +/* void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_dest1 .req x14 +x_dest2 .req x15 +x_dest6 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest3 .req x19 +x_dest4 .req x20 +x_dest5 .req x21 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +cdecl(gf_6vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #32 /* alignment */ + stp x19, x20, [sp] + str x21, [sp, #16] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] /* x_dest6 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. 
*/ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_dest5.d, z_gft5_hi.d + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_dest6.d, z_gft6_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x21, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #32 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git 
a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S new file mode 100644 index 0000000000..3b1b1b4b21 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S @@ -0,0 +1,618 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text +.global cdecl(gf_6vect_mad_neon) +#ifndef __APPLE__ +.type gf_6vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_dest3 .req x9 +x_dest4 .req x10 +x_dest5 .req x11 +x_dest6 .req x_dest +x_tmp .req x12 +x_tbl1 .req x13 +x_tbl2 .req x14 +x_tbl3 .req x15 +x_tbl4 .req x16 +x_tbl5 .req x17 +x_tbl6 .req x_tbl +x_const .req x18 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +v_gft3_lo .req v16 +v_gft3_hi .req v17 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 +q_gft3_lo .req q16 +q_gft3_hi .req q17 + +v_gft4_lo .req v18 +v_gft4_hi .req v19 +q_gft4_lo .req q18 +q_gft4_hi .req q19 +v_gft5_lo .req v_gft2_lo +v_gft5_hi .req v_gft2_hi +q_gft5_lo .req q_gft2_lo +q_gft5_hi .req q_gft2_hi +v_gft6_lo .req v_gft3_lo +v_gft6_hi .req v_gft3_hi +q_gft6_lo .req q_gft3_lo +q_gft6_hi .req q_gft3_hi + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_d1_0 .req v20 +v_d1_1 .req v21 +v_d1_2 .req v22 +v_d1_3 .req v23 +v_d2_0 .req v24 +v_d2_1 .req v25 +v_d2_2 .req v26 +v_d2_3 .req v27 +v_d3_0 .req v28 +v_d3_1 .req v29 +v_d3_2 .req v30 +v_d3_3 .req v31 +q_d1_0 .req q20 +q_d1_1 .req q21 +q_d1_2 .req q22 +q_d1_3 .req q23 +q_d2_0 .req q24 +q_d2_1 .req q25 +q_d2_2 .req q26 +q_d2_3 .req q27 +q_d3_0 .req q28 +q_d3_1 .req q29 +q_d3_2 .req q30 +q_d3_3 .req q31 + +v_d4_0 .req v_d1_0 +v_d4_1 .req v_d1_1 +v_d4_2 .req v_d1_2 +v_d4_3 .req v_d1_3 +q_d4_0 .req q_d1_0 +q_d4_1 .req q_d1_1 +q_d4_2 .req q_d1_2 +q_d4_3 .req q_d1_3 +v_d5_0 .req v_d2_0 +v_d5_1 .req v_d2_1 +v_d5_2 .req v_d2_2 +v_d5_3 .req v_d2_3 +q_d5_0 .req q_d2_0 +q_d5_1 .req q_d2_1 +q_d5_2 .req q_d2_2 +q_d5_3 .req q_d2_3 +v_d6_0 .req v_d3_0 +v_d6_1 .req v_d3_1 +v_d6_2 .req v_d3_2 +v_d6_3 .req v_d3_3 +q_d6_0 .req q_d3_0 +q_d6_1 .req q_d3_1 +q_d6_2 .req q_d3_2 +q_d6_3 .req q_d3_3 + +v_data .req v21 +q_data .req q21 +v_data_lo .req v22 +v_data_hi .req v23 + +cdecl(gf_6vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_tbl3, x_tbl2, x_vec + add x_tbl4, x_tbl3, x_vec + add x_tbl5, x_tbl4, x_vec + add x_tbl6, x_tbl5, x_vec + add x_src_end, x_src, x_len + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + ldr x_dest5, [x_dest, #8*4] + ldr x_dest6, [x_dest, #8*5] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft4_lo, [x_tbl4] + ldr q_gft4_hi, [x_tbl4, #16] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #64 
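+	/* Main 64-byte loop (a descriptive note on the code below): each pass loads four 16B vectors of src and the matching blocks of dest1..dest6, splits the src bytes into 4-bit lo/hi nibbles, multiplies in GF(2^8) via tbl lookups into each destination's 32-byte gftbls half-tables, XOR-accumulates into the dest blocks, stores them back and advances every pointer by 64. */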
+ +.Lloop64: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + add x_src, x_src, #64 + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + + ldr q_d2_0, [x_dest2, #16*0] + ldr q_d2_1, [x_dest2, #16*1] + ldr q_d2_2, [x_dest2, #16*2] + ldr q_d2_3, [x_dest2, #16*3] + + ldr q_d3_0, [x_dest3, #16*0] + ldr q_d3_1, [x_dest3, #16*1] + ldr q_d3_2, [x_dest3, #16*2] + ldr q_d3_3, [x_dest3, #16*3] + + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + /* dest1 */ + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b + + /* dest2 */ + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b + eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b + eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b + eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b + + /* dest3 */ + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b + eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b + eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b + eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b + eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b + eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b + eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + add x_dest1, x_dest1, #64 + + str 
q_d2_0, [x_dest2, #16*0] + str q_d2_1, [x_dest2, #16*1] + str q_d2_2, [x_dest2, #16*2] + str q_d2_3, [x_dest2, #16*3] + add x_dest2, x_dest2, #64 + + str q_d3_0, [x_dest3, #16*0] + str q_d3_1, [x_dest3, #16*1] + str q_d3_2, [x_dest3, #16*2] + str q_d3_3, [x_dest3, #16*3] + add x_dest3, x_dest3, #64 + + ldr q_d4_0, [x_dest4, #16*0] + ldr q_d4_1, [x_dest4, #16*1] + ldr q_d4_2, [x_dest4, #16*2] + ldr q_d4_3, [x_dest4, #16*3] + + ldr q_d5_0, [x_dest5, #16*0] + ldr q_d5_1, [x_dest5, #16*1] + ldr q_d5_2, [x_dest5, #16*2] + ldr q_d5_3, [x_dest5, #16*3] + + ldr q_d6_0, [x_dest6, #16*0] + ldr q_d6_1, [x_dest6, #16*1] + ldr q_d6_2, [x_dest6, #16*2] + ldr q_d6_3, [x_dest6, #16*3] + + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + ldr q_gft6_lo, [x_tbl6] + ldr q_gft6_hi, [x_tbl6, #16] + + /* dest4 */ + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b + eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b + eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b + eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b + eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b + eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b + eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b + + /* dest5 */ + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b + eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b + eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b + eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b + eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b + eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b + eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b + eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b + + /* dest6 */ + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_0_hi.16b + eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b + eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_1_hi.16b + eor v_d6_1.16b, v_tmp_lo.16b, v_d6_1.16b + eor v_d6_1.16b, v_d6_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_2_hi.16b + eor v_d6_2.16b, v_tmp_lo.16b, v_d6_2.16b + eor v_d6_2.16b, v_d6_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_3_hi.16b + eor v_d6_3.16b, v_tmp_lo.16b, v_d6_3.16b + eor v_d6_3.16b, v_d6_3.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4, #16*0] + str q_d4_1, [x_dest4, #16*1] + str q_d4_2, [x_dest4, #16*2] + str q_d4_3, [x_dest4, #16*3] + add x_dest4, x_dest4, #64 + + str q_d5_0, [x_dest5, #16*0] + str q_d5_1, [x_dest5, #16*1] + str q_d5_2, [x_dest5, #16*2] + str q_d5_3, [x_dest5, #16*3] + add x_dest5, x_dest5, #64 + + str q_d6_0, [x_dest6, #16*0] + str q_d6_1, [x_dest6, #16*1] + str q_d6_2, 
[x_dest6, #16*2] + str q_d6_3, [x_dest6, #16*3] + add x_dest6, x_dest6, #64 + + cmp x_src, x_src_end + bls .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #64 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + ldr q_d4_0, [x_dest4] + ldr q_d5_0, [x_dest5] + ldr q_d6_0, [x_dest6] + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + ldr q_gft6_lo, [x_tbl6] + ldr q_gft6_hi, [x_tbl6, #16] + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b + eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b + eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4] + str q_d5_0, [x_dest5] + str q_d6_0, [x_dest6] + + add x_src, x_src, #16 + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_dest3, x_dest3, #16 + add x_dest4, x_dest4, #16 + add x_dest5, x_dest5, #16 + add x_dest6, x_dest6, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + sub x_dest3, x_dest3, x_tmp + sub x_dest4, x_dest4, x_tmp + sub x_dest5, x_dest5, x_tmp + sub x_dest6, x_dest6, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl 
v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + ldr q_d4_0, [x_dest4] + ldr q_d5_0, [x_dest5] + ldr q_d6_0, [x_dest6] + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + ldr q_gft6_lo, [x_tbl6] + ldr q_gft6_hi, [x_tbl6, #16] + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4] + str q_d5_0, [x_dest5] + str q_d6_0, [x_dest6] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S new file mode 100644 index 0000000000..c4f372cd73 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S @@ -0,0 +1,237 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_6vect_mad_sve) +#ifndef __APPLE__ +.type gf_6vect_mad_sve, %function +#endif + +/* gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest5 .req x10 +x_dest6 .req x11 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +cdecl(gf_6vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + /* load table 5 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft5_lo, q_gft5_hi, [x_tbl] + /* load table 6 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft6_lo, q_gft6_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */ + ldr x_dest6, [x_dest, #8*5] /* pointer to dest6 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + + prfb pldl2strm, p0, [x_dest5, x_pos] + prfb pldl2strm, p0, [x_dest6, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + ld1b z_dest5.b, p0/z, [x_dest5, x_pos] + ld1b z_dest6.b, p0/z, [x_dest6, x_pos] + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + /* dest5 */ + tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_tmp_lo.d, z_dest5.d + eor z_dest5.d, z_tmp_hi.d, z_dest5.d + + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + + /* dest6 */ + tbl z_tmp_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_tmp_lo.d, z_dest6.d + eor z_dest6.d, z_tmp_hi.d, z_dest6.d + + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S new file mode 100644 index 0000000000..0f74873de0 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S @@ -0,0 +1,281 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_7vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_7vect_dot_prod_sve, %function +#endif +/* void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_tbl7 .req x14 + +x_dest1 .req x15 + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest2 .req x19 +x_dest3 .req x20 +x_dest4 .req x21 +x_dest5 .req x22 +x_dest6 .req x23 +x_dest7 .req x_dest /* reused */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +z_gft7_lo .req z6 +z_gft7_hi .req z7 +q_gft7_lo .req q6 +q_gft7_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_dest7 .req z16 + +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +cdecl(gf_7vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #48 /* alignment */ + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + str x23, [sp, #32] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] + ldr x_dest7, [x_dest, #8*6] /* x_dest7 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + mov z_dest7.b, #0 /* clear z_dest7 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ + +/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_gft1_hi.d, z_dest1.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_gft2_hi.d, z_dest2.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_gft3_hi.d, z_dest3.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_gft4_hi.d, z_dest4.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_gft5_hi.d, z_dest5.d + + ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 + prfb pldl2keep, p0, [x_tbl7] + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_gft6_hi.d, z_dest6.d + + /* dest 7 */ + tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b + tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b + eor z_dest7.d, z_gft7_lo.d, z_dest7.d + eor z_dest7.d, z_gft7_hi.d, z_dest7.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + st1b z_dest7.b, p0, [x_dest7, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x23, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #48 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S new file mode 100644 index 0000000000..20768f4889 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S @@ -0,0 +1,307 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_8vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_8vect_dot_prod_sve, %function +#endif +/* void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_tbl7 .req x14 + +x_dest1 .req x15 + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest2 .req x19 +x_dest3 .req x20 +x_dest4 .req x21 +x_dest5 .req x22 +x_dest6 .req x23 +x_dest7 .req x24 +x_dest8 .req x_dest /* reused */ +x_tbl8 .req x25 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +z_gft7_lo .req z6 +z_gft7_hi .req z7 +q_gft7_lo .req q6 +q_gft7_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_dest7 .req z8 + +z_gft8_lo .req z9 +z_gft8_hi .req z10 +q_gft8_lo .req q9 +q_gft8_hi .req q10 + +z_dest8 .req z16 + +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +cdecl(gf_8vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #80 /* alignment */ + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + str d10, [sp, #56] + str x25, [sp, #64] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] + ldp x_dest7, x_dest8, [x_dest, #8*6] /* x_dest8 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + mov z_dest7.b, #0 /* clear z_dest7 */ + mov z_dest8.b, #0 /* clear z_dest8 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ + add x_tbl8, x_tbl7, x_vec, LSL #2 /* reset x_tbl8 */ + +/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_gft1_hi.d, z_dest1.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_gft2_hi.d, z_dest2.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_gft3_hi.d, z_dest3.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_gft4_hi.d, z_dest4.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_gft5_hi.d, z_dest5.d + + ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 + ldp q_gft8_lo, q_gft8_hi, [x_tbl8], #32 + prfb pldl2keep, p0, [x_tbl7] + prfb pldl2keep, p0, [x_tbl8] + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_gft6_hi.d, z_dest6.d + + /* dest 7 */ + tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b + tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b + eor z_dest7.d, z_gft7_lo.d, z_dest7.d + eor z_dest7.d, z_gft7_hi.d, z_dest7.d + + /* dest 8 */ + tbl z_gft8_lo.b, {z_gft8_lo.b}, z_src_lo.b + tbl z_gft8_hi.b, {z_gft8_hi.b}, z_src_hi.b + eor z_dest8.d, z_gft8_lo.d, z_dest8.d + eor z_dest8.d, z_gft8_hi.d, z_dest8.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + st1b z_dest7.b, p0, [x_dest7, x_pos] + st1b z_dest8.b, p0, [x_dest8, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x25, [sp, #64] + ldr d10, [sp, #56] + ldp d8, d9, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #80 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git 
a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S new file mode 100644 index 0000000000..4d17362894 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S @@ -0,0 +1,303 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest1 .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_tbl1 .req x9 + +/* vectors */ +v_gft1_lo .req v0 +v_gft1_hi .req v1 +q_gft1_lo .req q0 +q_gft1_hi .req q1 +v_mask0f .req v2 +q_mask0f .req q2 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +v_data_4 .req v12 +v_data_5 .req v13 +v_data_6 .req v14 +v_data_7 .req v15 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 +q_data_4 .req q12 +q_data_5 .req q13 +q_data_6 .req q14 +q_data_7 .req q15 + +v_data_0_lo .req v16 +v_data_1_lo .req v17 +v_data_2_lo .req v18 +v_data_3_lo .req v19 +v_data_4_lo .req v20 +v_data_5_lo .req v21 +v_data_6_lo .req v22 +v_data_7_lo .req v23 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 +v_data_4_hi .req v_data_4 +v_data_5_hi .req v_data_5 +v_data_6_hi .req v_data_6 +v_data_7_hi .req v_data_7 + +v_p0 .req v24 +v_p1 .req v25 +v_p2 .req v26 +v_p3 .req v27 +v_p4 .req v28 +v_p5 .req v29 +v_p6 .req v30 +v_p7 .req v31 +q_p0 .req q24 +q_p1 .req q25 +q_p2 .req q26 +q_p3 .req q27 +q_p4 .req q28 +q_p5 .req q29 +q_p6 .req q30 +q_p7 .req q31 + +v_p .req v_p0 +q_p .req q_p0 +v_data .req v_p1 +q_data .req q_p1 +v_data_lo .req v_p2 +v_data_hi .req v_p3 + + +cdecl(gf_vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + mov x_pos, #0 + + lsl x_vec, x_vec, #3 + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #128 + +.Lloop128: + movi v_p0.16b, #0 + movi v_p1.16b, #0 + movi v_p2.16b, #0 + movi v_p3.16b, #0 + movi v_p4.16b, #0 + movi v_p5.16b, #0 + movi v_p6.16b, #0 + movi v_p7.16b, #0 + + mov x_tbl1, x_tbl + mov x_vec_i, #0 + +.Lloop128_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + add x_ptr, x_ptr, x_pos + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + + ldp q_data_0, q_data_1, [x_ptr], #32 + ldp q_data_2, q_data_3, [x_ptr], #32 + ldp q_data_4, q_data_5, [x_ptr], #32 + ldp q_data_6, q_data_7, [x_ptr] + + prfm pldl1keep, [x_tbl1] + prfm pldl1strm, [x_ptr] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b + and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b + and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b + and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + ushr v_data_4_hi.16b, v_data_4.16b, #4 + ushr v_data_5_hi.16b, v_data_5.16b, #4 + ushr v_data_6_hi.16b, v_data_6.16b, #4 + ushr v_data_7_hi.16b, v_data_7.16b, #4 + + tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_data_2_lo.16b, {v_gft1_lo.16b}, 
v_data_2_lo.16b + tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b + tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b + tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b + tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b + + tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b + tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b + tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b + tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b + + eor v_p0.16b, v_data_0_lo.16b, v_p0.16b + eor v_p0.16b, v_p0.16b, v_data_0_hi.16b + eor v_p1.16b, v_data_1_lo.16b, v_p1.16b + eor v_p1.16b, v_p1.16b, v_data_1_hi.16b + eor v_p2.16b, v_data_2_lo.16b, v_p2.16b + eor v_p2.16b, v_p2.16b, v_data_2_hi.16b + eor v_p3.16b, v_data_3_lo.16b, v_p3.16b + eor v_p3.16b, v_p3.16b, v_data_3_hi.16b + eor v_p4.16b, v_data_4_lo.16b, v_p4.16b + eor v_p4.16b, v_p4.16b, v_data_4_hi.16b + eor v_p5.16b, v_data_5_lo.16b, v_p5.16b + eor v_p5.16b, v_p5.16b, v_data_5_hi.16b + eor v_p6.16b, v_data_6_lo.16b, v_p6.16b + eor v_p6.16b, v_p6.16b, v_data_6_hi.16b + eor v_p7.16b, v_data_7_lo.16b, v_p7.16b + eor v_p7.16b, v_p7.16b, v_data_7_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop128_vects + +.Lloop128_vects_end: + add x_ptr, x_dest1, x_pos + stp q_p0, q_p1, [x_ptr], #32 + stp q_p2, q_p3, [x_ptr], #32 + stp q_p4, q_p5, [x_ptr], #32 + stp q_p6, q_p7, [x_ptr] + + add x_pos, x_pos, #128 + cmp x_pos, x_len + ble .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #128 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p.16b, #0 + mov x_tbl1, x_tbl + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + ldr q_data, [x_ptr, x_pos] + add x_vec_i, x_vec_i, #8 + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_p.16b, v_data_lo.16b, v_p.16b + eor v_p.16b, v_p.16b, v_data_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop16_vects + +.Lloop16_vects_end: + str q_p, [x_dest1, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S new file mode 100644 index 0000000000..48ce151fde --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S @@ -0,0 +1,132 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_vect_dot_prod_sve, %function +#endif +/* void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest1 .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tbl1 .req x8 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +cdecl(gf_vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov z_dest.b, #0 /* clear z_dest */ + mov x_vec_i, #0 /* clear x_vec_i */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + + /* load gf_table */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is added by #32 + for each src vect */ + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest.d, z_gft1_lo.d, z_dest.d + eor z_dest.d, z_gft1_hi.d, z_dest.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects + + /* end of Loop 2 */ + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest1, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S new file mode 100644 index 0000000000..bc2b957820 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S @@ -0,0 +1,324 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_vect_mad_neon) +#ifndef __APPLE__ +.type gf_vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x_dest +x_tmp .req x7 +x_const .req x8 + +/* vectors */ +v_mask0f .req v0 +v_tmp .req v1 +q_tmp .req q1 + +v_tmp1_lo .req v2 +v_tmp1_hi .req v3 +v_tmp2_lo .req v4 +v_tmp2_hi .req v5 + +v_gft1_lo .req v6 +v_gft1_hi .req v7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +v_data_4 .req v12 +v_data_5 .req v13 +v_data_6 .req v14 +v_data_7 .req v15 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 +q_data_4 .req q12 +q_data_5 .req q13 +q_data_6 .req q14 +q_data_7 .req q15 + +v_data_0_lo .req v16 +v_data_1_lo .req v17 +v_data_2_lo .req v18 +v_data_3_lo .req v19 +v_data_4_lo .req v20 +v_data_5_lo .req v21 +v_data_6_lo .req v22 +v_data_7_lo .req v23 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 +v_data_4_hi .req v_data_4 +v_data_5_hi .req v_data_5 +v_data_6_hi .req v_data_6 +v_data_7_hi .req v_data_7 + +v_d1_0 .req v24 +v_d1_1 .req v25 +v_d1_2 .req v26 +v_d1_3 .req v27 +v_d1_4 .req v28 +v_d1_5 .req v29 +v_d1_6 .req v30 +v_d1_7 .req v31 +q_d1_0 .req q24 +q_d1_1 .req q25 +q_d1_2 .req q26 +q_d1_3 .req q27 +q_d1_4 .req q28 +q_d1_5 .req q29 +q_d1_6 .req q30 +q_d1_7 .req q31 + +v_data .req v_d1_1 +q_data .req q_d1_1 +v_data_lo .req v_d1_2 +v_data_hi .req v_d1_3 + + +cdecl(gf_vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + add x_tbl, x_tbl, x_vec_i + add x_src_end, x_src, x_len + + ldr q_gft1_lo, [x_tbl] + ldr q_gft1_hi, [x_tbl, #16] + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #128 + +.Lloop128: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + ldr q_data_4, [x_src, #16*4] + ldr q_data_5, [x_src, #16*5] + ldr q_data_6, [x_src, #16*6] + ldr q_data_7, [x_src, #16*7] + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + ldr q_d1_4, [x_dest1, #16*4] + ldr q_d1_5, [x_dest1, #16*5] + ldr q_d1_6, [x_dest1, #16*6] + ldr q_d1_7, [x_dest1, #16*7] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b + and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b + and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b + and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + ushr v_data_4_hi.16b, v_data_4.16b, #4 + ushr v_data_5_hi.16b, v_data_5.16b, #4 + ushr v_data_6_hi.16b, v_data_6.16b, #4 + ushr v_data_7_hi.16b, v_data_7.16b, #4 + + 
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + + eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b + eor v_d1_1.16b, v_tmp2_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + + eor v_d1_2.16b, v_tmp1_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b + eor v_d1_3.16b, v_tmp2_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b + tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b + tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b + + eor v_d1_4.16b, v_tmp1_lo.16b, v_d1_4.16b + eor v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b + eor v_d1_5.16b, v_tmp2_lo.16b, v_d1_5.16b + eor v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b + tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b + tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b + + eor v_d1_6.16b, v_tmp1_lo.16b, v_d1_6.16b + eor v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b + eor v_d1_7.16b, v_tmp2_lo.16b, v_d1_7.16b + eor v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + str q_d1_4, [x_dest1, #16*4] + str q_d1_5, [x_dest1, #16*5] + str q_d1_6, [x_dest1, #16*6] + str q_d1_7, [x_dest1, #16*7] + + add x_src, x_src, #128 + add x_dest1, x_dest1, #128 + cmp x_src, x_src_end + bls .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #128 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b + + str q_d1_0, [x_dest1] + + add x_dest1, x_dest1, #16 + add x_src, x_src, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b + and v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b + + str q_d1_0, [x_dest1] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + 
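[Editor's aside, illustrative only -- not part of the commit.] The NEON routine above, like the SVE and x86 variants elsewhere in this change, is a vectorised form of a very simple scalar operation: dest[i] ^= c * src[i] in GF(2^8). The sketch below shows that scalar equivalent in C, assuming the usual isa-l layout of 32 table bytes per coefficient (16 low-nibble products followed by 16 pre-shifted high-nibble products, as produced by gf_vect_mul_init); the name gf_vect_mad_ref is made up here and is not part of the library.

/* Scalar reference for the multiply-accumulate the assembly implements. */
static void gf_vect_mad_ref(int len, int vec, int vec_i,
                            const unsigned char *gftbls,
                            const unsigned char *src, unsigned char *dest)
{
        const unsigned char *lo = gftbls + 32 * vec_i;  /* c * n        for n = 0..15 */
        const unsigned char *hi = lo + 16;              /* c * (n << 4) for n = 0..15 */

        (void)vec;  /* unused in the single-destination variant, as in the assembly */

        for (int i = 0; i < len; i++)
                dest[i] ^= (unsigned char)(lo[src[i] & 0x0f] ^ hi[src[i] >> 4]);
}

The assembly performs the same lookup-and-xor per byte, just 16 bytes (NEON) or one SVE vector length at a time, which is why inputs shorter than 16 bytes take the .return_fail path.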
+ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S new file mode 100644 index 0000000000..41d6da9d9a --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S @@ -0,0 +1,126 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_vect_mad_sve) +#ifndef __APPLE__ +.type gf_vect_mad_sve, %function +#endif + +/* gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest .req z3 + +z_tmp1_lo .req z4 +z_tmp1_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +cdecl(gf_vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* prefetch dest data */ + prfb pldl2strm, p0, [x_dest, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest.b, p0/z, [x_dest, x_pos] + + /* table indexing, ie. 
gf(2^8) multiplication */ + tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b + + /* exclusive or, ie. gf(2^8) add */ + eor z_dest.d, z_tmp1_lo.d, z_dest.d + eor z_dest.d, z_tmp1_hi.d, z_dest.d + + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S new file mode 100644 index 0000000000..096b91dd29 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S @@ -0,0 +1,240 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_vect_mul_neon) +#ifndef __APPLE__ +.type gf_vect_mul_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_tbl .req x1 +x_src .req x2 +x_dest .req x3 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_dest1 .req x_dest +x_src_end .req x4 +x_tmp .req x5 + +/* vectors */ +v_mask0f .req v0 + +v_gft1_lo .req v2 +v_gft1_hi .req v3 +q_gft1_lo .req q2 +q_gft1_hi .req q3 + +v_data_0 .req v16 +v_data_1 .req v17 +v_data_2 .req v18 +v_data_3 .req v19 +v_data_4 .req v20 +v_data_5 .req v21 +v_data_6 .req v22 +v_data_7 .req v23 +q_data_0 .req q16 +q_data_1 .req q17 +q_data_2 .req q18 +q_data_3 .req q19 +q_data_4 .req q20 +q_data_5 .req q21 +q_data_6 .req q22 +q_data_7 .req q23 + +v_data_0_lo .req v24 +v_data_1_lo .req v25 +v_data_2_lo .req v26 +v_data_3_lo .req v27 +v_data_4_lo .req v28 +v_data_5_lo .req v29 +v_data_6_lo .req v30 +v_data_7_lo .req v31 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 +v_data_4_hi .req v_data_4 +v_data_5_hi .req v_data_5 +v_data_6_hi .req v_data_6 +v_data_7_hi .req v_data_7 + + +cdecl(gf_vect_mul_neon): + /* less than 32 bytes, return_fail */ + cmp x_len, #32 + blt .return_fail + + movi v_mask0f.16b, #0x0f + add x_src_end, x_src, x_len + ldr q_gft1_lo, [x_tbl] + ldr q_gft1_hi, [x_tbl, #16] + + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop32_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #128 + +.Lloop128: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + ldr q_data_4, [x_src, #16*4] + ldr q_data_5, [x_src, #16*5] + ldr q_data_6, [x_src, #16*6] + ldr q_data_7, [x_src, #16*7] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b + and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b + and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b + and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + ushr v_data_4_hi.16b, v_data_4.16b, #4 + ushr v_data_5_hi.16b, v_data_5.16b, #4 + ushr v_data_6_hi.16b, v_data_6.16b, #4 + ushr v_data_7_hi.16b, v_data_7.16b, #4 + + tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b + tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b + tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b + tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b + + tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b + tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b + tbl v_data_6_hi.16b, 
{v_gft1_hi.16b}, v_data_6_hi.16b + tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b + + eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b + eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b + eor v_data_2.16b, v_data_2_hi.16b, v_data_2_lo.16b + eor v_data_3.16b, v_data_3_hi.16b, v_data_3_lo.16b + eor v_data_4.16b, v_data_4_hi.16b, v_data_4_lo.16b + eor v_data_5.16b, v_data_5_hi.16b, v_data_5_lo.16b + eor v_data_6.16b, v_data_6_hi.16b, v_data_6_lo.16b + eor v_data_7.16b, v_data_7_hi.16b, v_data_7_lo.16b + + str q_data_0, [x_dest1, #16*0] + str q_data_1, [x_dest1, #16*1] + str q_data_2, [x_dest1, #16*2] + str q_data_3, [x_dest1, #16*3] + str q_data_4, [x_dest1, #16*4] + str q_data_5, [x_dest1, #16*5] + str q_data_6, [x_dest1, #16*6] + str q_data_7, [x_dest1, #16*7] + + add x_src, x_src, #128 + add x_dest1, x_dest1, #128 + cmp x_src, x_src_end + bls .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #128 + cmp x_src, x_src_end + beq .return_pass + +.Lloop32_init: + sub x_src_end, x_src_end, #32 + cmp x_src, x_src_end + bhi .return_fail + +.Lloop32: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b + eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b + str q_data_0, [x_dest1, #16*0] + str q_data_1, [x_dest1, #16*1] + + add x_dest1, x_dest1, #32 + add x_src, x_src, #32 + cmp x_src, x_src_end + bls .Lloop32 + +.Lloop32_end: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #32 + beq .return_pass + b .return_fail + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S new file mode 100644 index 0000000000..d2219bf54c --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S @@ -0,0 +1,123 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_vect_mul_sve) +#ifndef __APPLE__ +.type gf_vect_mul_sve, %function +#endif + +/* Refer to include/gf_vect_mul.h + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + * + * int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest); + */ + +/* arguments */ +x_len .req x0 +x_tbl .req x1 +x_src .req x2 +x_dest .req x3 +x_tmp .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x5 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src /* reuse */ + +z_dest .req z3 +z_tmp1_lo .req z4 +z_tmp1_hi .req z_dest /* reuse */ + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +cdecl(gf_vect_mul_sve): + /* len not aligned to 32B, return_fail */ + and x_tmp, x_len, #0x1f + cmp x_tmp, #0 + bne .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest.d, z_tmp1_hi.d, z_tmp1_lo.d + + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ya.make b/contrib/libs/isa-l/erasure_code/aarch64/ya.make new file mode 100644 index 0000000000..ba7f601cfa --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/ya.make @@ -0,0 +1,51 @@ +LIBRARY() + +LICENSE(BSD-3-Clause) + +LICENSE_TEXTS(.yandex_meta/licenses.list.txt) + +VERSION(2.31) + +NO_UTIL() + +NO_COMPILER_WARNINGS() + +ADDINCL( + contrib/libs/isa-l/include +) + +IF(ARCH_AARCH64) +CFLAGS(-D__ASSEMBLY__) +SRCS( + ec_multibinary_arm.S + gf_2vect_dot_prod_neon.S + gf_2vect_dot_prod_sve.S + gf_2vect_mad_neon.S + gf_2vect_mad_sve.S + gf_3vect_dot_prod_neon.S + gf_3vect_dot_prod_sve.S + gf_3vect_mad_neon.S + gf_3vect_mad_sve.S + gf_4vect_dot_prod_neon.S + gf_4vect_dot_prod_sve.S + gf_4vect_mad_neon.S + gf_4vect_mad_sve.S + gf_5vect_dot_prod_neon.S + gf_5vect_dot_prod_sve.S + gf_5vect_mad_neon.S + gf_5vect_mad_sve.S + gf_6vect_dot_prod_sve.S + gf_6vect_mad_neon.S + gf_6vect_mad_sve.S + gf_7vect_dot_prod_sve.S + gf_8vect_dot_prod_sve.S + gf_vect_dot_prod_neon.S + gf_vect_dot_prod_sve.S + gf_vect_mad_neon.S + gf_vect_mad_sve.S + gf_vect_mul_neon.S + gf_vect_mul_sve.S +) +ENDIF() + +END() diff --git a/contrib/libs/isa-l/erasure_code/ec_base.c b/contrib/libs/isa-l/erasure_code/ec_base.c index 9a8fbc759e..c076b517bf 100644 --- a/contrib/libs/isa-l/erasure_code/ec_base.c +++ b/contrib/libs/isa-l/erasure_code/ec_base.c @@ -29,10 +29,12 @@ #include <limits.h> #include <string.h> // for memset +#include <stdint.h> + #include "erasure_code.h" #include "ec_base.h" // for GF tables -void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls) +void ec_init_tables_base(int k, int rows, unsigned char *a, unsigned char *g_tbls) { int i, j; @@ -171,7 +173,7 @@ void gf_vect_mul_init(unsigned char c, unsigned char *tbl) unsigned char c4 = (c2 << 1) ^ ((c2 & 0x80) ? 0x1d : 0); //Mult by GF{2} unsigned char c8 = (c4 << 1) ^ ((c4 & 0x80) ? 
0x1d : 0); //Mult by GF{2} -#if __WORDSIZE == 64 || _WIN64 || __x86_64__ +#if (__WORDSIZE == 64 || _WIN64 || __x86_64__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) unsigned long long v1, v2, v4, v8, *t; unsigned long long v10, v20, v40, v80; unsigned char c17, c18, c20, c24; @@ -331,41 +333,17 @@ void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned ch } } -void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest) +int gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest) { //2nd element of table array is ref value used to fill it in unsigned char c = a[1]; + + // Len must be aligned to 32B + if ((len % 32) != 0) { + return -1; + } + while (len-- > 0) *dest++ = gf_mul_erasure(c, *src++); + return 0; } - -struct slver { - unsigned short snum; - unsigned char ver; - unsigned char core; -}; - -// Version info -struct slver gf_vect_mul_init_slver_00020035; -struct slver gf_vect_mul_init_slver = { 0x0035, 0x02, 0x00 }; - -struct slver ec_encode_data_base_slver_00010135; -struct slver ec_encode_data_base_slver = { 0x0135, 0x01, 0x00 }; - -struct slver gf_vect_mul_base_slver_00010136; -struct slver gf_vect_mul_base_slver = { 0x0136, 0x01, 0x00 }; - -struct slver gf_vect_dot_prod_base_slver_00010137; -struct slver gf_vect_dot_prod_base_slver = { 0x0137, 0x01, 0x00 }; - -struct slver gf_mul_slver_00000214; -struct slver gf_mul_slver = { 0x0214, 0x00, 0x00 }; - -struct slver gf_invert_matrix_slver_00000215; -struct slver gf_invert_matrix_slver = { 0x0215, 0x00, 0x00 }; - -struct slver gf_gen_rs_matrix_slver_00000216; -struct slver gf_gen_rs_matrix_slver = { 0x0216, 0x00, 0x00 }; - -struct slver gf_gen_cauchy1_matrix_slver_00000217; -struct slver gf_gen_cauchy1_matrix_slver = { 0x0217, 0x00, 0x00 }; diff --git a/contrib/libs/isa-l/erasure_code/ec_base.h b/contrib/libs/isa-l/erasure_code/ec_base.h index 070b276652..ace384968b 100644 --- a/contrib/libs/isa-l/erasure_code/ec_base.h +++ b/contrib/libs/isa-l/erasure_code/ec_base.h @@ -30,6 +30,77 @@ #ifndef _EC_BASE_H_ #define _EC_BASE_H_ +#include <stdint.h> + +#define MAX_NUM_OUTPUTS_CALL 6 + +static const uint64_t gf_table_gfni[256] = { + 0x0000000000000000, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0, + 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0, + 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0, + 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0, + 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448, + 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468, + 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58, + 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78, + 0x8810a8d83871e2c4, 0x8912acd02851a244, 0x08112a5cb061c284, 0x09132e54a0418204, + 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224, + 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14, + 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34, + 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c, + 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac, + 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c, + 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc, + 0xc488d46c1c3871e2, 0xc58ad0640c183162, 
0x448956e8942851a2, 0x458b52e084081122, + 0x840895aed8b061c2, 0x850a91a6c8902142, 0x0409172a50a04182, 0x050b132240800102, + 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932, + 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912, + 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa, + 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a, + 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba, + 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a, + 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 0xcd9bfa38bc79f3e6, + 0x0c183d76e0c18306, 0x0d1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6, + 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6, + 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6, + 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e, + 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e, + 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e, + 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e, + 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1, + 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891, + 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1, + 0x82048b95a850a041, 0x83068f9db870e0c1, 0x0205091120408001, 0x03070d193060c081, + 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39, + 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19, + 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429, + 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409, + 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75, + 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55, + 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265, + 0x0a14234d90214285, 0x0b16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245, + 0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd, + 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd, + 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed, + 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd, + 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953, + 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973, + 0x060c1e3b70e0c183, 0x070e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143, + 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163, + 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb, + 0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb, + 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb, + 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb, + 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97, + 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7, + 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0x0e1d3467c0810307, 0x0f1f306fd0a14387, + 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 
0x4f9f71ad142953a7, + 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f, + 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f, + 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f, + 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f +}; + // Global GF(256) tables #ifndef GF_LARGE_TABLES static const unsigned char gff_base[] = { diff --git a/contrib/libs/isa-l/erasure_code/ec_base.patch b/contrib/libs/isa-l/erasure_code/ec_base.patch deleted file mode 100644 index 86a927f8c3..0000000000 --- a/contrib/libs/isa-l/erasure_code/ec_base.patch +++ /dev/null @@ -1,44 +0,0 @@ -47c47 -< unsigned char gf_mul_erasure(unsigned char a, unsigned char b) ---- -> unsigned char gf_mul(unsigned char a, unsigned char b) -86c86 -< p = gf_mul_erasure(p, gen); ---- -> p = gf_mul(p, gen); -88c88 -< gen = gf_mul_erasure(gen, 2); ---- -> gen = gf_mul(gen, 2); -147,148c147,148 -< in_mat[i * n + j] = gf_mul_erasure(in_mat[i * n + j], temp); -< out_mat[i * n + j] = gf_mul_erasure(out_mat[i * n + j], temp); ---- -> in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp); -> out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp); -157,158c157,158 -< out_mat[j * n + k] ^= gf_mul_erasure(temp, out_mat[i * n + k]); -< in_mat[j * n + k] ^= gf_mul_erasure(temp, in_mat[i * n + k]); ---- -> out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]); -> in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]); -283c283 -< s ^= gf_mul_erasure(src[j][i], v[j * 32 + 1]); ---- -> s ^= gf_mul(src[j][i], v[j * 32 + 1]); -296c296 -< s ^= gf_mul_erasure(src[i], v[vec_i * 32 + 1]); ---- -> s ^= gf_mul(src[i], v[vec_i * 32 + 1]); -311c311 -< s ^= gf_mul_erasure(src[j][i], v[j * 32 + l * srcs * 32 + 1]); ---- -> s ^= gf_mul(src[j][i], v[j * 32 + l * srcs * 32 + 1]); -327c327 -< s ^= gf_mul_erasure(data[i], v[vec_i * 32 + l * k * 32 + 1]); ---- -> s ^= gf_mul(data[i], v[vec_i * 32 + l * k * 32 + 1]); -339c339 -< *dest++ = gf_mul_erasure(c, *src++); ---- -> *dest++ = gf_mul(c, *src++); diff --git a/contrib/libs/isa-l/erasure_code/ec_base_aliases.c b/contrib/libs/isa-l/erasure_code/ec_base_aliases.c index d046ff61ad..705dfb685c 100644 --- a/contrib/libs/isa-l/erasure_code/ec_base_aliases.c +++ b/contrib/libs/isa-l/erasure_code/ec_base_aliases.c @@ -56,6 +56,10 @@ void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *v int gf_vect_mul(int len, unsigned char *a, void *src, void *dest) { - gf_vect_mul_base(len, a, (unsigned char *)src, (unsigned char *)dest); - return 0; + return gf_vect_mul_base(len, a, (unsigned char *)src, (unsigned char *)dest); +} + +void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls) +{ + return ec_init_tables_base(k, rows, a, g_tbls); } diff --git a/contrib/libs/isa-l/erasure_code/ec_highlevel_func.c b/contrib/libs/isa-l/erasure_code/ec_highlevel_func.c index c57d460a61..373cd33726 100644 --- a/contrib/libs/isa-l/erasure_code/ec_highlevel_func.c +++ b/contrib/libs/isa-l/erasure_code/ec_highlevel_func.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2011-2015 Intel Corporation All rights reserved. + Copyright(c) 2011-2019 Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -28,6 +28,7 @@ **********************************************************************/ #include <limits.h> #include "erasure_code.h" +#include "ec_base.h" /* for GF tables */ #if __x86_64__ || __i386__ || _M_X64 || _M_IX86 void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, @@ -39,13 +40,19 @@ void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigne return; } - while (rows >= 4) { - gf_4vect_dot_prod_sse(len, k, g_tbls, data, coding); - g_tbls += 4 * k * 32; - coding += 4; - rows -= 4; + while (rows >= 6) { + gf_6vect_dot_prod_sse(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; } switch (rows) { + case 5: + gf_5vect_dot_prod_sse(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_sse(len, k, g_tbls, data, coding); + break; case 3: gf_3vect_dot_prod_sse(len, k, g_tbls, data, coding); break; @@ -69,13 +76,19 @@ void ec_encode_data_avx(int len, int k, int rows, unsigned char *g_tbls, unsigne return; } - while (rows >= 4) { - gf_4vect_dot_prod_avx(len, k, g_tbls, data, coding); - g_tbls += 4 * k * 32; - coding += 4; - rows -= 4; + while (rows >= 6) { + gf_6vect_dot_prod_avx(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; } switch (rows) { + case 5: + gf_5vect_dot_prod_avx(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_avx(len, k, g_tbls, data, coding); + break; case 3: gf_3vect_dot_prod_avx(len, k, g_tbls, data, coding); break; @@ -100,13 +113,19 @@ void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsign return; } - while (rows >= 4) { - gf_4vect_dot_prod_avx2(len, k, g_tbls, data, coding); - g_tbls += 4 * k * 32; - coding += 4; - rows -= 4; + while (rows >= 6) { + gf_6vect_dot_prod_avx2(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; } switch (rows) { + case 5: + gf_5vect_dot_prod_avx2(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_avx2(len, k, g_tbls, data, coding); + break; case 3: gf_3vect_dot_prod_avx2(len, k, g_tbls, data, coding); break; @@ -132,6 +151,10 @@ extern int gf_3vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls, unsigned char **data, unsigned char **coding); extern int gf_4vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls, unsigned char **data, unsigned char **coding); +extern int gf_5vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); +extern int gf_6vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); extern void gf_vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, unsigned char *dest); extern void gf_2vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls, @@ -140,6 +163,10 @@ extern void gf_3vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftb unsigned char *src, unsigned char **dest); extern void gf_4vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, unsigned char **dest); +extern void gf_5vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_6vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); void 
ec_encode_data_avx512(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, unsigned char **coding) @@ -150,13 +177,19 @@ void ec_encode_data_avx512(int len, int k, int rows, unsigned char *g_tbls, return; } - while (rows >= 4) { - gf_4vect_dot_prod_avx512(len, k, g_tbls, data, coding); - g_tbls += 4 * k * 32; - coding += 4; - rows -= 4; + while (rows >= 6) { + gf_6vect_dot_prod_avx512(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; } switch (rows) { + case 5: + gf_5vect_dot_prod_avx512(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_avx512(len, k, g_tbls, data, coding); + break; case 3: gf_3vect_dot_prod_avx512(len, k, g_tbls, data, coding); break; @@ -179,13 +212,19 @@ void ec_encode_data_update_avx512(int len, int k, int rows, int vec_i, unsigned return; } - while (rows >= 4) { - gf_4vect_mad_avx512(len, k, vec_i, g_tbls, data, coding); - g_tbls += 4 * k * 32; - coding += 4; - rows -= 4; + while (rows >= 6) { + gf_6vect_mad_avx512(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; } switch (rows) { + case 5: + gf_5vect_mad_avx512(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_avx512(len, k, vec_i, g_tbls, data, coding); + break; case 3: gf_3vect_mad_avx512(len, k, vec_i, g_tbls, data, coding); break; @@ -200,6 +239,179 @@ void ec_encode_data_update_avx512(int len, int k, int rows, int vec_i, unsigned } } +#if AS_FEATURE_LEVEL >= 10 + +extern void gf_vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char *dest); +extern void gf_2vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); +extern void gf_3vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); +extern void gf_4vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); +extern void gf_5vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); +extern void gf_6vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); + +extern void gf_vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest); +extern void gf_2vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_3vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_4vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_5vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_6vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + +extern void gf_vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char *dest); +extern void gf_2vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); +extern void gf_3vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding); +extern void gf_vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char 
*gftbls, + unsigned char *src, unsigned char *dest); +extern void gf_2vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_3vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_4vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_5vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + +void ec_init_tables_gfni(int k, int rows, unsigned char *a, unsigned char *g_tbls) +{ + int i, j; + + uint64_t *g64 = (uint64_t *) g_tbls; + + for (i = 0; i < rows; i++) + for (j = 0; j < k; j++) + *(g64++) = gf_table_gfni[*a++]; + +} + +void ec_encode_data_avx512_gfni(int len, int k, int rows, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding) +{ + + while (rows >= 6) { + gf_6vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 8; + coding += 6; + rows -= 6; + } + switch (rows) { + case 5: + gf_5vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding); + break; + case 3: + gf_3vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding); + break; + case 2: + gf_2vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding); + break; + case 1: + gf_vect_dot_prod_avx512_gfni(len, k, g_tbls, data, *coding); + break; + case 0: + default: + break; + } +} + +void ec_encode_data_avx2_gfni(int len, int k, int rows, unsigned char *g_tbls, + unsigned char **data, unsigned char **coding) +{ + while (rows >= 3) { + gf_3vect_dot_prod_avx2_gfni(len, k, g_tbls, data, coding); + g_tbls += 3 * k * 8; + coding += 3; + rows -= 3; + } + switch (rows) { + case 2: + gf_2vect_dot_prod_avx2_gfni(len, k, g_tbls, data, coding); + break; + case 1: + gf_vect_dot_prod_avx2_gfni(len, k, g_tbls, data, *coding); + break; + case 0: + default: + break; + } +} + +void ec_encode_data_update_avx512_gfni(int len, int k, int rows, int vec_i, + unsigned char *g_tbls, unsigned char *data, + unsigned char **coding) +{ + while (rows >= 6) { + gf_6vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 8; + coding += 6; + rows -= 6; + } + switch (rows) { + case 5: + gf_5vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, *coding); + break; + case 0: + default: + break; + } +} + +void ec_encode_data_update_avx2_gfni(int len, int k, int rows, int vec_i, + unsigned char *g_tbls, unsigned char *data, + unsigned char **coding) +{ + while (rows >= 5) { + gf_5vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, coding); + g_tbls += 5 * k * 8; + coding += 5; + rows -= 5; + } + switch (rows) { + case 4: + gf_4vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, *coding); + break; + case 0: + default: + break; + } +} + +#endif // AS_FEATURE_LEVEL >= 10 #endif // 
HAVE_AS_KNOWS_AVX512 #if __WORDSIZE == 64 || _WIN64 || __x86_64__ @@ -321,16 +533,3 @@ void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned ch #endif //__WORDSIZE == 64 || _WIN64 || __x86_64__ #endif //__x86_64__ || __i386__ || _M_X64 || _M_IX86 - -struct slver { - unsigned short snum; - unsigned char ver; - unsigned char core; -}; - -// Version info -struct slver ec_init_tables_slver_00010068; -struct slver ec_init_tables_slver = { 0x0068, 0x01, 0x00 }; - -struct slver ec_encode_data_sse_slver_00020069; -struct slver ec_encode_data_sse_slver = { 0x0069, 0x02, 0x00 }; diff --git a/contrib/libs/isa-l/erasure_code/ec_multibinary.asm b/contrib/libs/isa-l/erasure_code/ec_multibinary.asm index a07f45d6f8..424687877d 100644 --- a/contrib/libs/isa-l/erasure_code/ec_multibinary.asm +++ b/contrib/libs/isa-l/erasure_code/ec_multibinary.asm @@ -53,6 +53,16 @@ extern gf_vect_mad_avx2 %endif +%if (AS_FEATURE_LEVEL) >= 10 + extern ec_init_tables_gfni + extern ec_encode_data_avx512_gfni + extern ec_encode_data_avx2_gfni + extern ec_encode_data_update_avx512_gfni + extern ec_encode_data_update_avx2_gfni +%endif + +extern ec_init_tables_base + extern gf_vect_mul_base extern ec_encode_data_base extern ec_encode_data_update_base @@ -71,6 +81,7 @@ mbin_interface gf_vect_dot_prod mbin_interface gf_vect_mul mbin_interface ec_encode_data_update mbin_interface gf_vect_mad +mbin_interface ec_init_tables %ifidn __OUTPUT_FORMAT__, elf32 mbin_dispatch_init5 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2 @@ -78,18 +89,13 @@ mbin_interface gf_vect_mad mbin_dispatch_init2 gf_vect_mul, gf_vect_mul_base mbin_dispatch_init2 ec_encode_data_update, ec_encode_data_update_base mbin_dispatch_init2 gf_vect_mad, gf_vect_mad_base + mbin_dispatch_init2 ec_init_tables, ec_init_tables_base %else mbin_dispatch_init5 gf_vect_mul, gf_vect_mul_base, gf_vect_mul_sse, gf_vect_mul_avx, gf_vect_mul_avx - mbin_dispatch_init6 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2, ec_encode_data_avx512 - mbin_dispatch_init6 ec_encode_data_update, ec_encode_data_update_base, ec_encode_data_update_sse, ec_encode_data_update_avx, ec_encode_data_update_avx2, ec_encode_data_update_avx512 + mbin_dispatch_init8 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2, ec_encode_data_avx512, ec_encode_data_avx2_gfni, ec_encode_data_avx512_gfni + mbin_dispatch_init8 ec_encode_data_update, ec_encode_data_update_base, ec_encode_data_update_sse, ec_encode_data_update_avx, ec_encode_data_update_avx2, ec_encode_data_update_avx512, ec_encode_data_update_avx2_gfni, ec_encode_data_update_avx512_gfni mbin_dispatch_init6 gf_vect_mad, gf_vect_mad_base, gf_vect_mad_sse, gf_vect_mad_avx, gf_vect_mad_avx2, gf_vect_mad_avx512 mbin_dispatch_init6 gf_vect_dot_prod, gf_vect_dot_prod_base, gf_vect_dot_prod_sse, gf_vect_dot_prod_avx, gf_vect_dot_prod_avx2, gf_vect_dot_prod_avx512 + mbin_dispatch_init8 ec_init_tables, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_gfni, ec_init_tables_gfni %endif - -;;; func core, ver, snum -slversion ec_encode_data, 00, 06, 0133 -slversion gf_vect_mul, 00, 05, 0134 -slversion ec_encode_data_update, 00, 05, 0212 -slversion gf_vect_dot_prod, 00, 05, 0138 -slversion gf_vect_mad, 00, 04, 0213 diff --git a/contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm 
b/contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm index 8c2537f562..c05ff5b720 100644 --- a/contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm +++ b/contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm @@ -53,6 +53,16 @@ extern _gf_vect_mad_avx2 %endif +%if (AS_FEATURE_LEVEL) >= 10 + extern _ec_init_tables_gfni + extern _ec_encode_data_avx512_gfni + extern _ec_encode_data_avx2_gfni + extern _ec_encode_data_update_avx512_gfni + extern _ec_encode_data_update_avx2_gfni +%endif + +extern _ec_init_tables_base + extern _gf_vect_mul_base extern _ec_encode_data_base extern _ec_encode_data_update_base @@ -71,6 +81,7 @@ mbin_interface _gf_vect_dot_prod mbin_interface _gf_vect_mul mbin_interface _ec_encode_data_update mbin_interface _gf_vect_mad +mbin_interface _ec_init_tables %ifidn __OUTPUT_FORMAT__, elf32 mbin_dispatch_init5 _ec_encode_data, _ec_encode_data_base, _ec_encode_data_sse, _ec_encode_data_avx, _ec_encode_data_avx2 @@ -78,19 +89,13 @@ mbin_interface _gf_vect_mad mbin_dispatch_init2 _gf_vect_mul, _gf_vect_mul_base mbin_dispatch_init2 _ec_encode_data_update, _ec_encode_data_update_base mbin_dispatch_init2 _gf_vect_mad, _gf_vect_mad_base + mbin_dispatch_init2 _ec_init_tables, _ec_init_tables_base %else mbin_dispatch_init5 _gf_vect_mul, _gf_vect_mul_base, _gf_vect_mul_sse, _gf_vect_mul_avx, _gf_vect_mul_avx - mbin_dispatch_init6 _ec_encode_data, _ec_encode_data_base, _ec_encode_data_sse, _ec_encode_data_avx, _ec_encode_data_avx2, _ec_encode_data_avx512 - mbin_dispatch_init6 _ec_encode_data_update, _ec_encode_data_update_base, _ec_encode_data_update_sse, _ec_encode_data_update_avx, _ec_encode_data_update_avx2, _ec_encode_data_update_avx512 + mbin_dispatch_init8 _ec_encode_data, _ec_encode_data_base, _ec_encode_data_sse, _ec_encode_data_avx, _ec_encode_data_avx2, _ec_encode_data_avx512, _ec_encode_data_avx2_gfni, _ec_encode_data_avx512_gfni + mbin_dispatch_init8 _ec_encode_data_update, _ec_encode_data_update_base, _ec_encode_data_update_sse, _ec_encode_data_update_avx, _ec_encode_data_update_avx2, _ec_encode_data_update_avx512, _ec_encode_data_update_avx2_gfni, _ec_encode_data_update_avx512_gfni mbin_dispatch_init6 _gf_vect_mad, _gf_vect_mad_base, _gf_vect_mad_sse, _gf_vect_mad_avx, _gf_vect_mad_avx2, _gf_vect_mad_avx512 mbin_dispatch_init6 _gf_vect_dot_prod, _gf_vect_dot_prod_base, _gf_vect_dot_prod_sse, _gf_vect_dot_prod_avx, _gf_vect_dot_prod_avx2, _gf_vect_dot_prod_avx512 + mbin_dispatch_init8 _ec_init_tables, _ec_init_tables_base, _ec_init_tables_base, _ec_init_tables_base, _ec_init_tables_base, _ec_init_tables_base, _ec_init_tables_gfni, _ec_init_tables_gfni %endif - - -;;; func core, ver, snum -slversion ec_encode_data, 00, 06, 0133 -slversion gf_vect_mul, 00, 05, 0134 -slversion ec_encode_data_update, 00, 05, 0212 -slversion gf_vect_dot_prod, 00, 05, 0138 -slversion gf_vect_mad, 00, 04, 0213 diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c b/contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c index 9587788d86..4fca10599d 100644 --- a/contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c +++ b/contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c @@ -30,25 +30,26 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> // for memset, memcmp +#include <assert.h> #include "erasure_code.h" #include "test.h" -//#define CACHED_TEST -#ifdef CACHED_TEST +#ifndef GT_L3_CACHE +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +#endif + +#if !defined(COLD_TEST) && !defined(TEST_CUSTOM) // Cached test, loop many times 
over small dataset # define TEST_SOURCES 32 # define TEST_LEN(m) ((128*1024 / m) & ~(64-1)) # define TEST_TYPE_STR "_warm" -#else -# ifndef TEST_CUSTOM +#elif defined (COLD_TEST) // Uncached test. Pull from large mem base. -# define TEST_SOURCES 32 -# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ -# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1)) -# define TEST_TYPE_STR "_cold" -# else -# define TEST_TYPE_STR "_cus" -# endif +# define TEST_SOURCES 32 +# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1)) +# define TEST_TYPE_STR "_cold" +#elif defined (TEST_CUSTOM) +# define TEST_TYPE_STR "_cus" #endif #define MMAX TEST_SOURCES @@ -60,7 +61,7 @@ typedef unsigned char u8; void ec_encode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs) { - ec_init_tables(k, m - k, &a[k * k], g_tbls); + ec_init_tables_base(k, m - k, &a[k * k], g_tbls); ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]); } @@ -88,7 +89,7 @@ int ec_decode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs, u8 * src_in_e c[k * i + j] = d[k * src_err_list[i] + j]; // Recover data - ec_init_tables(k, nerrs, c, g_tbls); + ec_init_tables_base(k, nerrs, c, g_tbls); ec_encode_data_base(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs); return 0; @@ -112,10 +113,8 @@ int main(int argc, char *argv[]) printf("erasure_code_base_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs); - if (m > MMAX || k > KMAX || nerrs > (m - k)) { - printf(" Input test parameter error\n"); - return -1; - } + // check input parameters + assert(!(m > MMAX || k > KMAX || nerrs > (m - k))); memcpy(src_err_list, err_list, nerrs); memset(src_in_err, 0, TEST_SOURCES); diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_base_test.c b/contrib/libs/isa-l/erasure_code/erasure_code_base_test.c index a87f33f9f4..ad48d8e448 100644 --- a/contrib/libs/isa-l/erasure_code/erasure_code_base_test.c +++ b/contrib/libs/isa-l/erasure_code/erasure_code_base_test.c @@ -30,10 +30,11 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> // for memset, memcmp +#include <assert.h> #include "erasure_code.h" -// #include "types.h" +#include "test.h" -#define TEST_LEN 512 +#define TEST_LEN 8192 #define TEST_SIZE (TEST_LEN/2) #ifndef TEST_SOURCES @@ -264,8 +265,7 @@ int main(int argc, char *argv[]) // Pick a first test m = 9; k = 5; - if (m > MMAX || k > KMAX) - return -1; + assert((m <= MMAX) && (k <= KMAX)); // Make random data for (i = 0; i < k; i++) @@ -278,7 +278,7 @@ int main(int argc, char *argv[]) gf_gen_rs_matrix(encode_matrix, m, k); // Generate g_tbls from encode matrix encode_matrix - ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls); + ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls); // Perform matrix dot_prod for EC encoding // using g_tbls from encode matrix encode_matrix @@ -304,7 +304,7 @@ int main(int argc, char *argv[]) } // Recover data - ec_init_tables(k, nerrs, decode_matrix, g_tbls); + ec_init_tables_base(k, nerrs, decode_matrix, g_tbls); ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]); for (i = 0; i < nerrs; i++) { @@ -346,7 +346,7 @@ int main(int argc, char *argv[]) gf_gen_cauchy1_matrix(encode_matrix, m, k); // Generate g_tbls from encode matrix encode_matrix - ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls); + ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls); // Perform matrix dot_prod for EC encoding // using g_tbls from encode matrix encode_matrix @@ -372,7 +372,7 @@ int main(int argc, char *argv[]) } // Recover data - ec_init_tables(k, nerrs, 
decode_matrix, g_tbls); + ec_init_tables_base(k, nerrs, decode_matrix, g_tbls); ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]); for (i = 0; i < nerrs; i++) { @@ -417,7 +417,7 @@ int main(int argc, char *argv[]) // Make parity vects // Generate g_tbls from encode matrix a - ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls); + ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls); // Perform matrix dot_prod for EC encoding // using g_tbls from encode matrix a ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]); @@ -442,7 +442,7 @@ int main(int argc, char *argv[]) } // Recover data - ec_init_tables(k, nerrs, decode_matrix, g_tbls); + ec_init_tables_base(k, nerrs, decode_matrix, g_tbls); ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]); for (i = 0; i < nerrs; i++) { @@ -470,7 +470,9 @@ int main(int argc, char *argv[]) return -1; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Run tests at end of buffer for Electric Fence @@ -500,7 +502,7 @@ int main(int argc, char *argv[]) // Make parity vects // Generate g_tbls from encode matrix a - ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls); + ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls); // Perform matrix dot_prod for EC encoding // using g_tbls from encode matrix a ec_encode_data_base(size, k, m - k, g_tbls, efence_buffs, @@ -526,7 +528,7 @@ int main(int argc, char *argv[]) } // Recover data - ec_init_tables(k, nerrs, decode_matrix, g_tbls); + ec_init_tables_base(k, nerrs, decode_matrix, g_tbls); ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]); for (i = 0; i < nerrs; i++) { @@ -593,7 +595,7 @@ int main(int argc, char *argv[]) // Make parity vects // Generate g_tbls from encode matrix a - ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls); + ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls); // Perform matrix dot_prod for EC encoding // using g_tbls from encode matrix a ec_encode_data_base(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]); @@ -618,7 +620,7 @@ int main(int argc, char *argv[]) } // Recover data - ec_init_tables(k, nerrs, decode_matrix, g_tbls); + ec_init_tables_base(k, nerrs, decode_matrix, g_tbls); ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]); for (i = 0; i < nerrs; i++) { @@ -681,7 +683,9 @@ int main(int argc, char *argv[]) } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Test size alignment @@ -705,7 +709,7 @@ int main(int argc, char *argv[]) // Make parity vects // Generate g_tbls from encode matrix a - ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls); + ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls); // Perform matrix dot_prod for EC encoding // using g_tbls from encode matrix a ec_encode_data_base(size, k, m - k, g_tbls, buffs, &buffs[k]); @@ -729,7 +733,7 @@ int main(int argc, char *argv[]) } // Recover data - ec_init_tables(k, nerrs, decode_matrix, g_tbls); + ec_init_tables_base(k, nerrs, decode_matrix, g_tbls); ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]); for (i = 0; i < nerrs; i++) { diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_base_test.patch b/contrib/libs/isa-l/erasure_code/erasure_code_base_test.patch deleted file mode 100644 index 0d84217177..0000000000 --- a/contrib/libs/isa-l/erasure_code/erasure_code_base_test.patch +++ /dev/null @@ -1,12 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" -36c36 -< #define TEST_LEN 512 ---- -> #define TEST_LEN 8192 -204c204 -< s ^= 
gf_mul_erasure(invert_matrix[j * k + i], ---- -> s ^= gf_mul(invert_matrix[j * k + i], diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_perf.c b/contrib/libs/isa-l/erasure_code/erasure_code_perf.c index c4cad880f1..25c8774507 100644 --- a/contrib/libs/isa-l/erasure_code/erasure_code_perf.c +++ b/contrib/libs/isa-l/erasure_code/erasure_code_perf.c @@ -33,22 +33,25 @@ #include "erasure_code.h" #include "test.h" -//#define CACHED_TEST -#ifdef CACHED_TEST +#ifndef GT_L3_CACHE +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +#endif + +#if !defined(COLD_TEST) && !defined(TEST_CUSTOM) // Cached test, loop many times over small dataset # define TEST_SOURCES 32 # define TEST_LEN(m) ((128*1024 / m) & ~(64-1)) # define TEST_TYPE_STR "_warm" -#else -# ifndef TEST_CUSTOM +#elif defined (COLD_TEST) // Uncached test. Pull from large mem base. -# define TEST_SOURCES 32 -# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ -# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1)) -# define TEST_TYPE_STR "_cold" -# else -# define TEST_TYPE_STR "_cus" -# endif +# define TEST_SOURCES 32 +# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1)) +# define TEST_TYPE_STR "_cold" +#elif defined (TEST_CUSTOM) +# define TEST_TYPE_STR "_cus" +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 #endif #define MMAX TEST_SOURCES @@ -58,14 +61,26 @@ typedef unsigned char u8; -void ec_encode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs) +void usage(const char *app_name) +{ + fprintf(stderr, + "Usage: %s [options]\n" + " -h Help\n" + " -k <val> Number of source buffers\n" + " -p <val> Number of parity buffers\n" + " -e <val> Number of simulated buffers with errors (cannot be higher than p or k)\n", + app_name); +} + +void ec_encode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs, struct perf *start) { ec_init_tables(k, m - k, &a[k * k], g_tbls); - ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]); + BENCHMARK(start, BENCHMARK_TIME, + ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k])); } int ec_decode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs, u8 * src_in_err, - u8 * src_err_list, int nerrs, u8 ** temp_buffs) + u8 * src_err_list, int nerrs, u8 ** temp_buffs, struct perf *start) { int i, j, r; u8 b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX]; @@ -89,34 +104,109 @@ int ec_decode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs, u8 * src_in_e // Recover data ec_init_tables(k, nerrs, c, g_tbls); - ec_encode_data(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs); + BENCHMARK(start, BENCHMARK_TIME, + ec_encode_data(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs)); return 0; } int main(int argc, char *argv[]) { - int i, j, m, k, nerrs, check; + int i, j, m, k, p, nerrs, check, ret = -1; void *buf; - u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES]; + u8 *temp_buffs[TEST_SOURCES] = { NULL }; + u8 *buffs[TEST_SOURCES] = { NULL }; u8 a[MMAX * KMAX]; u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES]; u8 src_err_list[TEST_SOURCES]; struct perf start; - // Pick test parameters - m = 14; - k = 10; + /* Set default parameters */ + k = 8; + p = 6; nerrs = 4; - const u8 err_list[] = { 2, 4, 5, 7 }; - printf("erasure_code_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs); + /* Parse arguments */ + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-k") == 0) { + k = atoi(argv[++i]); + } else if (strcmp(argv[i], "-p") == 0) { + p = atoi(argv[++i]); + } else if (strcmp(argv[i], "-e") == 0) { + nerrs = atoi(argv[++i]); + } else if 
(strcmp(argv[i], "-h") == 0) { + usage(argv[0]); + return 0; + } else { + usage(argv[0]); + return -1; + } + } + + if (nerrs > k) { + printf + ("Number of errors (%d) cannot be higher than number of data buffers (%d)\n", + nerrs, k); + return -1; + } + + if (k <= 0) { + printf("Number of source buffers (%d) must be > 0\n", k); + return -1; + } + + if (p <= 0) { + printf("Number of parity buffers (%d) must be > 0\n", p); + return -1; + } + + if (nerrs <= 0) { + printf("Number of errors (%d) must be > 0\n", nerrs); + return -1; + } - if (m > MMAX || k > KMAX || nerrs > (m - k)) { - printf(" Input test parameter error\n"); + if (nerrs > p) { + printf + ("Number of errors (%d) cannot be higher than number of parity buffers (%d)\n", + nerrs, p); return -1; } + m = k + p; + + if (m > MMAX) { + printf("Number of total buffers (data and parity) cannot be higher than %d\n", + MMAX); + return -1; + } + + u8 *err_list = malloc((size_t)nerrs); + if (err_list == NULL) { + printf("Error allocating list of array of error indices\n"); + return -1; + } + + srand(TEST_SEED); + + for (i = 0; i < nerrs;) { + u8 next_err = rand() % k; + for (j = 0; j < i; j++) + if (next_err == err_list[j]) + break; + if (j != i) + continue; + err_list[i++] = next_err; + } + + printf("Testing with %u data buffers and %u parity buffers (num errors = %u, in [ ", k, + p, nerrs); + for (i = 0; i < nerrs; i++) + printf("%d ", (int)err_list[i]); + + printf("])\n"); + + printf("erasure_code_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs); + memcpy(src_err_list, err_list, nerrs); memset(src_in_err, 0, TEST_SOURCES); for (i = 0; i < nerrs; i++) @@ -125,16 +215,16 @@ int main(int argc, char *argv[]) // Allocate the arrays for (i = 0; i < m; i++) { if (posix_memalign(&buf, 64, TEST_LEN(m))) { - printf("alloc error: Fail\n"); - return -1; + printf("Error allocating buffers\n"); + goto exit; } buffs[i] = buf; } - for (i = 0; i < (m - k); i++) { + for (i = 0; i < p; i++) { if (posix_memalign(&buf, 64, TEST_LEN(m))) { - printf("alloc error: Fail\n"); - return -1; + printf("Error allocating buffers\n"); + goto exit; } temp_buffs[i] = buf; } @@ -147,24 +237,24 @@ int main(int argc, char *argv[]) gf_gen_rs_matrix(a, m, k); // Start encode test - BENCHMARK(&start, BENCHMARK_TIME, ec_encode_perf(m, k, a, g_tbls, buffs)); + ec_encode_perf(m, k, a, g_tbls, buffs, &start); printf("erasure_code_encode" TEST_TYPE_STR ": "); perf_print(start, (long long)(TEST_LEN(m)) * (m)); // Start decode test - BENCHMARK(&start, BENCHMARK_TIME, check = - ec_decode_perf(m, k, a, g_tbls, buffs, src_in_err, src_err_list, nerrs, - temp_buffs)); + check = ec_decode_perf(m, k, a, g_tbls, buffs, src_in_err, src_err_list, nerrs, + temp_buffs, &start); if (check == BAD_MATRIX) { printf("BAD MATRIX\n"); - return check; + ret = check; + goto exit; } for (i = 0; i < nerrs; i++) { if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) { printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs); - return -1; + goto exit; } } @@ -172,5 +262,14 @@ int main(int argc, char *argv[]) perf_print(start, (long long)(TEST_LEN(m)) * (k + nerrs)); printf("done all: Pass\n"); - return 0; + + ret = 0; + + exit: + free(err_list); + for (i = 0; i < TEST_SOURCES; i++) { + free(buffs[i]); + free(temp_buffs[i]); + } + return ret; } diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_test.c b/contrib/libs/isa-l/erasure_code/erasure_code_test.c index 16a6457e4e..f45b38a06a 100644 --- a/contrib/libs/isa-l/erasure_code/erasure_code_test.c +++ 
b/contrib/libs/isa-l/erasure_code/erasure_code_test.c @@ -30,8 +30,9 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> // for memset, memcmp +#include <assert.h> #include "erasure_code.h" -// #include "types.h" +#include "test.h" #define TEST_LEN 8192 #define TEST_SIZE (TEST_LEN/2) @@ -215,13 +216,14 @@ static int gf_gen_decode_matrix(unsigned char *encode_matrix, int main(int argc, char *argv[]) { - int re = 0; + int re = -1; int i, j, p, rtest, m, k; int nerrs, nsrcerrs; void *buf; unsigned int decode_index[MMAX]; - unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES]; - unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls; + unsigned char *temp_buffs[TEST_SOURCES] = { NULL }, *buffs[TEST_SOURCES] = { NULL }; + unsigned char *encode_matrix = NULL, *decode_matrix = NULL, *invert_matrix = + NULL, *g_tbls = NULL; unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES]; unsigned char *recov[TEST_SOURCES]; @@ -238,7 +240,7 @@ int main(int argc, char *argv[]) for (i = 0; i < TEST_SOURCES; i++) { if (posix_memalign(&buf, 64, TEST_LEN)) { printf("alloc error: Fail"); - return -1; + goto exit; } buffs[i] = buf; } @@ -246,7 +248,7 @@ int main(int argc, char *argv[]) for (i = 0; i < TEST_SOURCES; i++) { if (posix_memalign(&buf, 64, TEST_LEN)) { printf("alloc error: Fail"); - return -1; + goto exit; } temp_buffs[i] = buf; } @@ -260,13 +262,12 @@ int main(int argc, char *argv[]) if (encode_matrix == NULL || decode_matrix == NULL || invert_matrix == NULL || g_tbls == NULL) { printf("Test failure! Error with malloc\n"); - return -1; + goto exit; } // Pick a first test m = 9; k = 5; - if (m > MMAX || k > KMAX) - return -1; + assert((m <= MMAX) && (k <= KMAX)); // Make random data for (i = 0; i < k; i++) @@ -295,7 +296,7 @@ int main(int argc, char *argv[]) nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -327,15 +328,18 @@ int main(int argc, char *argv[]) dump(temp_buffs[k + i], 25); printf("orig :"); dump(buffs[src_err_list[i]], 25); - return -1; + re = -1; + goto exit; } } // Pick a first test m = 9; k = 5; - if (m > MMAX || k > KMAX) - return -1; + if (m > MMAX || k > KMAX) { + re = -1; + goto exit; + } // Make random data for (i = 0; i < k; i++) @@ -363,7 +367,7 @@ int main(int argc, char *argv[]) nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -395,7 +399,8 @@ int main(int argc, char *argv[]) dump(temp_buffs[k + i], 25); printf("orig :"); dump(buffs[src_err_list[i]], 25); - return -1; + re = -1; + goto exit; } } @@ -433,7 +438,7 @@ int main(int argc, char *argv[]) src_in_err, nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -468,22 +473,29 @@ int main(int argc, char *argv[]) dump(buffs[src_err_list[i]], 25); printf("recov %d:", src_err_list[i]); dump(temp_buffs[k + i], 25); - return -1; + re = -1; + goto exit; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Run tests at end of buffer for Electric Fence k = 16; align = (LEN_ALIGN_CHK_B != 0) ? 
1 : 16; - if (k > KMAX) - return -1; + if (k > KMAX) { + re = -1; + goto exit; + } for (rows = 1; rows <= 16; rows++) { m = k + rows; - if (m > MMAX) - return -1; + if (m > MMAX) { + re = -1; + goto exit; + } // Make random data for (i = 0; i < k; i++) @@ -516,7 +528,7 @@ int main(int argc, char *argv[]) src_in_err, nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -556,7 +568,8 @@ int main(int argc, char *argv[]) dump(temp_buffs[k + i], align); printf("orig :"); dump(efence_buffs[src_err_list[i]], align); - return -1; + re = -1; + goto exit; } } } @@ -608,7 +621,7 @@ int main(int argc, char *argv[]) src_in_err, nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -643,7 +656,8 @@ int main(int argc, char *argv[]) dump(ubuffs[src_err_list[i]], 25); printf("recov %d:", src_err_list[i]); dump(temp_ubuffs[k + i], 25); - return -1; + re = -1; + goto exit; } } @@ -656,13 +670,15 @@ int main(int argc, char *argv[]) if (memcmp(buffs[i], temp_buffs[0], offset)) { printf("Fail rand ualign encode pad start\n"); - return -1; + re = -1; + goto exit; } if (memcmp (buffs[i] + offset + size, temp_buffs[0], PTR_ALIGN_CHK_B - offset)) { printf("Fail rand ualign encode pad end\n"); - return -1; + re = -1; + goto exit; } } @@ -671,17 +687,21 @@ int main(int argc, char *argv[]) offset = temp_ubuffs[k + i] - temp_buffs[k + i]; if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) { printf("Fail rand ualign decode pad start\n"); - return -1; + re = -1; + goto exit; } if (memcmp (temp_buffs[k + i] + offset + size, temp_buffs[0], PTR_ALIGN_CHK_B - offset)) { printf("Fail rand ualign decode pad end\n"); - return -1; + re = -1; + goto exit; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Test size alignment @@ -719,7 +739,7 @@ int main(int argc, char *argv[]) src_in_err, nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -754,11 +774,26 @@ int main(int argc, char *argv[]) dump(buffs[src_err_list[i]], 25); printf("recov %d:", src_err_list[i]); dump(temp_buffs[k + i], 25); - return -1; + re = -1; + goto exit; } } } printf("done EC tests: Pass\n"); - return 0; + re = 0; + + exit: + for (i = 0; i < TEST_SOURCES; i++) { + if (buffs[i]) + aligned_free(buffs[i]); + if (temp_buffs[i]) + aligned_free(temp_buffs[i]); + } + free(encode_matrix); + free(decode_matrix); + free(invert_matrix); + free(g_tbls); + + return re; } diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_test.patch b/contrib/libs/isa-l/erasure_code/erasure_code_test.patch deleted file mode 100644 index 0bf88ff23b..0000000000 --- a/contrib/libs/isa-l/erasure_code/erasure_code_test.patch +++ /dev/null @@ -1,8 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" -205c205 -< s ^= gf_mul_erasure(invert_matrix[j * k + i], ---- -> s ^= gf_mul(invert_matrix[j * k + i], diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c b/contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c index 909e894149..e74a217cb3 100644 --- a/contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c +++ b/contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c @@ -31,7 +31,6 @@ #include 
<stdlib.h> #include <string.h> // for memset, memcmp #include "erasure_code.h" -#include "types.h" #include "test.h" //By default, test multibinary version @@ -48,22 +47,25 @@ #define str(s) #s #define xstr(s) str(s) -//#define CACHED_TEST -#ifdef CACHED_TEST +#ifndef GT_L3_CACHE +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +#endif + +#if !defined(COLD_TEST) && !defined(TEST_CUSTOM) // Cached test, loop many times over small dataset # define TEST_SOURCES 32 # define TEST_LEN(m) ((128*1024 / m) & ~(64-1)) # define TEST_TYPE_STR "_warm" -#else -# ifndef TEST_CUSTOM +#elif defined (COLD_TEST) // Uncached test. Pull from large mem base. -# define TEST_SOURCES 32 -# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ -# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1)) -# define TEST_TYPE_STR "_cold" -# else -# define TEST_TYPE_STR "_cus" -# endif +# define TEST_SOURCES 32 +# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1)) +# define TEST_TYPE_STR "_cold" +#elif defined (TEST_CUSTOM) +# define TEST_TYPE_STR "_cus" +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 #endif #define MMAX TEST_SOURCES @@ -71,6 +73,17 @@ typedef unsigned char u8; +void usage(const char *app_name) +{ + fprintf(stderr, + "Usage: %s [options]\n" + " -h Help\n" + " -k <val> Number of source buffers\n" + " -p <val> Number of parity buffers\n" + " -e <val> Number of simulated buffers with errors (cannot be higher than p or k)\n", + app_name); +} + void dump(unsigned char *buf, int len) { int i; @@ -134,29 +147,103 @@ int decode_test(int m, int k, u8 ** update_buffs, u8 ** recov, u8 * a, u8 * src_ int main(int argc, char *argv[]) { - int i, j, check, m, k, nerrs; + int i, j, check, m, k, p, nerrs, ret = -1; void *buf; - u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES]; - u8 *update_buffs[TEST_SOURCES]; - u8 *perf_update_buffs[TEST_SOURCES]; + u8 *temp_buffs[TEST_SOURCES] = { NULL }; + u8 *buffs[TEST_SOURCES] = { NULL }; + u8 *update_buffs[TEST_SOURCES] = { NULL }; + u8 *perf_update_buffs[TEST_SOURCES] = { NULL }; u8 a[MMAX * KMAX]; u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES]; u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES]; struct perf start; - // Pick test parameters + /* Set default parameters */ k = 10; - m = k + VECT; + p = VECT; nerrs = VECT; - const u8 err_list[] = { 0, 2, 4, 5, 7, 8 }; - printf(xstr(FUNCTION_UNDER_TEST) "_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs); + /* Parse arguments */ + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-k") == 0) { + k = atoi(argv[++i]); + } else if (strcmp(argv[i], "-p") == 0) { + p = atoi(argv[++i]); + } else if (strcmp(argv[i], "-e") == 0) { + nerrs = atoi(argv[++i]); + } else if (strcmp(argv[i], "-h") == 0) { + usage(argv[0]); + return 0; + } else { + usage(argv[0]); + return -1; + } + } + + if (nerrs > k) { + printf + ("Number of errors (%d) cannot be higher than number of data buffers (%d)\n", + nerrs, k); + return -1; + } + + if (k <= 0) { + printf("Number of source buffers (%d) must be > 0\n", k); + return -1; + } + + if (p <= 0) { + printf("Number of parity buffers (%d) must be > 0\n", p); + return -1; + } + + if (nerrs > p) { + printf + ("Number of errors (%d) cannot be higher than number of parity buffers (%d)\n", + nerrs, p); + return -1; + } + + if (nerrs <= 0) { + printf("Number of errors (%d) must be > 0\n", nerrs); + return -1; + } + + m = k + p; - if (m > MMAX || k > KMAX || nerrs > (m - k)) { - printf(" Input test parameter error\n"); + if (m > MMAX) { + printf("Number of total buffers (data 
and parity) cannot be higher than %d\n", + MMAX); return -1; } + u8 *err_list = malloc((size_t)nerrs); + if (err_list == NULL) { + printf("Error allocating list of array of error indices\n"); + return -1; + } + + srand(TEST_SEED); + + for (i = 0; i < nerrs;) { + u8 next_err = rand() % k; + for (j = 0; j < i; j++) + if (next_err == err_list[j]) + break; + if (j != i) + continue; + err_list[i++] = next_err; + } + + printf("Testing with %u data buffers and %u parity buffers (num errors = %u, in [ ", k, + p, nerrs); + for (i = 0; i < nerrs; i++) + printf("%d ", err_list[i]); + + printf("])\n"); + + printf(xstr(FUNCTION_UNDER_TEST) "_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs); + memcpy(src_err_list, err_list, nerrs); memset(src_in_err, 0, TEST_SOURCES); for (i = 0; i < nerrs; i++) @@ -165,16 +252,16 @@ int main(int argc, char *argv[]) // Allocate the arrays for (i = 0; i < m; i++) { if (posix_memalign(&buf, 64, TEST_LEN(m))) { - printf("alloc error: Fail\n"); - return -1; + printf("Error allocating buffers\n"); + goto exit; } buffs[i] = buf; } for (i = 0; i < (m - k); i++) { if (posix_memalign(&buf, 64, TEST_LEN(m))) { - printf("alloc error: Fail\n"); - return -1; + printf("Error allocating buffers\n"); + goto exit; } temp_buffs[i] = buf; memset(temp_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function @@ -182,16 +269,16 @@ int main(int argc, char *argv[]) for (i = 0; i < TEST_SOURCES; i++) { if (posix_memalign(&buf, 64, TEST_LEN(m))) { - printf("alloc error: Fail"); - return -1; + printf("Error allocating buffers\n"); + goto exit; } update_buffs[i] = buf; memset(update_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function } for (i = 0; i < TEST_SOURCES; i++) { if (posix_memalign(&buf, 64, TEST_LEN(m))) { - printf("alloc error: Fail"); - return -1; + printf("Error allocating buffers\n"); + goto exit; } perf_update_buffs[i] = buf; memset(perf_update_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function @@ -214,7 +301,7 @@ int main(int argc, char *argv[]) dump(update_buffs[k + i], 25); printf("buffs%d :", i); dump(buffs[k + i], 25); - return -1; + goto exit; } } @@ -263,13 +350,14 @@ int main(int argc, char *argv[]) nerrs, g_tbls, perf_update_buffs)); if (check) { printf("BAD_MATRIX\n"); - return -1; + ret = check; + goto exit; } for (i = 0; i < nerrs; i++) { if (0 != memcmp(temp_buffs[i], update_buffs[src_err_list[i]], TEST_LEN(m))) { printf("Fail error recovery (%d, %d, %d) - \n", m, k, nerrs); - return -1; + goto exit; } } @@ -277,5 +365,16 @@ int main(int argc, char *argv[]) perf_print(start, (long long)(TEST_LEN(m)) * (k + nerrs)); printf("done all: Pass\n"); - return 0; + + ret = 0; + + exit: + free(err_list); + for (i = 0; i < TEST_SOURCES; i++) { + free(buffs[i]); + free(temp_buffs[i]); + free(update_buffs[i]); + free(perf_update_buffs[i]); + } + return ret; } diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_update_test.c b/contrib/libs/isa-l/erasure_code/erasure_code_update_test.c index 9014da7890..b13485cd72 100644 --- a/contrib/libs/isa-l/erasure_code/erasure_code_update_test.c +++ b/contrib/libs/isa-l/erasure_code/erasure_code_update_test.c @@ -30,8 +30,9 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> // for memset, memcmp +#include <assert.h> #include "erasure_code.h" -// #include "types.h" +#include "test.h" #ifndef ALIGN_SIZE # define ALIGN_SIZE 16 @@ -227,14 +228,15 @@ static int gf_gen_decode_matrix(unsigned char *encode_matrix, int 
main(int argc, char *argv[]) { - int re = 0; + int re = -1; int i, j, p, rtest, m, k; int nerrs, nsrcerrs; void *buf; unsigned int decode_index[MMAX]; - unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES]; - unsigned char *update_buffs[TEST_SOURCES]; - unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls; + unsigned char *temp_buffs[TEST_SOURCES] = { NULL }, *buffs[TEST_SOURCES] = { NULL }; + unsigned char *update_buffs[TEST_SOURCES] = { NULL }; + unsigned char *encode_matrix = NULL, *decode_matrix = NULL, *invert_matrix = + NULL, *g_tbls = NULL; unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES]; unsigned char *recov[TEST_SOURCES]; @@ -253,7 +255,7 @@ int main(int argc, char *argv[]) for (i = 0; i < TEST_SOURCES; i++) { if (posix_memalign(&buf, 64, TEST_LEN)) { printf("alloc error: Fail"); - return -1; + goto exit; } buffs[i] = buf; } @@ -261,7 +263,7 @@ int main(int argc, char *argv[]) for (i = 0; i < TEST_SOURCES; i++) { if (posix_memalign(&buf, 64, TEST_LEN)) { printf("alloc error: Fail"); - return -1; + goto exit; } temp_buffs[i] = buf; memset(temp_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function @@ -270,7 +272,7 @@ int main(int argc, char *argv[]) for (i = 0; i < TEST_SOURCES; i++) { if (posix_memalign(&buf, 64, TEST_LEN)) { printf("alloc error: Fail"); - return -1; + goto exit; } update_buffs[i] = buf; memset(update_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function @@ -284,13 +286,12 @@ int main(int argc, char *argv[]) if (encode_matrix == NULL || decode_matrix == NULL || invert_matrix == NULL || g_tbls == NULL) { printf("Test failure! Error with malloc\n"); - return -1; + goto exit; } // Pick a first test m = 14; k = 10; - if (m > MMAX || k > KMAX) - return -1; + assert(!(m > MMAX || k > KMAX)); // Make random data for (i = 0; i < k; i++) { @@ -321,7 +322,7 @@ int main(int argc, char *argv[]) dump(update_buffs[k + i], 25); printf("buffs%d :", i); dump(buffs[k + i], 25); - return -1; + goto exit; } } @@ -335,7 +336,7 @@ int main(int argc, char *argv[]) nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -367,16 +368,21 @@ int main(int argc, char *argv[]) dump(temp_buffs[k + i], 25); printf("orig :"); dump(update_buffs[src_err_list[i]], 25); - return -1; + re = -1; + goto exit; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif // Pick a first test m = 7; k = 5; - if (m > MMAX || k > KMAX) - return -1; + if (m > MMAX || k > KMAX) { + re = -1; + goto exit; + } // Zero the destination buffer for update function for (i = k; i < TEST_SOURCES; i++) { @@ -411,7 +417,8 @@ int main(int argc, char *argv[]) dump(update_buffs[k + i], 25); printf("buffs%d :", i); dump(buffs[k + i], 25); - return -1; + re = -1; + goto exit; } } @@ -425,7 +432,7 @@ int main(int argc, char *argv[]) nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -462,10 +469,13 @@ int main(int argc, char *argv[]) dump(temp_buffs[k + i], 25); printf("orig :"); dump(update_buffs[src_err_list[i]], 25); - return -1; + re = -1; + goto exit; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif // Do more random tests for (rtest = 0; rtest < RANDOMS; rtest++) { @@ -508,7 +518,8 @@ int main(int argc, char *argv[]) 
dump(update_buffs[k + i], 25); printf("buffs%d :", i); dump(buffs[k + i], 25); - return -1; + re = -1; + goto exit; } } @@ -522,7 +533,7 @@ int main(int argc, char *argv[]) src_in_err, nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -565,22 +576,29 @@ int main(int argc, char *argv[]) dump(update_buffs[src_err_list[i]], 25); printf("recov %d:", src_err_list[i]); dump(temp_buffs[k + i], 25); - return -1; + re = -1; + goto exit; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Run tests at end of buffer for Electric Fence k = 16; align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE; - if (k > KMAX) - return -1; + if (k > KMAX) { + re = -1; + goto exit; + } for (rows = 1; rows <= 16; rows++) { m = k + rows; - if (m > MMAX) - return -1; + if (m > MMAX) { + re = -1; + goto exit; + } for (i = k; i < TEST_SOURCES; i++) { memset(buffs[i], 0, TEST_LEN); @@ -628,7 +646,8 @@ int main(int argc, char *argv[]) dump(efence_update_buffs[k + i], 25); printf("efence_buffs%d :", i); dump(efence_buffs[k + i], 25); - return -1; + re = -1; + goto exit; } } @@ -642,7 +661,7 @@ int main(int argc, char *argv[]) src_in_err, nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -688,11 +707,14 @@ int main(int argc, char *argv[]) dump(temp_buffs[k + i], align); printf("orig :"); dump(efence_update_buffs[src_err_list[i]], align); - return -1; + re = 1; + goto exit; } } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } @@ -752,7 +774,8 @@ int main(int argc, char *argv[]) dump(update_ubuffs[k + i], 25); printf("ubuffs%d :", i); dump(ubuffs[k + i], 25); - return -1; + re = -1; + goto exit; } } @@ -766,7 +789,7 @@ int main(int argc, char *argv[]) src_in_err, nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -808,7 +831,8 @@ int main(int argc, char *argv[]) dump(update_ubuffs[src_err_list[i]], 25); printf("recov %d:", src_err_list[i]); dump(temp_ubuffs[k + i], 25); - return -1; + re = -1; + goto exit; } } @@ -821,13 +845,15 @@ int main(int argc, char *argv[]) if (memcmp(update_buffs[i], temp_buffs[0], offset)) { printf("Fail rand ualign encode pad start\n"); - return -1; + re = -1; + goto exit; } if (memcmp (update_buffs[i] + offset + size, temp_buffs[0], PTR_ALIGN_CHK_B - offset)) { printf("Fail rand ualign encode pad end\n"); - return -1; + re = -1; + goto exit; } } @@ -836,17 +862,21 @@ int main(int argc, char *argv[]) offset = temp_ubuffs[k + i] - temp_buffs[k + i]; if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) { printf("Fail rand ualign decode pad start\n"); - return -1; + re = -1; + goto exit; } if (memcmp (temp_buffs[k + i] + offset + size, temp_buffs[0], PTR_ALIGN_CHK_B - offset)) { printf("Fail rand ualign decode pad end\n"); - return -1; + re = -1; + goto exit; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Test size alignment @@ -893,7 +923,8 @@ int main(int argc, char *argv[]) dump(update_buffs[k + i], 25); printf("buffs%d (size=%d) :", i, size); dump(buffs[k + i], 25); - return -1; + re = -1; + goto exit; } } @@ -906,7 +937,7 @@ int main(int argc, char *argv[]) src_in_err, nerrs, nsrcerrs, k, m); if (re != 0) { printf("Fail to gf_gen_decode_matrix\n"); - 
return -1; + goto exit; } // Pack recovery array as list of valid sources // Its order must be the same as the order @@ -948,12 +979,30 @@ int main(int argc, char *argv[]) dump(update_buffs[src_err_list[i]], 25); printf("recov %d:", src_err_list[i]); dump(temp_buffs[k + i], 25); - return -1; + re = -1; + goto exit; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } printf("done EC tests: Pass\n"); + re = 0; + + exit: + for (i = 0; i < TEST_SOURCES; i++) { + if (buffs[i]) + aligned_free(buffs[i]); + if (temp_buffs[i]) + aligned_free(temp_buffs[i]); + if (update_buffs[i]) + aligned_free(update_buffs[i]); + } + free(encode_matrix); + free(decode_matrix); + free(invert_matrix); + free(g_tbls); return 0; } diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_update_test.patch b/contrib/libs/isa-l/erasure_code/erasure_code_update_test.patch deleted file mode 100644 index 3726f2d805..0000000000 --- a/contrib/libs/isa-l/erasure_code/erasure_code_update_test.patch +++ /dev/null @@ -1,8 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" -217c217 -< s ^= gf_mul_erasure(invert_matrix[j * k + i], ---- -> s ^= gf_mul(invert_matrix[j * k + i], diff --git a/contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c b/contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c index 85061484bc..18a559088d 100644 --- a/contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c +++ b/contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c @@ -9,7 +9,7 @@ #define ROWS M_MAX #define COLS K_MAX -static inline int min(int a, int b) +static inline uint64_t min(const uint64_t a, const uint64_t b) { if (a <= b) return a; @@ -17,10 +17,11 @@ static inline int min(int a, int b) return b; } -void gen_sub_matrix(unsigned char *out_matrix, int dim, unsigned char *in_matrix, int rows, - int cols, uint64_t row_indicator, uint64_t col_indicator) +void gen_sub_matrix(unsigned char *out_matrix, const uint64_t dim, unsigned char *in_matrix, + const uint64_t rows, const uint64_t cols, const uint64_t row_indicator, + const uint64_t col_indicator) { - int i, j, r, s; + uint64_t i, j, r, s; for (i = 0, r = 0; i < rows; i++) { if (!(row_indicator & ((uint64_t) 1 << i))) @@ -51,23 +52,23 @@ uint64_t next_subset(uint64_t * subset, uint64_t element_count, uint64_t subsize return 0; } -int are_submatrices_singular(unsigned char *vmatrix, int rows, int cols) +int are_submatrices_singular(unsigned char *vmatrix, const uint64_t rows, const uint64_t cols) { unsigned char matrix[COLS * COLS]; unsigned char invert_matrix[COLS * COLS]; - uint64_t row_indicator, col_indicator, subset_init, subsize; + uint64_t subsize; /* Check all square subsize x subsize submatrices of the rows x cols * vmatrix for singularity*/ for (subsize = 1; subsize <= min(rows, cols); subsize++) { - subset_init = (1 << subsize) - 1; - col_indicator = subset_init; + const uint64_t subset_init = (1ULL << subsize) - 1ULL; + uint64_t col_indicator = subset_init; do { - row_indicator = subset_init; + uint64_t row_indicator = subset_init; do { gen_sub_matrix(matrix, subsize, vmatrix, rows, cols, row_indicator, col_indicator); - if (gf_invert_matrix(matrix, invert_matrix, subsize)) + if (gf_invert_matrix(matrix, invert_matrix, (int)subsize)) return 1; } while (next_subset(&row_indicator, rows, subsize) == 0); @@ -80,7 +81,7 @@ int are_submatrices_singular(unsigned char *vmatrix, int rows, int cols) int main(int argc, char **argv) { unsigned char vmatrix[(ROWS + COLS) * COLS]; - int rows, cols; + uint64_t rows, cols; if (K_MAX > MAX_CHECK) { printf("K_MAX too large for 
this test\n"); @@ -108,7 +109,7 @@ int main(int argc, char **argv) break; } - printf(" k = %2d, m <= %2d \n", cols, rows + cols - 1); + printf(" k = %2u, m <= %2u \n", (unsigned)cols, (unsigned)(rows + cols - 1)); } return 0; diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm index 6233d42e5d..b5dcb0e112 100644 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm @@ -52,7 +52,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 %endmacro @@ -84,9 +84,9 @@ %define func(x) proc_frame x %macro FUNC_SAVE 0 alloc_stack stack_size - save_xmm128 xmm6, 0*16 - save_xmm128 xmm7, 1*16 - save_xmm128 xmm8, 2*16 + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 save_reg r12, 3*16 + 0*8 save_reg r13, 3*16 + 1*8 save_reg r14, 3*16 + 2*8 @@ -127,7 +127,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define var(x) [ebp - PS - PS*x] @@ -238,13 +238,9 @@ section .text %endif align 16 -global gf_2vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -func(gf_2vect_dot_prod_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_2vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_2vect_dot_prod_avx) -%endif +global gf_2vect_dot_prod_avx, function +func(gf_2vect_dot_prod_avx) FUNC_SAVE SLDR len, len_m sub len, 16 @@ -336,6 +332,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_2vect_dot_prod_avx, 02, 05, 0191 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.patch deleted file mode 100644 index bca96af58e..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.patch +++ /dev/null @@ -1,8 +0,0 @@ -242,246d241 -< func(gf_2vect_dot_prod_avx) -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_2vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_2vect_dot_prod_avx) -< %endif -247a243 -> func(gf_2vect_dot_prod_avx) diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm index 53052d56e0..3d13300528 100644 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm @@ -54,7 +54,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 %endmacro @@ -131,7 +131,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define var(x) [ebp - PS - PS*x] @@ -248,13 +248,9 @@ section .text %endif align 16 -global gf_2vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -func(gf_2vect_dot_prod_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_2vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_2vect_dot_prod_avx2) -%endif +global gf_2vect_dot_prod_avx2, function +func(gf_2vect_dot_prod_avx2) FUNC_SAVE SLDR len, len_m sub len, 32 @@ -353,8 +349,3 @@ func(_gf_2vect_dot_prod_avx2) ret endproc_frame - -section .data - -;;; func core, ver, snum -slversion gf_2vect_dot_prod_avx2, 04, 05, 0196 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.patch deleted file mode 100644 index cee2de5a58..0000000000 --- 
a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.patch +++ /dev/null @@ -1,8 +0,0 @@ -252,256d251 -< func(gf_2vect_dot_prod_avx2) -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_2vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_2vect_dot_prod_avx2) -< %endif -257a253 -> func(gf_2vect_dot_prod_avx2) diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2_gfni.asm new file mode 100644 index 0000000000..bdf03442e0 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2_gfni.asm @@ -0,0 +1,362 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_2vect_dot_prod_avx2_gfni(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" +%include "memcpy.asm" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r12 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + + %define stack_size 3*8 + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + sub rsp, stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + mov [rsp + 2*8], r14 + %endmacro + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + mov r14, [rsp + 2*8] + add rsp, stack_size + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r14 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define stack_size 7*16 + 5*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + mov [rsp + 7*16 + 0*8], r12 + mov [rsp + 7*16 + 1*8], r13 + mov [rsp + 7*16 + 2*8], r14 + mov [rsp + 7*16 + 3*8], r15 + mov [rsp + 7*16 + 4*8], rdi + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + mov r12, [rsp + 7*16 + 0*8] + mov r13, [rsp + 7*16 + 1*8] + mov r14, [rsp + 7*16 + 2*8] + mov r15, [rsp + 7*16 + 3*8] + mov rdi, [rsp + 7*16 + 4*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define dest1 tmp5 +%define pos rax + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define x0l ymm0 +%define x0h ymm1 +%define x0x ymm2 + +%define xgft1 ymm3 +%define xgft2 ymm4 + +%define xtmp1 ymm5 +%define xtmp2 ymm6 + +%define xp1l ymm7 +%define xp2l ymm8 + +%define xp1h ymm9 +%define xp2h ymm10 + +%define xp1x ymm11 +%define xp2x ymm12 + +%define x0 x0l +%define xp1 xp1l +%define xp2 xp2l + +default rel +[bits 64] + +section .text + +;; +;; Encodes 96 bytes of all "k" sources into 2x 96 bytes (parity disk) +;; +%macro ENCODE_96B_2 0 + vpxor xp1l, xp1l, xp1l + vpxor xp1h, xp1h, xp1h + vpxor xp1x, xp1x, xp1x + + vpxor xp2l, xp2l, xp2l + vpxor xp2h, xp2h, xp2h + vpxor xp2x, xp2x, xp2x + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + ;; load next source vector + mov ptr, [src + vec_i] + XLDR x0l, [ptr + pos] + 
XLDR x0h, [ptr + pos + 32] + XLDR x0x, [ptr + pos + 64] + add vec_i, 8 + + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + + GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l + GF_MUL_XOR VEX, x0h, xgft1, xtmp1, xp1h, xgft2, xtmp2, xp2h + GF_MUL_XOR VEX, x0x, xgft1, xtmp1, xp1x, xgft2, xtmp2, xp2x + add tmp, 8 + + cmp vec_i, vec + jl %%next_vect + + XSTR [dest1 + pos], xp1l + XSTR [dest1 + pos + 32], xp1h + XSTR [dest1 + pos + 64], xp1x + XSTR [dest2 + pos], xp2l + XSTR [dest2 + pos + 32], xp2h + XSTR [dest2 + pos + 64], xp2x +%endmacro + +;; +;; Encodes 64 bytes of all "k" sources into 2x 64 bytes (parity disks) +;; +%macro ENCODE_64B_2 0 + vpxor xp1l, xp1l, xp1l + vpxor xp1h, xp1h, xp1h + vpxor xp2l, xp2l, xp2l + vpxor xp2h, xp2h, xp2h + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] + XLDR x0l, [ptr + pos] ;; Get next source vector low 32 bytes + XLDR x0h, [ptr + pos + 32] ;; Get next source vector high 32 bytes + add vec_i, 8 + + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + add tmp, 8 + + GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l + GF_MUL_XOR VEX, x0h, xgft1, xgft1, xp1h, xgft2, xgft2, xp2h + + cmp vec_i, vec + jl %%next_vect + + XSTR [dest1 + pos], xp1l + XSTR [dest1 + pos + 32], xp1h + XSTR [dest2 + pos], xp2l + XSTR [dest2 + pos + 32], xp2h +%endmacro + +;; +;; Encodes 32 bytes of all "k" sources into 2x 32 bytes (parity disks) +;; +%macro ENCODE_32B_2 0 + vpxor xp1, xp1, xp1 + vpxor xp2, xp2, xp2 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] + XLDR x0, [ptr + pos] ;Get next source vector (32 bytes) + add vec_i, 8 + + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + add tmp, 8 + + GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2 + + cmp vec_i, vec + jl %%next_vect + + XSTR [dest1 + pos], xp1 + XSTR [dest2 + pos], xp2 +%endmacro + +;; +;; Encodes less than 32 bytes of all "k" sources into 2 parity disks +;; +%macro ENCODE_LT_32B_2 1 +%define %%LEN %1 + + vpxor xp1, xp1, xp1 + vpxor xp2, xp2, xp2 + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] + simd_load_avx2 x0, ptr + pos, %%LEN, tmp, tmp4 ;Get next source vector + add vec_i, 8 + + vbroadcastsd xgft1, [mul_array] + vbroadcastsd xgft2, [mul_array + vec] + add mul_array, 8 + + GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2 + + cmp vec_i, vec + jl %%next_vect + + ;Store updated encoded data + lea ptr, [dest1 + pos] + simd_store_avx2 ptr, xp1, %%LEN, tmp, tmp4 + + lea ptr, [dest2 + pos] + simd_store_avx2 ptr, xp2, %%LEN, tmp, tmp4 +%endmacro + +align 16 +global gf_2vect_dot_prod_avx2_gfni, function +func(gf_2vect_dot_prod_avx2_gfni) + FUNC_SAVE + + xor pos, pos + shl vec, 3 ;; vec *= 8. 
Make vec_i count by 8 + mov dest1, [dest] + mov dest2, [dest + 8] + + cmp len, 96 + jl .len_lt_96 + +.loop96: + ENCODE_96B_2 + + add pos, 96 ;; Loop on 96 bytes at a time first + sub len, 96 + cmp len, 96 + jge .loop96 + +.len_lt_96: + cmp len, 64 + jl .len_lt_64 + + ENCODE_64B_2 + + add pos, 64 ;; encode next 64 bytes + sub len, 64 + +.len_lt_64: + cmp len, 32 + jl .len_lt_32 + + ENCODE_32B_2 + + add pos, 32 ;; encode next 32 bytes + sub len, 32 + +.len_lt_32: + cmp len, 0 + jle .exit + + ENCODE_LT_32B_2 len ;; encode remaining bytes + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm index 0fe2f434a1..1593c963a3 100644 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm @@ -50,7 +50,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 %endmacro @@ -73,7 +73,7 @@ %define return rax %define PS 8 %define LOG_PS 3 - %define stack_size 9*16 + 5*8 ; must be an odd multiple of 8 + %define stack_size 3*16 + 3*8 ; must be an odd multiple of 8 %define arg(x) [rsp + stack_size + PS + PS*x] %define func(x) proc_frame x @@ -82,16 +82,9 @@ vmovdqa [rsp + 0*16], xmm6 vmovdqa [rsp + 1*16], xmm7 vmovdqa [rsp + 2*16], xmm8 - vmovdqa [rsp + 3*16], xmm9 - vmovdqa [rsp + 4*16], xmm10 - vmovdqa [rsp + 5*16], xmm11 - vmovdqa [rsp + 6*16], xmm12 - vmovdqa [rsp + 7*16], xmm13 - vmovdqa [rsp + 8*16], xmm14 - save_reg r12, 9*16 + 0*8 - save_reg r13, 9*16 + 1*8 - save_reg r14, 9*16 + 2*8 - save_reg r15, 9*16 + 3*8 + save_reg r12, 3*16 + 0*8 + save_reg r13, 3*16 + 1*8 + save_reg r15, 3*16 + 2*8 end_prolog mov arg4, arg(4) %endmacro @@ -100,16 +93,9 @@ vmovdqa xmm6, [rsp + 0*16] vmovdqa xmm7, [rsp + 1*16] vmovdqa xmm8, [rsp + 2*16] - vmovdqa xmm9, [rsp + 3*16] - vmovdqa xmm10, [rsp + 4*16] - vmovdqa xmm11, [rsp + 5*16] - vmovdqa xmm12, [rsp + 6*16] - vmovdqa xmm13, [rsp + 7*16] - vmovdqa xmm14, [rsp + 8*16] - mov r12, [rsp + 9*16 + 0*8] - mov r13, [rsp + 9*16 + 1*8] - mov r14, [rsp + 9*16 + 2*8] - mov r15, [rsp + 9*16 + 3*8] + mov r12, [rsp + 3*16 + 0*8] + mov r13, [rsp + 3*16 + 1*8] + mov r15, [rsp + 3*16 + 2*8] add rsp, stack_size %endmacro %endif @@ -133,8 +119,8 @@ %else ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST - %define XLDR vmovdqa - %define XSTR vmovdqa + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 %else %define XLDR vmovntdqa %define XSTR vmovntdq @@ -160,13 +146,8 @@ default rel section .text align 16 -global gf_2vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION +global gf_2vect_dot_prod_avx512, function func(gf_2vect_dot_prod_avx512) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_2vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION -func(_gf_2vect_dot_prod_avx512) -%endif - FUNC_SAVE sub len, 64 jl .return_fail diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.patch deleted file mode 100644 index b00998d4b6..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.patch +++ /dev/null @@ -1,6 +0,0 @@ -165,169d164 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_2vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION -< func(_gf_2vect_dot_prod_avx512) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512_gfni.asm 
b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512_gfni.asm new file mode 100644 index 0000000000..33967b2928 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512_gfni.asm @@ -0,0 +1,209 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_2vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 ; must be saved and restored + + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + push r12 + %endmacro + %macro FUNC_RESTORE 0 + pop r12 + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r14 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define stack_size 3*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + mov [rsp + 2*8], r14 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + mov r14, [rsp + 2*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define pos rax + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define xgft1 zmm3 +%define xgft2 zmm4 + +%define x0 zmm0 +%define xp1 zmm1 +%define xp2 zmm2 + +default rel +[bits 64] + +section .text + +;; +;; Encodes 64 bytes of all "k" sources into 2x 64 bytes (parity disks) +;; +%macro ENCODE_64B_2 0-1 +%define %%KMASK %1 + + vpxorq xp1, xp1, xp1 + vpxorq xp2, xp2, xp2 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes) +%else + XLDR x0, [ptr + pos] ;Get next source vector (64 bytes) +%endif + add vec_i, 8 + + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + add tmp, 8 + + GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2 + + cmp vec_i, vec + jl %%next_vect + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xp2 +%else + XSTR [dest1 + pos], xp1 + XSTR [dest2 + pos], xp2 +%endif +%endmacro + +align 16 +global gf_2vect_dot_prod_avx512_gfni, function +func(gf_2vect_dot_prod_avx512_gfni) + FUNC_SAVE + + xor pos, pos + shl vec, 3 ;vec *= 8. 
Make vec_i count by 8 + mov dest2, [dest1 + 8] + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 + +.loop64: + + ENCODE_64B_2 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_2 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm index ad61093471..986160204d 100644 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm @@ -52,7 +52,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 %endmacro @@ -127,7 +127,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define var(x) [ebp - PS - PS*x] @@ -238,13 +238,9 @@ section .text %endif align 16 -global gf_2vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -func(gf_2vect_dot_prod_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_2vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_2vect_dot_prod_sse) -%endif +global gf_2vect_dot_prod_sse, function +func(gf_2vect_dot_prod_sse) FUNC_SAVE SLDR len, len_m sub len, 16 @@ -338,6 +334,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_2vect_dot_prod_sse, 00, 04, 0062 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.patch deleted file mode 100644 index 439a2b1ac9..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.patch +++ /dev/null @@ -1,8 +0,0 @@ -242,246d241 -< func(gf_2vect_dot_prod_sse) -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_2vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_2vect_dot_prod_sse) -< %endif -247a243 -> func(gf_2vect_dot_prod_sse) diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.c deleted file mode 100644 index 406183bc30..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.c +++ /dev/null @@ -1,480 +0,0 @@ -/********************************************************************** - Copyright(c) 2011-2015 Intel Corporation All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> // for memset, memcmp -#include "erasure_code.h" -// #include "types.h" - -#ifndef FUNCTION_UNDER_TEST -# define FUNCTION_UNDER_TEST gf_2vect_dot_prod_sse -#endif -#ifndef TEST_MIN_SIZE -# define TEST_MIN_SIZE 16 -#endif - -#define str(s) #s -#define xstr(s) str(s) - -#define TEST_LEN 2048 -#define TEST_SIZE (TEST_LEN/2) -#define TEST_MEM TEST_SIZE -#define TEST_LOOPS 1000 -#define TEST_TYPE_STR "" - -#ifndef TEST_SOURCES -# define TEST_SOURCES 16 -#endif -#ifndef RANDOMS -# define RANDOMS 20 -#endif - -#ifdef EC_ALIGNED_ADDR -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 0 -# define LEN_ALIGN_CHK_B 0 // 0 for aligned only -#else -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 32 -# define LEN_ALIGN_CHK_B 32 // 0 for aligned only -#endif - -typedef unsigned char u8; - -extern void FUNCTION_UNDER_TEST(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - -void dump(unsigned char *buf, int len) -{ - int i; - for (i = 0; i < len;) { - printf(" %2x", 0xff & buf[i++]); - if (i % 32 == 0) - printf("\n"); - } - printf("\n"); -} - -void dump_matrix(unsigned char **s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", s[i][j]); - } - printf("\n"); - } - printf("\n"); -} - -void dump_u8xu8(unsigned char *s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", 0xff & s[j + (i * m)]); - } - printf("\n"); - } - printf("\n"); -} - -int main(int argc, char *argv[]) -{ - int i, j, rtest, srcs; - void *buf; - u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2 * TEST_SOURCES * 32]; - u8 *dest1, *dest2, *dest_ref1, *dest_ref2, *dest_ptrs[2]; - u8 *buffs[TEST_SOURCES]; - - int align, size; - unsigned char *efence_buffs[TEST_SOURCES]; - unsigned int offset; - u8 *ubuffs[TEST_SOURCES]; - u8 *udest_ptrs[2]; - - printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN); - - // Allocate the arrays - for (i = 0; i < TEST_SOURCES; i++) { - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - buffs[i] = buf; - } - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref2 = buf; - - dest_ptrs[0] = dest1; - dest_ptrs[1] = dest2; - - // Test of all zeros - for (i = 0; i < TEST_SOURCES; i++) - memset(buffs[i], 0, TEST_LEN); - - memset(dest1, 0, TEST_LEN); - memset(dest2, 0, TEST_LEN); - memset(dest_ref1, 0, TEST_LEN); - memset(dest_ref2, 0, TEST_LEN); 
- memset(g1, 2, TEST_SOURCES); - memset(g2, 1, TEST_SOURCES); - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs, - dest_ref2); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - - putchar('.'); - - // Rand data test - - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - buffs, dest_ref2); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - - putchar('.'); - } - - // Rand data test with varied parameters - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (srcs = TEST_SOURCES; srcs > 0; srcs--) { - for (i = 0; i < srcs; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs, - dest_ref2); - - FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test1 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test2 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - - putchar('.'); - } - } - - // Run tests at end of buffer for Electric Fence - align = (LEN_ALIGN_CHK_B != 0) ? 
1 : 16; - for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end - efence_buffs[i] = buffs[i] + TEST_LEN - size; - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - efence_buffs, dest_ref2); - - FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, align); - printf("dprod_dut:"); - dump(dest1, align); - return -1; - } - - if (0 != memcmp(dest_ref2, dest2, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, align); - printf("dprod_dut:"); - dump(dest2, align); - return -1; - } - - putchar('.'); - } - - // Test rand ptr alignment if available - - for (rtest = 0; rtest < RANDOMS; rtest++) { - size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1); - srcs = rand() % TEST_SOURCES; - if (srcs == 0) - continue; - - offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B; - // Add random offsets - for (i = 0; i < srcs; i++) - ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset)); - - udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset)); - - memset(dest1, 0, TEST_LEN); // zero pad to check write-over - memset(dest2, 0, TEST_LEN); - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - ubuffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs); - - if (memcmp(dest_ref1, udest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(udest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, udest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(udest_ptrs[1], 25); - return -1; - } - // Confirm that padding around dests is unchanged - memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff - offset = udest_ptrs[0] - dest1; - - if (memcmp(dest1, dest_ref1, offset)) { - printf("Fail rand ualign pad1 start\n"); - return -1; - } - if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad1 end\n"); - return -1; - } - - offset = udest_ptrs[1] - dest2; - if (memcmp(dest2, dest_ref1, offset)) { - printf("Fail 
rand ualign pad2 start\n"); - return -1; - } - if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad2 end\n"); - return -1; - } - - putchar('.'); - } - - // Test all size alignment - align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16; - - for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) { - srcs = TEST_SOURCES; - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs); - - if (memcmp(dest_ref1, dest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, dest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest_ptrs[1], 25); - return -1; - } - } - - printf("Pass\n"); - return 0; - -} diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.patch deleted file mode 100644 index 21bbfaa667..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm index 2d51dad33f..08e9a7f040 100644 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm @@ -97,7 +97,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -155,13 +155,9 @@ section .text align 16 -global gf_2vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -func(gf_2vect_mad_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_2vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_2vect_mad_avx) -%endif +global gf_2vect_mad_avx, function +func(gf_2vect_mad_avx) FUNC_SAVE sub len, 16 jl .return_fail @@ -235,6 +231,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_2vect_mad_avx, 02, 01, 0204 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.patch deleted file mode 100644 index b2bb2f2c3d..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.patch +++ /dev/null @@ -1,8 +0,0 @@ -159,163d158 -< func(gf_2vect_mad_avx) -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_2vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_2vect_mad_avx) -< %endif -164a160 -> func(gf_2vect_mad_avx) diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm index 2b0fd8ea2d..aa6a61c949 100644 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm @@ -104,7 +104,7 @@ %define return rax 
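The pattern repeated across these assembly hunks, `global name, function` plus `func(x) x: endbranch`, marks each entry point for Intel CET indirect-branch tracking: the kernels are reached through function pointers chosen at runtime, so each label needs an ENDBR64 marker. A hypothetical selection sketch for illustration (the real dispatch in isa-l is done by its multibinary stubs; the signature is taken from the extern declaration in the deleted test above, and gf_2vect_dot_prod_avx2 is assumed to share it):

/* Hypothetical runtime kernel selection, for illustration only. */
typedef void (*gf_2vect_dot_prod_fn)(int len, int vlen, unsigned char *gftbls,
                                     unsigned char **src, unsigned char **dest);

void gf_2vect_dot_prod_sse(int, int, unsigned char *, unsigned char **, unsigned char **);
void gf_2vect_dot_prod_avx2(int, int, unsigned char *, unsigned char **, unsigned char **);

static gf_2vect_dot_prod_fn pick_gf_2vect_dot_prod(int have_avx2)
{
        /* Indirect calls through pointers like this are why every asm label
         * gains `endbranch` when CET IBT is enabled. */
        return have_avx2 ? gf_2vect_dot_prod_avx2 : gf_2vect_dot_prod_sse;
}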
%define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -163,13 +163,9 @@ section .text %define xtmpd2 ymm9 align 16 -global gf_2vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -func(gf_2vect_mad_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_2vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_2vect_mad_avx2) -%endif +global gf_2vect_mad_avx2, function +func(gf_2vect_mad_avx2) FUNC_SAVE sub len, 32 jl .return_fail @@ -244,8 +240,3 @@ func(_gf_2vect_mad_avx2) ret endproc_frame - -section .data - -;;; func core, ver, snum -slversion gf_2vect_mad_avx2, 04, 01, 0205 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.patch deleted file mode 100644 index 6f00af6393..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.patch +++ /dev/null @@ -1,8 +0,0 @@ -167,171d166 -< func(gf_2vect_mad_avx2) -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_2vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_2vect_mad_avx2) -< %endif -172a168 -> func(gf_2vect_mad_avx2) diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2_gfni.asm new file mode 100644 index 0000000000..0445555419 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2_gfni.asm @@ -0,0 +1,298 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
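The new *_gfni kernels added below replace the nibble-table pshufb scheme with the GF2P8AFFINEQB instruction: each 8-byte entry of mul_array is broadcast (vbroadcastsd) and used as an 8x8 bit matrix that multiplies every source byte by one coefficient in a single instruction. A minimal intrinsics sketch of the GF_MUL_XOR step, assuming the table entries are already in the affine-matrix form the instruction expects (compile with GFNI and AVX2 enabled):

#include <immintrin.h>

/* dest ^= coeff * src over GF(2^8), 32 bytes per call (AVX2 + GFNI).
 * `affine` is the broadcast 8-byte matrix for one coefficient. */
static inline __m256i gf_mul_xor_32(__m256i src, __m256i affine, __m256i dest)
{
        __m256i prod = _mm256_gf2p8affine_epi64_epi8(src, affine, 0);
        return _mm256_xor_si256(dest, prod);
}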
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_2vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" +%include "memcpy.asm" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r13 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define stack_size 16*9 + 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + mov [rsp + 9*16 + 0*8], r12 + mov [rsp + 9*16 + 1*8], r13 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + mov r12, [rsp + 9*16 + 0*8] + mov r13, [rsp + 9*16 + 1*8] + add rsp, stack_size + %endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 mul_array +%define dest3 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0l ymm0 +%define x0h ymm1 +%define x0x ymm2 + +%define xgft1 ymm3 +%define xgft2 ymm4 +%define xd1l ymm5 +%define xd1h ymm6 +%define xd1x ymm7 + +%define xd2l ymm8 +%define xd2h ymm9 +%define xd2x ymm10 + +%define xret1l ymm11 +%define xret1h ymm12 +%define xret2l ymm13 +%define xret2h ymm14 + +%define x0 x0l +%define xd1 xd1l +%define xd2 xd2l +%define xret1 xret1l +%define xret2 xret2l + +;; +;; Encodes 96 bytes of a single source into 2x 96 bytes (parity disks) +;; +%macro ENCODE_96B_2 0 + + ;Get next source vector + XLDR x0l, [src + pos] + XLDR x0h, [src + pos + 32] + XLDR x0x, [src + pos + 64] + ;Get next dest vectors + XLDR xd1l, [dest1 + pos] + XLDR xd1h, [dest1 + pos + 32] + XLDR xd1x, [dest1 + pos + 64] + XLDR xd2l, [dest2 + pos] + XLDR xd2h, [dest2 + pos + 32] + XLDR xd2x, [dest2 + pos + 64] + + GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l, xgft2, xret2l, xd2l + GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h, xgft2, xret2h, xd2h + GF_MUL_XOR VEX, x0x, xgft1, xret1l, xd1x, xgft2, xret2l, xd2x + + XSTR [dest1 + pos], xd1l + XSTR [dest1 + pos + 32], xd1h + XSTR [dest1 + pos + 64], xd1x + XSTR [dest2 + pos], xd2l + XSTR [dest2 + pos + 32], xd2h + XSTR [dest2 + pos + 64], xd2x +%endmacro + +;; +;; Encodes 64 bytes of a single source into 2x 64 bytes (parity disks) +;; +%macro ENCODE_64B_2 0 + + ;Get 
next source vector + XLDR x0l, [src + pos] + XLDR x0h, [src + pos + 32] + ;Get next dest vectors + XLDR xd1l, [dest1 + pos] + XLDR xd1h, [dest1 + pos + 32] + XLDR xd2l, [dest2 + pos] + XLDR xd2h, [dest2 + pos + 32] + + GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l, xgft2, xret2l, xd2l + GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h, xgft2, xret2h, xd2h + + XSTR [dest1 + pos], xd1l + XSTR [dest1 + pos + 32], xd1h + XSTR [dest2 + pos], xd2l + XSTR [dest2 + pos + 32], xd2h +%endmacro + +;; +;; Encodes 32 bytes of a single source into 2x 32 bytes (parity disks) +;; +%macro ENCODE_32B_2 0 + + ;Get next source vector + XLDR x0, [src + pos] + ;Get next dest vectors + XLDR xd1, [dest1 + pos] + XLDR xd2, [dest2 + pos] + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2 + + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 +%endmacro + +;; +;; Encodes less than 32 bytes of a single source into 2x parity disks +;; +%macro ENCODE_LT_32B_2 1 +%define %%LEN %1 + + ;Get next source vector + simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2 + ;Get next dest vectors + simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd2, dest2 + pos, %%LEN, tmp, tmp2 + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2 + + lea dest1, [dest1 + pos] + simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2 + lea dest2, [dest2 + pos] + simd_store_avx2 dest2, xd2, %%LEN, tmp, tmp2 +%endmacro + +align 16 +global gf_2vect_mad_avx2_gfni, function +func(gf_2vect_mad_avx2_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, [mul_array + vec_i] + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + mov dest2, [dest1 + 8] ; reuse mul_array + mov dest1, [dest1] + + cmp len, 96 + jl .len_lt_96 + +.loop96: + ENCODE_96B_2 + add pos, 96 ;; loop on 96 bytes at a time + sub len, 96 + cmp len, 96 + jge .loop96 + +.len_lt_96: + cmp len, 64 + jl .len_lt_64 + ENCODE_64B_2 ;; encode next 64 bytes + + add pos, 64 + sub len, 64 + +.len_lt_64: + cmp len, 32 + jl .len_lt_32 + + ENCODE_32B_2 ;; encode next 32 bytes + + add pos, 32 + sub len, 32 + +.len_lt_32: + cmp len, 0 + jle .exit + + ENCODE_LT_32B_2 len ;; encode final bytes + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm index acb67e4334..1b76432eb7 100644 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm @@ -45,7 +45,7 @@ %define tmp r11 %define tmp2 r10 %define return rax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -118,8 +118,8 @@ %else ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST - %define XLDR vmovdqa - %define XSTR vmovdqa + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 %else %define XLDR vmovntdqa %define XSTR vmovntdq @@ -149,13 +149,8 @@ section .text %define xmask0f zmm14 align 16 -global gf_2vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION +global gf_2vect_mad_avx512, function func(gf_2vect_mad_avx512) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_2vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION -func(_gf_2vect_mad_avx512) -%endif - FUNC_SAVE sub len, 64 jl .return_fail diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.patch deleted file mode 100644 index 6b3d2e6d23..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.patch +++ 
/dev/null @@ -1,6 +0,0 @@ -154,158d153 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_2vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION -< func(_gf_2vect_mad_avx512) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512_gfni.asm new file mode 100644 index 0000000000..41343305b1 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512_gfni.asm @@ -0,0 +1,189 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
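gf_2vect_mad_avx512_gfni below (like the other AVX-512 kernels in this change) handles the final partial block with an opmask built from the remaining length: xor/bts/dec followed by kmovq yields (1 << len) - 1, so one masked load/compute/store pass replaces a scalar tail loop. A hedged intrinsics sketch of that mechanism, shown with a plain XOR instead of the full GF multiply:

#include <immintrin.h>
#include <stdint.h>

/* Process a tail of 1..63 bytes in one masked pass; mirrors the mask
 * construction in the asm, which applies the GFNI multiply between the
 * load and the store. */
static void xor_tail(uint8_t *dest, const uint8_t *src, unsigned rem)
{
        __mmask64 k = (__mmask64)((1ULL << rem) - 1);   /* same value bts/dec build */
        __m512i x = _mm512_maskz_loadu_epi8(k, src);    /* lanes beyond rem read as zero */
        __m512i d = _mm512_maskz_loadu_epi8(k, dest);
        _mm512_mask_storeu_epi8(dest, k, _mm512_xor_si512(d, x));
}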
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_2vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r13 + %define tmp r11 + %define tmp2 r10 + %define stack_size 16 + 3*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 16*0], xmm6 + mov [rsp + 16 + 0*8], r12 + mov [rsp + 16 + 1*8], r13 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 16*0] + mov r12, [rsp + 16 + 0*8] + mov r13, [rsp + 16 + 1*8] + add rsp, stack_size + %endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 tmp2 + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 zmm0 +%define xd1 zmm1 +%define xd2 zmm2 +%define xgft1 zmm3 +%define xgft2 zmm4 +%define xret1 zmm5 +%define xret2 zmm6 + +;; +;; Encodes 64 bytes of a single source into 2x 64 bytes (parity disks) +;; +%macro ENCODE_64B_2 0-1 +%define %%KMASK %1 + +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector + vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector + vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector +%else + XLDR x0, [src + pos] ;Get next source vector + XLDR xd1, [dest1 + pos] ;Get next dest vector + XLDR xd2, [dest2 + pos] ;Get next dest vector +%endif + + GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2 + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xd2 +%else + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 +%endif +%endmacro + +align 16 +global gf_2vect_mad_avx512_gfni, function +func(gf_2vect_mad_avx512_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 + lea tmp, [mul_array + vec_i] + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + mov dest2, [dest1 + 8] ; reuse mul_array + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 +.loop64: + ENCODE_64B_2 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_2 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm index 5bf380df14..1fa6729a6c 100644 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm @@ -97,7 +97,7 @@ %define return rax %define return.w eax - %define func(x) x: + 
%define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -154,13 +154,8 @@ section .text align 16 -global gf_2vect_mad_sse:ISAL_SYM_TYPE_FUNCTION +global gf_2vect_mad_sse, function func(gf_2vect_mad_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_2vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_2vect_mad_sse) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -239,6 +234,3 @@ align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_2vect_mad_sse, 00, 01, 0203 diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.patch deleted file mode 100644 index 1d9e040742..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -159,163d158 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_2vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_2vect_mad_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm index a2619507b7..7676c56229 100644 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm @@ -52,7 +52,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -87,12 +87,12 @@ %define func(x) proc_frame x %macro FUNC_SAVE 0 alloc_stack stack_size - save_xmm128 xmm6, 0*16 - save_xmm128 xmm7, 1*16 - save_xmm128 xmm8, 2*16 - save_xmm128 xmm9, 3*16 - save_xmm128 xmm10, 4*16 - save_xmm128 xmm11, 5*16 + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 save_reg r12, 6*16 + 0*8 save_reg r13, 6*16 + 1*8 save_reg r14, 6*16 + 2*8 @@ -139,7 +139,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define var(x) [ebp - PS - PS*x] @@ -261,13 +261,8 @@ section .text %endif align 16 -global gf_3vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION +global gf_3vect_dot_prod_avx, function func(gf_3vect_dot_prod_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_3vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_3vect_dot_prod_avx) -%endif - FUNC_SAVE SLDR len, len_m sub len, 16 @@ -377,6 +372,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_3vect_dot_prod_avx, 02, 05, 0192 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.patch deleted file mode 100644 index 8689356763..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -266,270d265 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_3vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_3vect_dot_prod_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm index 26b6b82e21..d06ccc30d2 100644 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm @@ -54,7 +54,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -143,7 +143,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] 
%define var(x) [ebp - PS - PS*x] @@ -269,13 +269,8 @@ section .text %endif align 16 -global gf_3vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_3vect_dot_prod_avx2, function func(gf_3vect_dot_prod_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_3vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_3vect_dot_prod_avx2) -%endif - FUNC_SAVE SLDR len, len_m sub len, 32 @@ -395,8 +390,3 @@ func(_gf_3vect_dot_prod_avx2) ret endproc_frame - -section .data - -;;; func core, ver, snum -slversion gf_3vect_dot_prod_avx2, 04, 05, 0197 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.patch deleted file mode 100644 index 9c59162877..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -274,278d273 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_3vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_3vect_dot_prod_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2_gfni.asm new file mode 100644 index 0000000000..76a19763a3 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2_gfni.asm @@ -0,0 +1,335 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
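The dot-product kernels below walk all `vec` source buffers once and accumulate each into three parity buffers, broadcasting one coefficient per (source, parity) pair from g_tbls at stride `vec`. A scalar sketch of the same computation; gf_mul() here is a self-contained illustration of GF(2^8) multiplication over the 0x11d polynomial that gf_vect_mul_init() reduces by, not a library call:

#include <stdint.h>

/* Minimal GF(2^8) multiply (reduction by 0x1d on overflow), illustration only. */
static uint8_t gf_mul(uint8_t a, uint8_t b)
{
        uint8_t p = 0;
        while (b) {
                if (b & 1)
                        p ^= a;
                a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
                b >>= 1;
        }
        return p;
}

/* Scalar model of the 3-vector dot product: three independent GF(2^8) dot
 * products over the k sources at every byte position.  coeff[d][j] stands in
 * for the entry the asm reads at g_tbls + d*stride + j. */
static void dot_prod_3_ref(int len, int k, const uint8_t *coeff[3],
                           uint8_t **src, uint8_t **dest)
{
        for (int i = 0; i < len; i++) {
                uint8_t p0 = 0, p1 = 0, p2 = 0;
                for (int j = 0; j < k; j++) {
                        uint8_t s = src[j][i];
                        p0 ^= gf_mul(coeff[0][j], s);
                        p1 ^= gf_mul(coeff[1][j], s);
                        p2 ^= gf_mul(coeff[2][j], s);
                }
                dest[0][i] = p0;
                dest[1][i] = p1;
                dest[2][i] = p2;
        }
}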
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_3vect_dot_prod_avx2_gfni(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" +%include "memcpy.asm" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r12 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + + %define stack_size 4*8 + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + sub rsp, stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + mov [rsp + 2*8], r14 + mov [rsp + 3*8], r15 + %endmacro + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + mov r14, [rsp + 2*8] + mov r15, [rsp + 3*8] + add rsp, stack_size + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r14 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define stack_size 8*16 + 7*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + mov [rsp + 8*16 + 0*8], r12 + mov [rsp + 8*16 + 1*8], r13 + mov [rsp + 8*16 + 2*8], r14 + mov [rsp + 8*16 + 3*8], r15 + mov [rsp + 8*16 + 4*8], rdi + mov [rsp + 8*16 + 5*8], rsi + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + mov r12, [rsp + 8*16 + 0*8] + mov r13, [rsp + 8*16 + 1*8] + mov r14, [rsp + 8*16 + 2*8] + mov r15, [rsp + 8*16 + 3*8] + mov rdi, [rsp + 8*16 + 4*8] + mov rsi, [rsp + 8*16 + 5*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define dest3 tmp4 +%define dest1 tmp5 +%define pos rax + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define x0l ymm0 +%define x0h ymm1 + +%define xgft1 ymm8 +%define xgft2 ymm9 +%define xgft3 ymm10 + +%define xtmp1 ymm11 +%define xtmp2 ymm12 +%define xtmp3 ymm13 + +%define xp1l ymm2 +%define xp2l ymm3 +%define xp3l ymm4 +%define xp1h ymm5 +%define xp2h ymm6 +%define xp3h ymm7 + +%define x0 x0l +%define xp1 xp1l +%define xp2 xp2l +%define xp3 xp3l + +default rel +[bits 64] + +section .text + +;; +;; Encodes 64 bytes of all "k" sources into 3x 64 bytes (parity 
disks) +;; +%macro ENCODE_64B_3 0 + vpxor xp1l, xp1l, xp1l + vpxor xp1h, xp1h, xp1h + vpxor xp2l, xp2l, xp2l + vpxor xp2h, xp2h, xp2h + vpxor xp3l, xp3l, xp3l + vpxor xp3h, xp3h, xp3h + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] + XLDR x0l, [ptr + pos] ;; Get next source vector low 32 bytes + XLDR x0h, [ptr + pos + 32] ;; Get next source vector high 32 bytes + add vec_i, 8 + + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + vbroadcastsd xgft3, [tmp + vec*2] + add tmp, 8 + + GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l, xgft3, xtmp3, xp3l + GF_MUL_XOR VEX, x0h, xgft1, xgft1, xp1h, xgft2, xgft2, xp2h, xgft3, xgft3, xp3h + + cmp vec_i, vec + jl %%next_vect + + XSTR [dest1 + pos], xp1l + XSTR [dest1 + pos + 32], xp1h + XSTR [dest2 + pos], xp2l + XSTR [dest2 + pos + 32], xp2h + XSTR [dest3 + pos], xp3l + XSTR [dest3 + pos + 32], xp3h +%endmacro + +;; +;; Encodes 32 bytes of all "k" sources into 3x 32 bytes (parity disks) +;; +%macro ENCODE_32B_3 0 + vpxor xp1, xp1, xp1 + vpxor xp2, xp2, xp2 + vpxor xp3, xp3, xp3 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] + XLDR x0, [ptr + pos] ;Get next source vector (32 bytes) + add vec_i, 8 + + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + vbroadcastsd xgft3, [tmp + vec*2] + add tmp, 8 + + GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3 + + cmp vec_i, vec + jl %%next_vect + + XSTR [dest1 + pos], xp1 + XSTR [dest2 + pos], xp2 + XSTR [dest3 + pos], xp3 +%endmacro + +;; +;; Encodes less than 32 bytes of all "k" sources into 3 parity disks +;; +%macro ENCODE_LT_32B_3 1 +%define %%LEN %1 + + vpxor xp1, xp1, xp1 + vpxor xp2, xp2, xp2 + vpxor xp3, xp3, xp3 + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] + simd_load_avx2 x0, ptr + pos, %%LEN, tmp, tmp6 ;Get next source vector + add vec_i, 8 + + vbroadcastsd xgft1, [mul_array] + vbroadcastsd xgft2, [mul_array + vec] + vbroadcastsd xgft3, [mul_array + vec*2] + add mul_array, 8 + + GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3 + + cmp vec_i, vec + jl %%next_vect + + ;Store updated encoded data + lea ptr, [dest1 + pos] + simd_store_avx2 ptr, xp1, %%LEN, tmp, vec_i + + lea ptr, [dest2 + pos] + simd_store_avx2 ptr, xp2, %%LEN, tmp, vec_i + + lea ptr, [dest3 + pos] + simd_store_avx2 ptr, xp3, %%LEN, tmp, vec_i +%endmacro + +align 16 +global gf_3vect_dot_prod_avx2_gfni, function +func(gf_3vect_dot_prod_avx2_gfni) + FUNC_SAVE + + xor pos, pos + shl vec, 3 ;; vec *= 8. 
Make vec_i count by 8 + mov dest1, [dest] + mov dest2, [dest + 8] + mov dest3, [dest + 2*8] + + cmp len, 64 + jl .len_lt_64 + +.loop64: + ENCODE_64B_3 + + add pos, 64 ;; Loop on 64 bytes at a time first + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 32 + jl .len_lt_32 + + ENCODE_32B_3 + + add pos, 32 ;; encode next 32 bytes + sub len, 32 + +.len_lt_32: + cmp len, 0 + jle .exit + + ENCODE_LT_32B_3 len + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm index 16a90eb2af..fcd919367d 100644 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm @@ -44,8 +44,6 @@ %define arg5 r9 %define tmp r11 - %define tmp.w r11d - %define tmp.b r11b %define tmp2 r10 %define tmp3 r13 ; must be saved and restored %define tmp4 r12 ; must be saved and restored @@ -53,7 +51,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -73,15 +71,13 @@ %define arg4 r12 ; must be saved, loaded and restored %define arg5 r15 ; must be saved and restored %define tmp r11 - %define tmp.w r11d - %define tmp.b r11b %define tmp2 r10 %define tmp3 r13 ; must be saved and restored %define tmp4 r14 ; must be saved and restored %define return rax %define PS 8 %define LOG_PS 3 - %define stack_size 9*16 + 5*8 ; must be an odd multiple of 8 + %define stack_size 6*16 + 5*8 ; must be an odd multiple of 8 %define arg(x) [rsp + stack_size + PS + PS*x] %define func(x) proc_frame x @@ -93,13 +89,10 @@ vmovdqa [rsp + 3*16], xmm9 vmovdqa [rsp + 4*16], xmm10 vmovdqa [rsp + 5*16], xmm11 - vmovdqa [rsp + 6*16], xmm12 - vmovdqa [rsp + 7*16], xmm13 - vmovdqa [rsp + 8*16], xmm14 - save_reg r12, 9*16 + 0*8 - save_reg r13, 9*16 + 1*8 - save_reg r14, 9*16 + 2*8 - save_reg r15, 9*16 + 3*8 + save_reg r12, 6*16 + 0*8 + save_reg r13, 6*16 + 1*8 + save_reg r14, 6*16 + 2*8 + save_reg r15, 6*16 + 3*8 end_prolog mov arg4, arg(4) %endmacro @@ -111,13 +104,10 @@ vmovdqa xmm9, [rsp + 3*16] vmovdqa xmm10, [rsp + 4*16] vmovdqa xmm11, [rsp + 5*16] - vmovdqa xmm12, [rsp + 6*16] - vmovdqa xmm13, [rsp + 7*16] - vmovdqa xmm14, [rsp + 8*16] - mov r12, [rsp + 9*16 + 0*8] - mov r13, [rsp + 9*16 + 1*8] - mov r14, [rsp + 9*16 + 2*8] - mov r15, [rsp + 9*16 + 3*8] + mov r12, [rsp + 6*16 + 0*8] + mov r13, [rsp + 6*16 + 1*8] + mov r14, [rsp + 6*16 + 2*8] + mov r15, [rsp + 6*16 + 3*8] add rsp, stack_size %endmacro %endif @@ -142,8 +132,8 @@ %else ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST - %define XLDR vmovdqa - %define XSTR vmovdqa + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 %else %define XLDR vmovntdqa %define XSTR vmovntdq @@ -173,13 +163,8 @@ default rel section .text align 16 -global gf_3vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION +global gf_3vect_dot_prod_avx512, function func(gf_3vect_dot_prod_avx512) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_3vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION -func(_gf_3vect_dot_prod_avx512) -%endif - FUNC_SAVE sub len, 64 jl .return_fail diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.patch deleted file mode 100644 index 8397eb6861..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.patch +++ /dev/null @@ -1,6 +0,0 @@ -178,182d177 -< %ifidn __OUTPUT_FORMAT__, macho64 -< 
global _gf_3vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION -< func(_gf_3vect_dot_prod_avx512) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512_gfni.asm new file mode 100644 index 0000000000..39ee6382a2 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512_gfni.asm @@ -0,0 +1,225 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
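For context, these n-vector dot-product and multiply-add kernels sit underneath the ec_encode_data() family; a typical caller never invokes them directly. A hedged usage sketch following the usual isa-l pattern (hypothetical wrapper, error handling and size checks omitted):

#include <stdlib.h>
#include "erasure_code.h"

/* Encode k data buffers into p parity buffers of `len` bytes each; which
 * gf_Nvect_dot_prod_* kernel runs underneath is selected at runtime. */
static void encode_example(int k, int p, int len,
                           unsigned char **data, unsigned char **parity)
{
        unsigned char *a = malloc((k + p) * k);        /* encode matrix, identity on top */
        unsigned char *g_tbls = malloc(k * p * 32);    /* expanded multiplication tables */

        gf_gen_rs_matrix(a, k + p, k);                 /* or gf_gen_cauchy1_matrix */
        ec_init_tables(k, p, &a[k * k], g_tbls);       /* expand the p parity rows */
        ec_encode_data(len, k, p, g_tbls, data, parity);

        free(a);
        free(g_tbls);
}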
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_3vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r12 ; must be saved and restored + + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + push r12 + push r13 + %endmacro + %macro FUNC_RESTORE 0 + pop r13 + pop r12 + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r14 ; must be saved and restored + %define stack_size 1*16 + 5*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + mov [rsp + 1*16 + 0*8], r12 + mov [rsp + 1*16 + 1*8], r13 + mov [rsp + 1*16 + 2*8], r14 + mov [rsp + 1*16 + 3*8], r15 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + mov r12, [rsp + 1*16 + 0*8] + mov r13, [rsp + 1*16 + 1*8] + mov r14, [rsp + 1*16 + 2*8] + mov r15, [rsp + 1*16 + 3*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define dest3 tmp4 +%define pos rax + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define xgft1 zmm4 +%define xgft2 zmm5 +%define xgft3 zmm6 + +%define x0 zmm0 +%define xp1 zmm1 +%define xp2 zmm2 +%define xp3 zmm3 + +default rel +[bits 64] + +section .text + +;; +;; Encodes 64 bytes of all "k" sources into 3x 64 bytes (parity disks) +;; +%macro ENCODE_64B_3 0-1 +%define %%KMASK %1 + + vpxorq xp1, xp1, xp1 + vpxorq xp2, xp2, xp2 + vpxorq xp3, xp3, xp3 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes) +%else + XLDR x0, [ptr + pos] ;Get next source vector (64 bytes) +%endif + add vec_i, 8 + + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + vbroadcastf32x2 xgft3, [tmp + vec*2] + add tmp, 8 + + GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3 + + cmp vec_i, vec + jl %%next_vect + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xp2 + vmovdqu8 [dest3 + pos]{%%KMASK}, xp3 +%else + XSTR [dest1 + pos], xp1 + XSTR [dest2 + pos], xp2 + XSTR [dest3 + pos], xp3 +%endif +%endmacro + +align 16 +global gf_3vect_dot_prod_avx512_gfni, function +func(gf_3vect_dot_prod_avx512_gfni) + FUNC_SAVE + + xor pos, pos + shl vec, 3 ;vec *= 8. 
Make vec_i count by 8 + mov dest2, [dest1 + 8] + mov dest3, [dest1 + 2*8] + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 + +.loop64: + + ENCODE_64B_3 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_3 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm index 582fac8481..af0875016c 100644 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm @@ -52,7 +52,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -139,7 +139,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define var(x) [ebp - PS - PS*x] @@ -261,13 +261,8 @@ section .text %endif align 16 -global gf_3vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION +global gf_3vect_dot_prod_sse, function func(gf_3vect_dot_prod_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_3vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_3vect_dot_prod_sse) -%endif - FUNC_SAVE SLDR len, len_m sub len, 16 @@ -378,6 +373,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_3vect_dot_prod_sse, 00, 06, 0063 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.patch deleted file mode 100644 index f21ce0ff9c..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -266,270d265 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_3vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_3vect_dot_prod_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.c deleted file mode 100644 index b2c19382ff..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.c +++ /dev/null @@ -1,586 +0,0 @@ -/********************************************************************** - Copyright(c) 2011-2015 Intel Corporation All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> // for memset, memcmp -#include "erasure_code.h" -// #include "types.h" - -#ifndef FUNCTION_UNDER_TEST -# define FUNCTION_UNDER_TEST gf_3vect_dot_prod_sse -#endif -#ifndef TEST_MIN_SIZE -# define TEST_MIN_SIZE 16 -#endif - -#define str(s) #s -#define xstr(s) str(s) - -#define TEST_LEN 2048 -#define TEST_SIZE (TEST_LEN/2) -#define TEST_MEM TEST_SIZE -#define TEST_LOOPS 1000 -#define TEST_TYPE_STR "" - -#ifndef TEST_SOURCES -# define TEST_SOURCES 16 -#endif -#ifndef RANDOMS -# define RANDOMS 20 -#endif - -#ifdef EC_ALIGNED_ADDR -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 0 -# define LEN_ALIGN_CHK_B 0 // 0 for aligned only -#else -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 32 -# define LEN_ALIGN_CHK_B 32 // 0 for aligned only -#endif - -typedef unsigned char u8; - -extern void FUNCTION_UNDER_TEST(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - -void dump(unsigned char *buf, int len) -{ - int i; - for (i = 0; i < len;) { - printf(" %2x", 0xff & buf[i++]); - if (i % 32 == 0) - printf("\n"); - } - printf("\n"); -} - -void dump_matrix(unsigned char **s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", s[i][j]); - } - printf("\n"); - } - printf("\n"); -} - -void dump_u8xu8(unsigned char *s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", 0xff & s[j + (i * m)]); - } - printf("\n"); - } - printf("\n"); -} - -int main(int argc, char *argv[]) -{ - int i, j, rtest, srcs; - void *buf; - u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES]; - u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES]; - u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3; - - int align, size; - unsigned char *efence_buffs[TEST_SOURCES]; - unsigned int offset; - u8 *ubuffs[TEST_SOURCES]; - u8 *udest_ptrs[3]; - printf(xstr(FUNCTION_UNDER_TEST) "_test: %dx%d ", TEST_SOURCES, TEST_LEN); - - // Allocate the arrays - for (i = 0; i < TEST_SOURCES; i++) { - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - buffs[i] = buf; - } - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest3 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail");; - return -1; - } - dest_ref2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref3 = buf; - - 
dest_ptrs[0] = dest1; - dest_ptrs[1] = dest2; - dest_ptrs[2] = dest3; - - // Test of all zeros - for (i = 0; i < TEST_SOURCES; i++) - memset(buffs[i], 0, TEST_LEN); - - memset(dest1, 0, TEST_LEN); - memset(dest2, 0, TEST_LEN); - memset(dest3, 0, TEST_LEN); - memset(dest_ref1, 0, TEST_LEN); - memset(dest_ref2, 0, TEST_LEN); - memset(dest_ref3, 0, TEST_LEN); - memset(g1, 2, TEST_SOURCES); - memset(g2, 1, TEST_SOURCES); - memset(g3, 7, TEST_SOURCES); - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs, - dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs, - dest_ref3); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail zero" xstr(FUNCTION_UNDER_TEST) " test1\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - - putchar('.'); - - // Rand data test - - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - buffs, dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], - buffs, dest_ref3); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - - putchar('.'); - } - - // Rand data test with varied parameters - for (rtest = 0; rtest < RANDOMS; 
rtest++) { - for (srcs = TEST_SOURCES; srcs > 0; srcs--) { - for (i = 0; i < srcs; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs, - dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs, - dest_ref3); - - FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test1 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test2 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test3 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - - putchar('.'); - } - } - - // Run tests at end of buffer for Electric Fence - align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16; - for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end - efence_buffs[i] = buffs[i] + TEST_LEN - size; - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - efence_buffs, dest_ref2); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], - efence_buffs, dest_ref3); - - FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, align); - printf("dprod_dut:"); - dump(dest1, align); - return -1; - } - - if (0 != memcmp(dest_ref2, dest2, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, align); - printf("dprod_dut:"); - dump(dest2, align); - return -1; - } - - if (0 != memcmp(dest_ref3, dest3, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, align); - printf("dprod_dut:"); - dump(dest3, align); - return -1; - } - - putchar('.'); - } - - // Test rand ptr alignment if available - - for 
(rtest = 0; rtest < RANDOMS; rtest++) { - size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1); - srcs = rand() % TEST_SOURCES; - if (srcs == 0) - continue; - - offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B; - // Add random offsets - for (i = 0; i < srcs; i++) - ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset)); - - udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset)); - - memset(dest1, 0, TEST_LEN); // zero pad to check write-over - memset(dest2, 0, TEST_LEN); - memset(dest3, 0, TEST_LEN); - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - ubuffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2); - gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs); - - if (memcmp(dest_ref1, udest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(udest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, udest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(udest_ptrs[1], 25); - return -1; - } - if (memcmp(dest_ref3, udest_ptrs[2], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(udest_ptrs[2], 25); - return -1; - } - // Confirm that padding around dests is unchanged - memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff - offset = udest_ptrs[0] - dest1; - - if (memcmp(dest1, dest_ref1, offset)) { - printf("Fail rand ualign pad1 start\n"); - return -1; - } - if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad1 end\n"); - return -1; - } - - offset = udest_ptrs[1] - dest2; - if (memcmp(dest2, dest_ref1, offset)) { - printf("Fail rand ualign pad2 start\n"); - return -1; - } - if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad2 end\n"); - return -1; - } - - offset = udest_ptrs[2] - dest3; - if (memcmp(dest3, dest_ref1, offset)) { - printf("Fail rand ualign pad3 start\n"); - return -1; - } - if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad3 end\n");; - return -1; - } - - putchar('.'); - } - - // Test all size alignment - align = (LEN_ALIGN_CHK_B != 0) ? 
1 : 16; - - for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) { - srcs = TEST_SOURCES; - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2); - gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs); - - if (memcmp(dest_ref1, dest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, dest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest_ptrs[1], 25); - return -1; - } - if (memcmp(dest_ref3, dest_ptrs[2], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest_ptrs[2], 25); - return -1; - } - } - - printf("Pass\n"); - return 0; - -} diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.patch deleted file mode 100644 index 21bbfaa667..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm index 7cf630558c..4e30d1764e 100644 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm @@ -97,7 +97,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -158,12 +158,8 @@ section .text %define xd3 xtmph1 align 16 -global gf_3vect_mad_avx:ISAL_SYM_TYPE_FUNCTION +global gf_3vect_mad_avx, function func(gf_3vect_mad_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_3vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_3vect_mad_avx) -%endif FUNC_SAVE sub len, 16 jl .return_fail @@ -287,6 +283,3 @@ align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f constip16: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 - -;;; func core, ver, snum -slversion gf_3vect_mad_avx, 02, 01, 0207 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.patch deleted file mode 100644 index 983b4fc414..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.patch +++ /dev/null @@ -1,5 +0,0 @@ -163,166d162 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_3vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_3vect_mad_avx) -< %endif diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm index c218b4db28..069c5103bc 100644 --- 
a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm @@ -103,7 +103,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -165,13 +165,8 @@ section .text %define xd3 ymm10 align 16 -global gf_3vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_3vect_mad_avx2, function func(gf_3vect_mad_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_3vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_3vect_mad_avx2) -%endif - FUNC_SAVE sub len, 32 jl .return_fail @@ -317,6 +312,3 @@ align 32 constip32: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 dq 0xe8e9eaebecedeeef, 0xe0e1e2e3e4e5e6e7 - -;;; func core, ver, snum -slversion gf_3vect_mad_avx2, 04, 01, 0208 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.patch deleted file mode 100644 index 058f09b3c4..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -170,174d169 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_3vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_3vect_mad_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2_gfni.asm new file mode 100644 index 0000000000..8a04577acd --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2_gfni.asm @@ -0,0 +1,276 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_3vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" +%include "memcpy.asm" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r13 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 + mov [rsp + 10*16 + 0*8], r12 + mov [rsp + 10*16 + 1*8], r13 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + vmovdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + add rsp, stack_size + %endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 mul_array +%define dest3 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0l ymm0 +%define x0h ymm0 ; reuse ymm0 +%define xgft1 ymm1 +%define xgft2 ymm2 +%define xgft3 ymm3 +%define xd1l ymm4 +%define xd1h ymm5 +%define xd2l ymm6 +%define xd2h ymm7 +%define xd3l ymm8 +%define xd3h ymm9 + +%define xret1l ymm10 +%define xret1h ymm11 +%define xret2l ymm12 +%define xret2h ymm13 +%define xret3l ymm14 +%define xret3h ymm15 + +%define x0 x0l +%define xd1 xd1l +%define xd2 xd2l +%define xd3 xd3l +%define xret1 xret1l +%define xret2 xret2l +%define xret3 xret3l + +;; +;; Encodes 64 bytes of a single source into 3x 64 bytes (parity disks) +;; +%macro ENCODE_64B_3 0 + ; get next source vector + XLDR x0l, [src + pos] ;; read low 32 bytes + ; get next dest vectors + XLDR xd1l, [dest1 + pos] + XLDR xd1h, [dest1 + pos + 32] + XLDR xd2l, [dest2 + pos] + XLDR xd2h, [dest2 + pos + 32] + XLDR xd3l, [dest3 + pos] + XLDR xd3h, [dest3 + pos + 32] + + GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l, xgft2, xret2l, xd2l, xgft3, xret3l, xd3l + + XLDR x0h, [src + pos + 32] ;; read high 32 bytes + + GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h, xgft2, xret2h, xd2h, xgft3, xret3h, xd3h + + XSTR [dest1 + pos], xd1l + XSTR [dest1 + pos + 32], xd1h + XSTR [dest2 + pos], xd2l + XSTR [dest2 + pos + 32], xd2h + XSTR [dest3 + pos], xd3l + XSTR [dest3 
+ pos + 32], xd3h +%endmacro + +;; +;; Encodes 32 bytes of a single source into 3x 32 bytes (parity disks) +;; +%macro ENCODE_32B_3 0 + ; get next source vector + XLDR x0, [src + pos] + ; get next dest vectors + XLDR xd1, [dest1 + pos] + XLDR xd2, [dest2 + pos] + XLDR xd3, [dest3 + pos] + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3 + + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 + XSTR [dest3 + pos], xd3 +%endmacro + +;; +;; Encodes less than 32 bytes of a single source into 3x parity disks +;; +%macro ENCODE_LT_32B_3 1 +%define %%LEN %1 + ; get next source vector + simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2 + ; get next dest vectors + simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd2, dest2 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd3, dest3 + pos, %%LEN, tmp, tmp2 + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3 + + lea dest1, [dest1 + pos] + simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2 + lea dest2, [dest2 + pos] + simd_store_avx2 dest2, xd2, %%LEN, tmp, tmp2 + lea dest3, [dest3 + pos] + simd_store_avx2 dest3, xd3, %%LEN, tmp, tmp2 +%endmacro + +align 16 +global gf_3vect_mad_avx2_gfni, function +func(gf_3vect_mad_avx2_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, [mul_array + vec_i] + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + vbroadcastsd xgft3, [tmp + vec*2] + mov dest2, [dest1 + 8] ; reuse mul_array + mov dest3, [dest1 + 2*8] ; reuse vec_i + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 + +.loop64: + ENCODE_64B_3 ;; loop on 64 bytes at a time + + add pos, 64 + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 32 + jl .len_lt_32 + + ENCODE_32B_3 ;; encode next 32 bytes + + add pos, 32 + sub len, 32 + +.len_lt_32: + cmp len, 0 + jle .exit + + ENCODE_LT_32B_3 len ;; encode final bytes + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm index 53b3eb5afa..567624d273 100644 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm @@ -44,7 +44,7 @@ %define arg5 r9 %define tmp r11 %define return rax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -117,8 +117,8 @@ %else ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST - %define XLDR vmovdqa - %define XSTR vmovdqa + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 %else %define XLDR vmovntdqa %define XSTR vmovntdq @@ -152,13 +152,8 @@ section .text %define xmask0f zmm17 align 16 -global gf_3vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION +global gf_3vect_mad_avx512, function func(gf_3vect_mad_avx512) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_3vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION -func(_gf_3vect_mad_avx512) -%endif - FUNC_SAVE sub len, 64 jl .return_fail @@ -209,7 +204,7 @@ func(_gf_3vect_mad_avx512) vpshufb xtmph3 {k1}{z}, xgft3_hi, x0 ;Lookup mul table of high nibble vpshufb xtmpl3 {k1}{z}, xgft3_lo, xtmpa ;Lookup mul table of low nibble vpxorq xtmph3, xtmph3, xtmpl3 ;GF add high and low partials - vpxorq xd3, xd3, xtmph3 ;xd2 += partial + vpxorq xd3, xd3, xtmph3 ;xd3 += partial XSTR [dest1+pos], xd1 XSTR [dest2+pos], xd2 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.patch deleted file mode 
100644 index d8b12fac96..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.patch +++ /dev/null @@ -1,6 +0,0 @@ -157,161d156 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_3vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION -< func(_gf_3vect_mad_avx512) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512_gfni.asm new file mode 100644 index 0000000000..53cc812595 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512_gfni.asm @@ -0,0 +1,204 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_3vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r13 ; must be saved and restored + %define tmp r11 + %define stack_size 16*4 + 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + mov [rsp + 4*16 + 0*8], r12 + mov [rsp + 4*16 + 1*8], r13 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + mov r12, [rsp + 4*16 + 0*8] + mov r13, [rsp + 4*16 + 1*8] + add rsp, stack_size + %endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 mul_array +%define dest3 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 zmm0 +%define xgft1 zmm1 +%define xgft2 zmm2 +%define xgft3 zmm3 +%define xd1 zmm4 +%define xd2 zmm5 +%define xd3 zmm6 + +%define xret1 zmm7 +%define xret2 zmm8 +%define xret3 zmm9 + +;; +;; Encodes 64 bytes of a single source into 3x 64 bytes (parity disks) +;; +%macro ENCODE_64B_3 0-1 +%define %%KMASK %1 + +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector + vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector + vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector + vmovdqu8 xd3{%%KMASK}, [dest3 + pos] ;Get next dest vector +%else + XLDR x0, [src + pos] ;Get next source vector + XLDR xd1, [dest1 + pos] ;Get next dest vector + XLDR xd2, [dest2 + pos] ;Get next dest vector + XLDR xd3, [dest3 + pos] ;Get next dest vector +%endif + + GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3 + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xd2 + vmovdqu8 [dest3 + pos]{%%KMASK}, xd3 +%else + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 + XSTR [dest3 + pos], xd3 +%endif +%endmacro + +align 16 +global gf_3vect_mad_avx512_gfni, function +func(gf_3vect_mad_avx512_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, [mul_array + vec_i] + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + vbroadcastf32x2 xgft3, [tmp + vec*2] + mov dest2, [dest1 + 8] ; reuse mul_array + mov dest3, [dest1 + 2*8] ; reuse vec + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 +.loop64: + ENCODE_64B_3 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp 
len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_3 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm index d6dbe8f200..0a4284d53e 100644 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm @@ -96,7 +96,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -156,13 +156,8 @@ section .text %define xd3 xtmph1 align 16 -global gf_3vect_mad_sse:ISAL_SYM_TYPE_FUNCTION +global gf_3vect_mad_sse, function func(gf_3vect_mad_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_3vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_3vect_mad_sse) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -298,6 +293,3 @@ mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f constip16: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 - -;;; func core, ver, snum -slversion gf_3vect_mad_sse, 00, 01, 0206 diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.patch deleted file mode 100644 index 83363c45cf..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -161,165d160 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_3vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_3vect_mad_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm index 30f1e81f6b..077018eefd 100644 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm @@ -54,7 +54,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -95,15 +95,15 @@ %define func(x) proc_frame x %macro FUNC_SAVE 0 alloc_stack stack_size - save_xmm128 xmm6, 0*16 - save_xmm128 xmm7, 1*16 - save_xmm128 xmm8, 2*16 - save_xmm128 xmm9, 3*16 - save_xmm128 xmm10, 4*16 - save_xmm128 xmm11, 5*16 - save_xmm128 xmm12, 6*16 - save_xmm128 xmm13, 7*16 - save_xmm128 xmm14, 8*16 + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 save_reg r12, 9*16 + 0*8 save_reg r13, 9*16 + 1*8 save_reg r14, 9*16 + 2*8 @@ -159,7 +159,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define var(x) [ebp - PS - PS*x] @@ -294,13 +294,8 @@ section .text %define xp4 xmm5 %endif align 16 -global gf_4vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION +global gf_4vect_dot_prod_avx, function func(gf_4vect_dot_prod_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_4vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_4vect_dot_prod_avx) -%endif - FUNC_SAVE SLDR len, len_m sub len, 16 @@ -441,6 +436,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_4vect_dot_prod_avx, 02, 05, 0193 diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.patch deleted file mode 100644 index aa908028bb..0000000000 --- 
a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -299,303d298 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_4vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_4vect_dot_prod_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm index efe2f76de9..8d5febe0fa 100644 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm @@ -56,7 +56,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -163,7 +163,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define var(x) [ebp - PS - PS*x] @@ -301,15 +301,9 @@ section .text %define xp3 ymm4 %define xp4 ymm5 %endif - align 16 -global gf_4vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_4vect_dot_prod_avx2, function func(gf_4vect_dot_prod_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_4vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_4vect_dot_prod_avx2) -%endif - FUNC_SAVE SLDR len, len_m sub len, 32 @@ -459,8 +453,3 @@ func(_gf_4vect_dot_prod_avx2) ret endproc_frame - -section .data - -;;; func core, ver, snum -slversion gf_4vect_dot_prod_avx2, 04, 05, 0198 diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.patch deleted file mode 100644 index 39cdd548a7..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.patch +++ /dev/null @@ -1,8 +0,0 @@ -304d303 -< -308,312d306 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_4vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_4vect_dot_prod_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm index c810008c85..9bdc1a5670 100644 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm @@ -44,8 +44,6 @@ %define arg5 r9 %define tmp r11 - %define tmp.w r11d - %define tmp.b r11b %define tmp2 r10 %define tmp3 r13 ; must be saved and restored %define tmp4 r12 ; must be saved and restored @@ -54,19 +52,22 @@ %define return rax %define PS 8 %define LOG_PS 3 + %define stack_size 4*8 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 - push r12 - push r13 - push r14 - push r15 + sub rsp, stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + mov [rsp + 2*8], r14 + mov [rsp + 3*8], r15 %endmacro %macro FUNC_RESTORE 0 - pop r15 - pop r14 - pop r13 - pop r12 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + mov r14, [rsp + 2*8] + mov r15, [rsp + 3*8] + add rsp, stack_size %endmacro %endif @@ -79,8 +80,6 @@ %define arg4 r12 ; must be saved, loaded and restored %define arg5 r15 ; must be saved and restored %define tmp r11 - %define tmp.w r11d - %define tmp.b r11b %define tmp2 r10 %define tmp3 r13 ; must be saved and restored %define tmp4 r14 ; must be saved and restored @@ -156,8 +155,8 @@ %else ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST - %define XLDR vmovdqa - %define XSTR vmovdqa + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 %else %define XLDR vmovntdqa %define XSTR vmovntdq @@ -191,13 +190,8 @@ default rel section .text align 16 -global gf_4vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION +global gf_4vect_dot_prod_avx512, function 
func(gf_4vect_dot_prod_avx512) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_4vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION -func(_gf_4vect_dot_prod_avx512) -%endif - FUNC_SAVE sub len, 64 jl .return_fail diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.patch deleted file mode 100644 index 6ca011bcc3..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.patch +++ /dev/null @@ -1,6 +0,0 @@ -196,200d195 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_4vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION -< func(_gf_4vect_dot_prod_avx512) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512_gfni.asm new file mode 100644 index 0000000000..9adb83f196 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512_gfni.asm @@ -0,0 +1,253 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_4vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r12 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r14 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define stack_size 3*16 + 7*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + mov [rsp + 3*16 + 0*8], r12 + mov [rsp + 3*16 + 1*8], r13 + mov [rsp + 3*16 + 2*8], r14 + mov [rsp + 3*16 + 3*8], r15 + mov [rsp + 3*16 + 4*8], rdi + mov [rsp + 3*16 + 5*8], rsi + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + mov r12, [rsp + 3*16 + 0*8] + mov r13, [rsp + 3*16 + 1*8] + mov r14, [rsp + 3*16 + 2*8] + mov r15, [rsp + 3*16 + 3*8] + mov rdi, [rsp + 3*16 + 4*8] + mov rsi, [rsp + 3*16 + 5*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define dest3 tmp4 +%define dest4 tmp5 +%define vskip3 tmp6 +%define pos rax + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define xgft1 zmm5 +%define xgft2 zmm6 +%define xgft3 zmm7 +%define xgft4 zmm8 + +%define x0 zmm0 +%define xp1 zmm1 +%define xp2 zmm2 +%define xp3 zmm3 +%define xp4 zmm4 + +default rel +[bits 64] + +section .text + +;; +;; Encodes 64 bytes of all "k" sources into 4x 64 bytes (parity disks) +;; +%macro ENCODE_64B_4 0-1 +%define %%KMASK %1 + + vpxorq xp1, xp1, xp1 + vpxorq xp2, xp2, xp2 + vpxorq xp3, xp3, xp3 + vpxorq xp4, xp4, xp4 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes) +%else + XLDR x0, [ptr + pos] ;Get next source vector (64 bytes) +%endif + add vec_i, 8 + + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + vbroadcastf32x2 xgft3, [tmp + vec*2] + vbroadcastf32x2 xgft4, [tmp + vskip3] + add tmp, 8 + + GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, 
xgft3, xgft3, xp3, \ + xgft4, xgft4, xp4 + + cmp vec_i, vec + jl %%next_vect + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xp2 + vmovdqu8 [dest3 + pos]{%%KMASK}, xp3 + vmovdqu8 [dest4 + pos]{%%KMASK}, xp4 +%else + XSTR [dest1 + pos], xp1 + XSTR [dest2 + pos], xp2 + XSTR [dest3 + pos], xp3 + XSTR [dest4 + pos], xp4 +%endif +%endmacro + +align 16 +global gf_4vect_dot_prod_avx512_gfni, function +func(gf_4vect_dot_prod_avx512_gfni) + FUNC_SAVE + + xor pos, pos + mov vskip3, vec + imul vskip3, 8*3 + shl vec, 3 ;vec *= 8. Make vec_i count by 8 + mov dest2, [dest1 + 8] + mov dest3, [dest1 + 2*8] + mov dest4, [dest1 + 3*8] + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 + +.loop64: + + ENCODE_64B_4 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_4 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm index 8a486bf7b5..aadab4b1e4 100644 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm @@ -54,7 +54,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -159,7 +159,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define var(x) [ebp - PS - PS*x] @@ -294,13 +294,8 @@ section .text %define xp4 xmm5 %endif align 16 -global gf_4vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION +global gf_4vect_dot_prod_sse, function func(gf_4vect_dot_prod_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_4vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_4vect_dot_prod_sse) -%endif - FUNC_SAVE SLDR len, len_m sub len, 16 @@ -443,6 +438,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_4vect_dot_prod_sse, 00, 06, 0064 diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.patch deleted file mode 100644 index 78b6abbe4b..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -299,303d298 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_4vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_4vect_dot_prod_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.c deleted file mode 100644 index eb6bc986ab..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.c +++ /dev/null @@ -1,695 +0,0 @@ -/********************************************************************** - Copyright(c) 2011-2015 Intel Corporation All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. 
- * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> // for memset, memcmp -#include "erasure_code.h" -// #include "types.h" - -#ifndef FUNCTION_UNDER_TEST -# define FUNCTION_UNDER_TEST gf_4vect_dot_prod_sse -#endif -#ifndef TEST_MIN_SIZE -# define TEST_MIN_SIZE 16 -#endif - -#define str(s) #s -#define xstr(s) str(s) - -#define TEST_LEN 2048 -#define TEST_SIZE (TEST_LEN/2) -#define TEST_MEM TEST_SIZE -#define TEST_LOOPS 1000 -#define TEST_TYPE_STR "" - -#ifndef TEST_SOURCES -# define TEST_SOURCES 16 -#endif -#ifndef RANDOMS -# define RANDOMS 20 -#endif - -#ifdef EC_ALIGNED_ADDR -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 0 -# define LEN_ALIGN_CHK_B 0 // 0 for aligned only -#else -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 32 -# define LEN_ALIGN_CHK_B 32 // 0 for aligned only -#endif - -typedef unsigned char u8; - -extern void FUNCTION_UNDER_TEST(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - -void dump(unsigned char *buf, int len) -{ - int i; - for (i = 0; i < len;) { - printf(" %2x", 0xff & buf[i++]); - if (i % 32 == 0) - printf("\n"); - } - printf("\n"); -} - -void dump_matrix(unsigned char **s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", s[i][j]); - } - printf("\n"); - } - printf("\n"); -} - -void dump_u8xu8(unsigned char *s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", 0xff & s[j + (i * m)]); - } - printf("\n"); - } - printf("\n"); -} - -int main(int argc, char *argv[]) -{ - int i, j, rtest, srcs; - void *buf; - u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES]; - u8 g4[TEST_SOURCES], g_tbls[4 * TEST_SOURCES * 32], *buffs[TEST_SOURCES]; - u8 *dest1, *dest2, *dest3, *dest4, *dest_ref1, *dest_ref2, *dest_ref3; - u8 *dest_ref4, *dest_ptrs[4]; - - int align, size; - unsigned char *efence_buffs[TEST_SOURCES]; - unsigned int offset; - u8 *ubuffs[TEST_SOURCES]; - u8 *udest_ptrs[4]; - printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN); - - // Allocate the arrays - for (i = 0; i < TEST_SOURCES; i++) { - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - buffs[i] = buf; - } - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return 
-1; - } - dest2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest3 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest4 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref3 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref4 = buf; - - dest_ptrs[0] = dest1; - dest_ptrs[1] = dest2; - dest_ptrs[2] = dest3; - dest_ptrs[3] = dest4; - - // Test of all zeros - for (i = 0; i < TEST_SOURCES; i++) - memset(buffs[i], 0, TEST_LEN); - - memset(dest1, 0, TEST_LEN); - memset(dest2, 0, TEST_LEN); - memset(dest3, 0, TEST_LEN); - memset(dest4, 0, TEST_LEN); - memset(dest_ref1, 0, TEST_LEN); - memset(dest_ref2, 0, TEST_LEN); - memset(dest_ref3, 0, TEST_LEN); - memset(dest_ref4, 0, TEST_LEN); - memset(g1, 2, TEST_SOURCES); - memset(g2, 1, TEST_SOURCES); - memset(g3, 7, TEST_SOURCES); - memset(g4, 3, TEST_SOURCES); - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs, - dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs, - dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs, - dest_ref4); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - - putchar('.'); - - // Rand data test - - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]); - 
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - buffs, dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], - buffs, dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], - buffs, dest_ref4); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - - putchar('.'); - } - - // Rand data test with varied parameters - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (srcs = TEST_SOURCES; srcs > 0; srcs--) { - for (i = 0; i < srcs; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs, - dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs, - dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs, - dest_ref4); - - FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test1 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test2 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test3 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test4 srcs=%d\n", srcs); - dump_matrix(buffs, 
5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - - putchar('.'); - } - } - - // Run tests at end of buffer for Electric Fence - align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32; - for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end - efence_buffs[i] = buffs[i] + TEST_LEN - size; - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - efence_buffs, dest_ref2); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], - efence_buffs, dest_ref3); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], - efence_buffs, dest_ref4); - - FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, align); - printf("dprod_dut:"); - dump(dest1, align); - return -1; - } - - if (0 != memcmp(dest_ref2, dest2, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, align); - printf("dprod_dut:"); - dump(dest2, align); - return -1; - } - - if (0 != memcmp(dest_ref3, dest3, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, align); - printf("dprod_dut:"); - dump(dest3, align); - return -1; - } - - if (0 != memcmp(dest_ref4, dest4, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, align); - printf("dprod_dut:"); - dump(dest4, align); - return -1; - } - - putchar('.'); - } - - // Test rand ptr alignment if available - - for (rtest = 0; rtest < RANDOMS; rtest++) { - size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1); - srcs = rand() % TEST_SOURCES; - if (srcs == 0) - continue; - - offset = (PTR_ALIGN_CHK_B != 0) ? 
1 : PTR_ALIGN_CHK_B; - // Add random offsets - for (i = 0; i < srcs; i++) - ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset)); - - udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset)); - - memset(dest1, 0, TEST_LEN); // zero pad to check write-over - memset(dest2, 0, TEST_LEN); - memset(dest3, 0, TEST_LEN); - memset(dest4, 0, TEST_LEN); - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - ubuffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2); - gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3); - gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs); - - if (memcmp(dest_ref1, udest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(udest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, udest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(udest_ptrs[1], 25); - return -1; - } - if (memcmp(dest_ref3, udest_ptrs[2], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(udest_ptrs[2], 25); - return -1; - } - if (memcmp(dest_ref4, udest_ptrs[3], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(udest_ptrs[3], 25); - return -1; - } - // Confirm that padding around dests is unchanged - memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff - offset = udest_ptrs[0] - dest1; - - if (memcmp(dest1, dest_ref1, offset)) { - printf("Fail rand ualign pad1 start\n"); - return -1; - } - if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad1 end\n"); - printf("size=%d offset=%d srcs=%d\n", size, offset, srcs); - return -1; - } - - offset = udest_ptrs[1] - dest2; - if (memcmp(dest2, dest_ref1, offset)) { - printf("Fail rand ualign pad2 start\n"); - return -1; - } - if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad2 end\n"); - return -1; - } - - offset = udest_ptrs[2] - dest3; - if (memcmp(dest3, dest_ref1, offset)) { - printf("Fail rand ualign pad3 start\n"); - return -1; - } - if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad3 end\n"); - return -1; - } - - offset = udest_ptrs[3] - 
dest4; - if (memcmp(dest4, dest_ref1, offset)) { - printf("Fail rand ualign pad4 start\n"); - return -1; - } - if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad4 end\n"); - return -1; - } - - putchar('.'); - } - - // Test all size alignment - align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32; - - for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) { - srcs = TEST_SOURCES; - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2); - gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3); - gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs); - - if (memcmp(dest_ref1, dest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, dest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest_ptrs[1], 25); - return -1; - } - if (memcmp(dest_ref3, dest_ptrs[2], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest_ptrs[2], 25); - return -1; - } - if (memcmp(dest_ref4, dest_ptrs[3], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest_ptrs[3], 25); - return -1; - } - } - - printf("Pass\n"); - return 0; - -} diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.patch deleted file mode 100644 index 21bbfaa667..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm index 2d351663c3..870bc1cdaf 100644 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm @@ -103,7 +103,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 %endmacro @@ -169,13 +169,8 @@ section .text %define xd4 xtmpl1 align 16 -global gf_4vect_mad_avx:ISAL_SYM_TYPE_FUNCTION +global gf_4vect_mad_avx, function func(gf_4vect_mad_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_4vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_4vect_mad_avx) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -336,6 +331,3 @@ align 16 
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f constip16: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 - -;;; func core, ver, snum -slversion gf_4vect_mad_avx, 02, 01, 020a diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.patch deleted file mode 100644 index 5b3ad1f1a9..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -174,178d173 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_4vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_4vect_mad_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm index 9ec431ff27..4ec710ddac 100644 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm @@ -101,7 +101,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -165,13 +165,8 @@ section .text %define xd4 ymm10 align 16 -global gf_4vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_4vect_mad_avx2, function func(gf_4vect_mad_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_4vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_4vect_mad_avx2) -%endif - FUNC_SAVE sub len, 32 jl .return_fail @@ -342,6 +337,3 @@ align 32 constip32: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 dq 0xe8e9eaebecedeeef, 0xe0e1e2e3e4e5e6e7 - -;;; func core, ver, snum -slversion gf_4vect_mad_avx2, 04, 01, 020b diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.patch deleted file mode 100644 index e0518326ce..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -170,174d169 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_4vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_4vect_mad_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2_gfni.asm new file mode 100644 index 0000000000..63efd4decc --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2_gfni.asm @@ -0,0 +1,239 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_4vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" +%include "memcpy.asm" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + push r12 + %endmacro + %macro FUNC_RESTORE 0 + pop r12 + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r13 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r14 + %define stack_size 16*7 + 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + mov [rsp + 7*16 + 0*8], r12 + mov [rsp + 7*16 + 1*8], r13 + mov [rsp + 7*16 + 2*8], r14 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + mov r12, [rsp + 7*16 + 0*8] + mov r13, [rsp + 7*16 + 1*8] + mov r14, [rsp + 7*16 + 2*8] + add rsp, stack_size + %endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 mul_array +%define dest3 vec_i +%define dest4 tmp3 + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 ymm0 +%define xd1 ymm1 +%define xd2 ymm2 +%define xd3 ymm3 +%define xd4 ymm4 +%define xgft1 ymm5 +%define xgft2 ymm6 +%define xgft3 ymm7 +%define xgft4 ymm8 +%define xret1 ymm9 +%define xret2 ymm10 +%define xret3 ymm11 +%define xret4 ymm12 + +;; +;; Encodes 32 bytes of a single source into 4x 32 bytes (parity disks) +;; +%macro ENCODE_32B_4 0 + ;; get next source vector + XLDR x0, [src + pos] + ;; get next dest vectors + XLDR xd1, [dest1 + pos] + XLDR xd2, [dest2 + pos] + XLDR xd3, [dest3 + pos] + XLDR xd4, [dest4 + pos] + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, \ + xgft3, xret3, xd3, xgft4, xret4, xd4 + + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 + XSTR [dest3 + pos], xd3 + XSTR [dest4 + pos], xd4 
+%endmacro + +;; +;; Encodes less than 32 bytes of a single source into 4x parity disks +;; +%macro ENCODE_LT_32B_4 1 +%define %%LEN %1 + ;; get next source vector + simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2 + ;; get next dest vectors + simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd2, dest2 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd3, dest3 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd4, dest4 + pos, %%LEN, tmp, tmp2 + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, \ + xgft3, xret3, xd3, xgft4, xret4, xd4 + + lea dest1, [dest1 + pos] + simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2 + lea dest2, [dest2 + pos] + simd_store_avx2 dest2, xd2, %%LEN, tmp, tmp2 + lea dest3, [dest3 + pos] + simd_store_avx2 dest3, xd3, %%LEN, tmp, tmp2 + lea dest4, [dest4 + pos] + simd_store_avx2 dest4, xd4, %%LEN, tmp, tmp2 +%endmacro + +align 16 +global gf_4vect_mad_avx2_gfni, function +func(gf_4vect_mad_avx2_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, [mul_array + vec_i] + lea tmp2,[vec*3] + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + vbroadcastsd xgft3, [tmp + vec*2] + vbroadcastsd xgft4, [tmp + tmp2] + mov dest2, [dest1 + 8] ; reuse mul_array + mov dest3, [dest1 + 2*8] ; reuse vec_i + mov dest4, [dest1 + 3*8] + mov dest1, [dest1] + + cmp len, 32 + jl .len_lt_32 + +.loop32: + ENCODE_32B_4 ;; loop on 32 bytes at a time + + add pos, 32 + sub len, 32 + cmp len, 32 + jge .loop32 + +.len_lt_32: + cmp len, 0 + jle .exit + + ENCODE_LT_32B_4 len ;; encode final bytes + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm index 7a5866fdf0..c09ec387e7 100644 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm @@ -44,7 +44,7 @@ %define arg5 r9 %define tmp r11 %define return rax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -64,16 +64,16 @@ %macro FUNC_SAVE 0 sub rsp, stack_size - movdqa [rsp+16*0],xmm6 - movdqa [rsp+16*1],xmm7 - movdqa [rsp+16*2],xmm8 - movdqa [rsp+16*3],xmm9 - movdqa [rsp+16*4],xmm10 - movdqa [rsp+16*5],xmm11 - movdqa [rsp+16*6],xmm12 - movdqa [rsp+16*7],xmm13 - movdqa [rsp+16*8],xmm14 - movdqa [rsp+16*9],xmm15 + vmovdqa [rsp+16*0],xmm6 + vmovdqa [rsp+16*1],xmm7 + vmovdqa [rsp+16*2],xmm8 + vmovdqa [rsp+16*3],xmm9 + vmovdqa [rsp+16*4],xmm10 + vmovdqa [rsp+16*5],xmm11 + vmovdqa [rsp+16*6],xmm12 + vmovdqa [rsp+16*7],xmm13 + vmovdqa [rsp+16*8],xmm14 + vmovdqa [rsp+16*9],xmm15 save_reg r12, 10*16 + 0*8 save_reg r15, 10*16 + 1*8 end_prolog @@ -82,16 +82,16 @@ %endmacro %macro FUNC_RESTORE 0 - movdqa xmm6, [rsp+16*0] - movdqa xmm7, [rsp+16*1] - movdqa xmm8, [rsp+16*2] - movdqa xmm9, [rsp+16*3] - movdqa xmm10, [rsp+16*4] - movdqa xmm11, [rsp+16*5] - movdqa xmm12, [rsp+16*6] - movdqa xmm13, [rsp+16*7] - movdqa xmm14, [rsp+16*8] - movdqa xmm15, [rsp+16*9] + vmovdqa xmm6, [rsp+16*0] + vmovdqa xmm7, [rsp+16*1] + vmovdqa xmm8, [rsp+16*2] + vmovdqa xmm9, [rsp+16*3] + vmovdqa xmm10, [rsp+16*4] + vmovdqa xmm11, [rsp+16*5] + vmovdqa xmm12, [rsp+16*6] + vmovdqa xmm13, [rsp+16*7] + vmovdqa xmm14, [rsp+16*8] + vmovdqa xmm15, [rsp+16*9] mov r12, [rsp + 10*16 + 0*8] mov r15, [rsp + 10*16 + 1*8] add rsp, stack_size @@ -117,8 +117,8 @@ %else ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST - %define XLDR vmovdqa - %define 
XSTR vmovdqa + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 %else %define XLDR vmovntdqa %define XSTR vmovntdq @@ -159,13 +159,8 @@ section .text %define xtmpl5 zmm23 align 16 -global gf_4vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION +global gf_4vect_mad_avx512, function func(gf_4vect_mad_avx512) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_4vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION -func(_gf_4vect_mad_avx512) -%endif - FUNC_SAVE sub len, 64 jl .return_fail diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.patch deleted file mode 100644 index 4c2a0d08c0..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.patch +++ /dev/null @@ -1,6 +0,0 @@ -164,168d163 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_4vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION -< func(_gf_4vect_mad_avx512) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512_gfni.asm new file mode 100644 index 0000000000..1a5c4d9804 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512_gfni.asm @@ -0,0 +1,223 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
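The gf_4vect_mad_* files touched above (SSE/AVX/AVX2/AVX512 plus the new GFNI variants) implement the "multiply-and-add" update path: fold one source block into four parity blocks that already hold the other sources' contributions, with the argument order shown in the banner comments, (len, vec, vec_i, mul_array, src, dest). A hedged C sketch of how such a kernel can be driven, written against the gf_4vect_mad_avx2 prototype as it is assumed to appear in erasure_code.h; whether the new GFNI entry points are called directly or only through the dispatcher is not visible in this diff:

	#include <string.h>
	#include "erasure_code.h"

	/* Sketch: starting from zeroed parity, fold the k sources in one at a
	 * time with the multiply-and-add kernel.  The result matches a single
	 * gf_4vect_dot_prod_* call over all k sources; in practice the _mad_
	 * form is what makes incremental updates (one changed source) cheap.
	 * Assumed prototype:
	 *   gf_4vect_mad_avx2(int len, int k, int vec_i,
	 *                     unsigned char *gftbls, unsigned char *src,
	 *                     unsigned char **dest);
	 */
	static void encode_4_parity_by_updates(int len, int k,
					       unsigned char *g_tbls,
					       unsigned char **src,
					       unsigned char **parity)
	{
		int i, j;

		for (j = 0; j < 4; j++)
			memset(parity[j], 0, len);

		for (i = 0; i < k; i++)
			gf_4vect_mad_avx2(len, k, i, g_tbls, src[i], parity);
	}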
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_4vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r13 + %define tmp r11 + %define stack_size 7*16 + 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + mov [rsp + 7*16 + 0*8], r12 + mov [rsp + 7*16 + 1*8], r13 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + mov r12, [rsp + 7*16 + 0*8] + mov r13, [rsp + 7*16 + 1*8] + add rsp, stack_size +%endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 mul_array +%define dest3 vec +%define dest4 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 zmm0 +%define xd1 zmm1 +%define xd2 zmm2 +%define xd3 zmm3 +%define xd4 zmm4 + +%define xgft1 zmm5 +%define xgft2 zmm6 +%define xgft3 zmm7 +%define xgft4 zmm8 + +%define xret1 zmm9 +%define xret2 zmm10 +%define xret3 zmm11 +%define xret4 zmm12 + +;; +;; Encodes 64 bytes of a single source into 4x 64 bytes (parity disks) +;; +%macro ENCODE_64B_4 0-1 +%define %%KMASK %1 + +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector + vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector + vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector + vmovdqu8 xd3{%%KMASK}, [dest3 + pos] ;Get next dest vector + vmovdqu8 xd4{%%KMASK}, [dest4 + pos] ;Get next dest vector +%else + XLDR x0, [src + pos] ;Get next source vector + XLDR xd1, [dest1 + pos] ;Get next dest vector + XLDR xd2, [dest2 + pos] ;Get next dest vector + XLDR xd3, [dest3 + pos] ;Get next dest vector + XLDR xd4, [dest4 + pos] ;Get next dest vector +%endif + + GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3, \ + xgft4, xret4, xd4 + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xd2 + vmovdqu8 [dest3 + pos]{%%KMASK}, xd3 + vmovdqu8 [dest4 + pos]{%%KMASK}, xd4 +%else + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 + XSTR [dest3 + pos], xd3 + XSTR [dest4 + pos], xd4 +%endif +%endmacro + +align 16 +global gf_4vect_mad_avx512_gfni, function +func(gf_4vect_mad_avx512_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, 
[mul_array + vec_i] + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + vbroadcastf32x2 xgft3, [tmp + vec*2] + add tmp, vec + vbroadcastf32x2 xgft4, [tmp + vec*2] + mov dest2, [dest1 + 8] ; reuse mul_array + mov dest3, [dest1 + 2*8] ; reuse vec + mov dest4, [dest1 + 3*8] ; reuse vec_i + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 +.loop64: + ENCODE_64B_4 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_4 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm index 32b6cda183..efbe3836a8 100644 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm @@ -103,7 +103,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 %endmacro @@ -168,13 +168,8 @@ section .text %define xd4 xtmpl1 align 16 -global gf_4vect_mad_sse:ISAL_SYM_TYPE_FUNCTION +global gf_4vect_mad_sse, function func(gf_4vect_mad_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_4vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_4vect_mad_sse) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -342,6 +337,3 @@ mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f constip16: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 - -;;; func core, ver, snum -slversion gf_4vect_mad_sse, 00, 01, 0209 diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.patch deleted file mode 100644 index d8610712d2..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -173,177d172 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_4vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_4vect_mad_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm index 1d8cccf70b..978b4d2720 100644 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm @@ -51,7 +51,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -89,16 +89,16 @@ %define func(x) proc_frame x %macro FUNC_SAVE 0 alloc_stack stack_size - save_xmm128 xmm6, 0*16 - save_xmm128 xmm7, 1*16 - save_xmm128 xmm8, 2*16 - save_xmm128 xmm9, 3*16 - save_xmm128 xmm10, 4*16 - save_xmm128 xmm11, 5*16 - save_xmm128 xmm12, 6*16 - save_xmm128 xmm13, 7*16 - save_xmm128 xmm14, 8*16 - save_xmm128 xmm15, 9*16 + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 save_reg r12, 10*16 + 0*8 save_reg r13, 10*16 + 1*8 save_reg r14, 10*16 + 2*8 @@ -184,13 +184,8 @@ section .text %define xp5 xmm6 align 16 -global gf_5vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION +global gf_5vect_dot_prod_avx, function func(gf_5vect_dot_prod_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_5vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_5vect_dot_prod_avx) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -303,6 
+298,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_5vect_dot_prod_avx, 02, 04, 0194 diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.patch deleted file mode 100644 index e72a2b4857..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -189,193d188 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_5vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_5vect_dot_prod_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm index 0cdfee906e..11fb36b687 100644 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm @@ -53,7 +53,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -189,13 +189,8 @@ section .text %define xp5 ymm6 align 16 -global gf_5vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_5vect_dot_prod_avx2, function func(gf_5vect_dot_prod_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_5vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_5vect_dot_prod_avx2) -%endif - FUNC_SAVE sub len, 32 jl .return_fail @@ -313,8 +308,3 @@ func(_gf_5vect_dot_prod_avx2) ret endproc_frame - -section .data - -;;; func core, ver, snum -slversion gf_5vect_dot_prod_avx2, 04, 04, 0199 diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.patch deleted file mode 100644 index a898e05522..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -194,198d193 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_5vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_5vect_dot_prod_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512.asm new file mode 100644 index 0000000000..e790cb69eb --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512.asm @@ -0,0 +1,334 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_5vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r12 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define tmp7 rbp ; must be saved and restored + %define tmp8 rbx ; must be saved and restored + %define return rax + %define PS 8 + %define LOG_PS 3 + %define stack_size 6*8 + + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + sub rsp, stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + mov [rsp + 2*8], r14 + mov [rsp + 3*8], r15 + mov [rsp + 4*8], rbp + mov [rsp + 5*8], rbx + %endmacro + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + mov r14, [rsp + 2*8] + mov r15, [rsp + 3*8] + mov rbp, [rsp + 4*8] + mov rbx, [rsp + 5*8] + add rsp, stack_size + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r14 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define tmp7 rbp ; must be saved and restored + %define tmp8 rbx ; must be saved and restored + %define return rax + %define PS 8 + %define LOG_PS 3 + %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + PS + PS*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + save_reg rbp, 10*16 + 6*8 + save_reg rbx, 10*16 + 7*8 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + vmovdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 
4*8] + mov rsi, [rsp + 10*16 + 5*8] + mov rbp, [rsp + 10*16 + 6*8] + mov rbx, [rsp + 10*16 + 7*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define dest3 tmp4 +%define dest4 tmp5 +%define vskip3 tmp6 +%define dest5 tmp7 +%define vskip1 tmp8 +%define pos return + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define xmask0f zmm17 +%define xgft1_lo zmm16 +%define xgft1_loy ymm16 +%define xgft1_hi zmm15 +%define xgft2_lo zmm14 +%define xgft2_loy ymm14 +%define xgft2_hi zmm13 +%define xgft3_lo zmm12 +%define xgft3_loy ymm12 +%define xgft3_hi zmm11 +%define xgft4_lo zmm10 +%define xgft4_loy ymm10 +%define xgft4_hi zmm9 +%define xgft5_lo zmm8 +%define xgft5_loy ymm8 +%define xgft5_hi zmm7 + +%define x0 zmm0 +%define xtmpa zmm1 +%define xp1 zmm2 +%define xp2 zmm3 +%define xp3 zmm4 +%define xp4 zmm5 +%define xp5 zmm6 + +default rel +[bits 64] + +section .text + +align 16 +global gf_5vect_dot_prod_avx512, function +func(gf_5vect_dot_prod_avx512) + FUNC_SAVE + sub len, 64 + jl .return_fail + + xor pos, pos + mov tmp, 0x0f + vpbroadcastb xmask0f, tmp ;Construct mask 0x0f0f0f... + mov vskip1, vec + imul vskip1, 32 + mov vskip3, vec + imul vskip3, 96 + sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + mov dest2, [dest1+PS] + mov dest3, [dest1+2*PS] + mov dest4, [dest1+3*PS] + mov dest5, [dest1+4*PS] + mov dest1, [dest1] + +.loop64: + vpxorq xp1, xp1, xp1 + vpxorq xp2, xp2, xp2 + vpxorq xp3, xp3, xp3 + vpxorq xp4, xp4, xp4 + vpxorq xp5, xp5, xp5 + mov tmp, mul_array + xor vec_i, vec_i + +.next_vect: + mov ptr, [src+vec_i] + XLDR x0, [ptr+pos] ;Get next source vector + add vec_i, PS + + vpandq xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpandq x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + vmovdqu8 xgft1_loy, [tmp] ;Load array Ax{00}..{0f}, Ax{00}..{f0} + vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)] ;Load array Bx{00}..{0f}, Bx{00}..{f0} + vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)] ;Load array Cx{00}..{0f}, Cx{00}..{f0} + vmovdqu8 xgft4_loy, [tmp+vskip3] ;Load array Dx{00}..{0f}, Dx{00}..{f0} + vmovdqu8 xgft5_loy, [tmp+vskip1*4] ;Load array Ex{00}..{0f}, Ex{00}..{f0} + add tmp, 32 + + vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55 + vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 + vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55 + vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 + + vpshufb xgft1_hi, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft1_hi, xgft1_hi, xgft1_lo ;GF add high and low partials + vpxorq xp1, xp1, xgft1_hi ;xp1 += partial + + vpshufb xgft2_hi, xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft2_hi, xgft2_hi, xgft2_lo ;GF add high and low partials + vpxorq xp2, xp2, xgft2_hi ;xp2 += partial + + vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55 + vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 + vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55 + vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00 + + vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft3_lo, 
xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft3_hi, xgft3_hi, xgft3_lo ;GF add high and low partials + vpxorq xp3, xp3, xgft3_hi ;xp3 += partial + + vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials + vpxorq xp4, xp4, xgft4_hi ;xp4 += partial + + vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55 + vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00 + + vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials + vpxorq xp5, xp5, xgft5_hi ;xp5 += partial + + cmp vec_i, vec + jl .next_vect + + XSTR [dest1+pos], xp1 + XSTR [dest2+pos], xp2 + XSTR [dest3+pos], xp3 + XSTR [dest4+pos], xp4 + XSTR [dest5+pos], xp5 + + add pos, 64 ;Loop on 64 bytes at a time + cmp pos, len + jle .loop64 + + lea tmp, [len + 64] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, len ;Overlapped offset length-64 + jmp .loop64 ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_gf_5vect_dot_prod_avx512 +no_gf_5vect_dot_prod_avx512: +%endif +%endif ; ifdef HAVE_AS_KNOWS_AVX512 diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512_gfni.asm new file mode 100644 index 0000000000..ebb9052368 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512_gfni.asm @@ -0,0 +1,275 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
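Two different tail strategies appear in the AVX512 code above: gf_5vect_dot_prod_avx512 requires len >= 64 and finishes a non-multiple-of-64 length by re-running one overlapped 64-byte pass at offset len-64, while the GFNI kernels (see the xor/bts/dec/kmovq sequence in gf_4vect_mad_avx512_gfni above, reused by the new GFNI dot-product kernels below) instead build a per-byte write mask for masked vmovdqu8 loads and stores. In C terms the mask computation is just the following sketch, valid only for 0 < remaining < 64, which is the only case the assembly reaches:

	#include <stdint.h>

	/* Equivalent of:  xor tmp,tmp ; bts tmp,len ; dec tmp ; kmovq k1,tmp
	 * -> a 64-bit value whose low `remaining` bits are set, used to mask
	 *    the final partial vmovdqu8 loads/stores. */
	static inline uint64_t tail_byte_mask(unsigned remaining)
	{
		return (UINT64_C(1) << remaining) - 1;
	}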
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_5vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r12 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define tmp7 rbp ; must be saved and restored + + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + push rbp + %endmacro + %macro FUNC_RESTORE 0 + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r14 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define tmp7 rbp ; must be saved and restored + %define stack_size 5*16 + 9*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + mov [rsp + 5*16 + 0*8], r12 + mov [rsp + 5*16 + 1*8], r13 + mov [rsp + 5*16 + 2*8], r14 + mov [rsp + 5*16 + 3*8], r15 + mov [rsp + 5*16 + 4*8], rdi + mov [rsp + 5*16 + 5*8], rsi + mov [rsp + 5*16 + 6*8], rbp + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + mov r12, [rsp + 5*16 + 0*8] + mov r13, [rsp + 5*16 + 1*8] + mov r14, [rsp + 5*16 + 2*8] + mov r15, [rsp + 5*16 + 3*8] + mov rdi, [rsp + 5*16 + 4*8] + mov rsi, [rsp + 5*16 + 5*8] + mov rbp, [rsp + 5*16 + 6*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define dest3 tmp4 +%define dest4 tmp5 +%define vskip3 tmp6 +%define dest5 tmp7 +%define pos rax + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define xgft1 zmm6 +%define xgft2 zmm7 +%define xgft3 zmm8 +%define xgft4 zmm9 +%define xgft5 zmm10 + +%define x0 zmm0 +%define xp1 zmm1 +%define xp2 zmm2 +%define xp3 zmm3 +%define xp4 zmm4 +%define xp5 zmm5 + +default rel +[bits 64] + +section .text + +;; +;; Encodes 64 bytes of all "k" sources into 5x 64 bytes (parity disks) +;; +%macro ENCODE_64B_5 0-1 +%define %%KMASK %1 + + vpxorq xp1, xp1, xp1 + vpxorq xp2, xp2, xp2 + vpxorq xp3, xp3, xp3 + vpxorq xp4, xp4, xp4 + vpxorq xp5, xp5, xp5 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] +%if %0 == 1 + vmovdqu8 
x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes) +%else + XLDR x0, [ptr + pos] ;Get next source vector (64 bytes) +%endif + add vec_i, 8 + + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + vbroadcastf32x2 xgft3, [tmp + vec*2] + vbroadcastf32x2 xgft4, [tmp + vskip3] + vbroadcastf32x2 xgft5, [tmp + vec*4] + add tmp, 8 + + GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \ + xgft4, xgft4, xp4, xgft5, xgft5, xp5 + + cmp vec_i, vec + jl %%next_vect + + mov ptr, [dest1] ;reuse ptr + mov tmp, [dest1 + 5*8] ;reuse tmp + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xp2 + vmovdqu8 [dest3 + pos]{%%KMASK}, xp3 + vmovdqu8 [dest4 + pos]{%%KMASK}, xp4 + vmovdqu8 [dest5 + pos]{%%KMASK}, xp5 +%else + XSTR [dest1 + pos], xp1 + XSTR [dest2 + pos], xp2 + XSTR [dest3 + pos], xp3 + XSTR [dest4 + pos], xp4 + XSTR [dest5 + pos], xp5 +%endif +%endmacro + +align 16 +global gf_5vect_dot_prod_avx512_gfni, function +func(gf_5vect_dot_prod_avx512_gfni) + FUNC_SAVE + + xor pos, pos + mov vskip3, vec + imul vskip3, 8*3 + shl vec, 3 ;vec *= 8. Make vec_i count by 8 + mov dest2, [dest1 + 8] + mov dest3, [dest1 + 2*8] + mov dest4, [dest1 + 3*8] + mov dest5, [dest1 + 4*8] + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 + +.loop64: + + ENCODE_64B_5 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_5 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm index 577875dbb4..b669ac6464 100644 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm @@ -51,7 +51,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -184,13 +184,8 @@ section .text %define xp5 xmm14 align 16 -global gf_5vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION +global gf_5vect_dot_prod_sse, function func(gf_5vect_dot_prod_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_5vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_5vect_dot_prod_sse) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -304,6 +299,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_5vect_dot_prod_sse, 00, 05, 0065 diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.patch deleted file mode 100644 index eaa82dcc5d..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -189,193d188 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_5vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_5vect_dot_prod_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.c deleted file mode 100644 index b1eea664b1..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.c +++ /dev/null @@ -1,805 +0,0 @@ -/********************************************************************** - Copyright(c) 2011-2015 Intel Corporation All rights reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> // for memset, memcmp -#include "erasure_code.h" -// #include "types.h" - -#ifndef FUNCTION_UNDER_TEST -# define FUNCTION_UNDER_TEST gf_5vect_dot_prod_sse -#endif -#ifndef TEST_MIN_SIZE -# define TEST_MIN_SIZE 16 -#endif - -#define str(s) #s -#define xstr(s) str(s) - -#define TEST_LEN 2048 -#define TEST_SIZE (TEST_LEN/2) -#define TEST_MEM TEST_SIZE -#define TEST_LOOPS 1000 -#define TEST_TYPE_STR "" - -#ifndef TEST_SOURCES -# define TEST_SOURCES 16 -#endif -#ifndef RANDOMS -# define RANDOMS 20 -#endif - -#ifdef EC_ALIGNED_ADDR -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 0 -# define LEN_ALIGN_CHK_B 0 // 0 for aligned only -#else -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 32 -# define LEN_ALIGN_CHK_B 32 // 0 for aligned only -#endif - -typedef unsigned char u8; - -void dump(unsigned char *buf, int len) -{ - int i; - for (i = 0; i < len;) { - printf(" %2x", 0xff & buf[i++]); - if (i % 32 == 0) - printf("\n"); - } - printf("\n"); -} - -void dump_matrix(unsigned char **s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", s[i][j]); - } - printf("\n"); - } - printf("\n"); -} - -void dump_u8xu8(unsigned char *s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", 0xff & s[j + (i * m)]); - } - printf("\n"); - } - printf("\n"); -} - -int main(int argc, char *argv[]) -{ - int i, j, rtest, srcs; - void *buf; - u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES]; - u8 g4[TEST_SOURCES], g5[TEST_SOURCES], *g_tbls; - u8 *dest1, *dest2, *dest3, *dest4, *dest5, *buffs[TEST_SOURCES]; - u8 *dest_ref1, *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5; - u8 *dest_ptrs[5]; - - int align, size; - unsigned char *efence_buffs[TEST_SOURCES]; - unsigned int offset; - u8 *ubuffs[TEST_SOURCES]; - u8 *udest_ptrs[5]; - printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", 
TEST_SOURCES, TEST_LEN); - - // Allocate the arrays - for (i = 0; i < TEST_SOURCES; i++) { - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - buffs[i] = buf; - } - - if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) { - printf("alloc error: Fail"); - return -1; - } - g_tbls = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest3 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest4 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest5 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref3 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref4 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref5 = buf; - - dest_ptrs[0] = dest1; - dest_ptrs[1] = dest2; - dest_ptrs[2] = dest3; - dest_ptrs[3] = dest4; - dest_ptrs[4] = dest5; - - // Test of all zeros - for (i = 0; i < TEST_SOURCES; i++) - memset(buffs[i], 0, TEST_LEN); - - memset(dest1, 0, TEST_LEN); - memset(dest2, 0, TEST_LEN); - memset(dest3, 0, TEST_LEN); - memset(dest4, 0, TEST_LEN); - memset(dest5, 0, TEST_LEN); - memset(dest_ref1, 0, TEST_LEN); - memset(dest_ref2, 0, TEST_LEN); - memset(dest_ref3, 0, TEST_LEN); - memset(dest_ref4, 0, TEST_LEN); - memset(dest_ref5, 0, TEST_LEN); - memset(g1, 2, TEST_SOURCES); - memset(g2, 1, TEST_SOURCES); - memset(g3, 7, TEST_SOURCES); - memset(g4, 9, TEST_SOURCES); - memset(g5, 4, TEST_SOURCES); - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs, - dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs, - dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs, - dest_ref4); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs, - dest_ref5); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 
!= memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(dest5, 25); - return -1; - } - putchar('.'); - - // Rand data test - - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - buffs, dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], - buffs, dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], - buffs, dest_ref4); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], - buffs, dest_ref5); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(dest5, 25); - return -1; - } - - putchar('.'); - } - - // Rand data test with varied parameters - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (srcs = TEST_SOURCES; srcs > 0; srcs--) { - for (i = 0; i < srcs; i++) - for (j = 0; j < TEST_LEN; j++) - 
buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs, - dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs, - dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs, - dest_ref4); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs, - dest_ref5); - - FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test1 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test2 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test3 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test4 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test5 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(dest5, 25); - return -1; - } - - putchar('.'); - } - } - - // Run tests at end of buffer for Electric Fence - align = (LEN_ALIGN_CHK_B != 0) ? 
1 : 16; - for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end - efence_buffs[i] = buffs[i] + TEST_LEN - size; - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - efence_buffs, dest_ref2); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], - efence_buffs, dest_ref3); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], - efence_buffs, dest_ref4); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], - efence_buffs, dest_ref5); - - FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, align); - printf("dprod_dut:"); - dump(dest1, align); - return -1; - } - - if (0 != memcmp(dest_ref2, dest2, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, align); - printf("dprod_dut:"); - dump(dest2, align); - return -1; - } - - if (0 != memcmp(dest_ref3, dest3, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, align); - printf("dprod_dut:"); - dump(dest3, align); - return -1; - } - - if (0 != memcmp(dest_ref4, dest4, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, align); - printf("dprod_dut:"); - dump(dest4, align); - return -1; - } - - if (0 != memcmp(dest_ref5, dest5, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, align); - printf("dprod_dut:"); - dump(dest5, align); - return -1; - } - - putchar('.'); - } - - // Test rand ptr alignment if available - - for (rtest = 0; rtest < RANDOMS; rtest++) { - size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1); - srcs = rand() % TEST_SOURCES; - if (srcs == 0) - continue; - - offset = (PTR_ALIGN_CHK_B != 0) ? 
1 : PTR_ALIGN_CHK_B; - // Add random offsets - for (i = 0; i < srcs; i++) - ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset)); - - udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset)); - - memset(dest1, 0, TEST_LEN); // zero pad to check write-over - memset(dest2, 0, TEST_LEN); - memset(dest3, 0, TEST_LEN); - memset(dest4, 0, TEST_LEN); - memset(dest5, 0, TEST_LEN); - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - ubuffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2); - gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3); - gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4); - gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs); - - if (memcmp(dest_ref1, udest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(udest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, udest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(udest_ptrs[1], 25); - return -1; - } - if (memcmp(dest_ref3, udest_ptrs[2], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(udest_ptrs[2], 25); - return -1; - } - if (memcmp(dest_ref4, udest_ptrs[3], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(udest_ptrs[3], 25); - return -1; - } - if (memcmp(dest_ref5, udest_ptrs[4], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(udest_ptrs[4], 25); - return -1; - } - // Confirm that padding around dests is unchanged - memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff - offset = udest_ptrs[0] - dest1; - - if (memcmp(dest1, dest_ref1, offset)) { - printf("Fail rand ualign pad1 start\n"); - return -1; - } - if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad1 end\n"); - return -1; - } - - offset = udest_ptrs[1] - dest2; - if (memcmp(dest2, dest_ref1, offset)) { - printf("Fail rand 
ualign pad2 start\n"); - return -1; - } - if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad2 end\n"); - return -1; - } - - offset = udest_ptrs[2] - dest3; - if (memcmp(dest3, dest_ref1, offset)) { - printf("Fail rand ualign pad3 start\n"); - return -1; - } - if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad3 end\n"); - return -1; - } - - offset = udest_ptrs[3] - dest4; - if (memcmp(dest4, dest_ref1, offset)) { - printf("Fail rand ualign pad4 start\n"); - return -1; - } - if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad4 end\n"); - return -1; - } - - offset = udest_ptrs[4] - dest5; - if (memcmp(dest5, dest_ref1, offset)) { - printf("Fail rand ualign pad5 start\n"); - return -1; - } - if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad5 end\n"); - return -1; - } - - putchar('.'); - } - - // Test all size alignment - align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16; - - for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) { - srcs = TEST_SOURCES; - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2); - gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3); - gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4); - gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs); - - if (memcmp(dest_ref1, dest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest_ptrs[0], 25); - - return -1; - } - if (memcmp(dest_ref2, dest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest_ptrs[1], 25); - return -1; - } - if (memcmp(dest_ref3, dest_ptrs[2], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest_ptrs[2], 25); - return -1; - } - if (memcmp(dest_ref4, dest_ptrs[3], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest_ptrs[3], 25); - return -1; - } - if (memcmp(dest_ref5, dest_ptrs[4], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(dest_ptrs[4], 25); - return 
-1; - } - } - - printf("Pass\n"); - return 0; - -} diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.patch deleted file mode 100644 index 21bbfaa667..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm index 8f38a415a1..e72717328a 100644 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm @@ -107,7 +107,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -178,13 +178,8 @@ section .text align 16 -global gf_5vect_mad_avx:ISAL_SYM_TYPE_FUNCTION +global gf_5vect_mad_avx, function func(gf_5vect_mad_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_5vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_5vect_mad_avx) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -365,6 +360,3 @@ align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f constip16: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 - -;;; func core, ver, snum -slversion gf_5vect_mad_avx, 02, 01, 020d diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.patch deleted file mode 100644 index d1a3e09445..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -183,187d182 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_5vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_5vect_mad_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm index 9029f9287e..927cbcdd1a 100644 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm @@ -103,7 +103,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -166,13 +166,8 @@ section .text %define xd5 ymm9 align 16 -global gf_5vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_5vect_mad_avx2, function func(gf_5vect_mad_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_5vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_5vect_mad_avx2) -%endif - FUNC_SAVE sub len, 32 jl .return_fail @@ -363,6 +358,3 @@ align 32 constip32: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 dq 0xe8e9eaebecedeeef, 0xe0e1e2e3e4e5e6e7 - -;;; func core, ver, snum -slversion gf_5vect_mad_avx2, 04, 01, 020e diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.patch deleted file mode 100644 index 1960386b1c..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -171,175d170 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_5vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_5vect_mad_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2_gfni.asm new file mode 100644 index 0000000000..7ff768528e --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2_gfni.asm @@ -0,0 +1,265 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. 
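A note for orientation on the *_mad kernels touched here: a gf_5vect_mad routine folds a single source block into five parity blocks, i.e. dest_j[i] ^= coef_j * src[i] over GF(2^8) for j = 1..5. The scalar sketch below shows that update with a hypothetical gf_mul8 helper and raw coefficients; the assembly instead consumes the expanded tables built by gf_vect_mul_init, and the 0x11d reduction polynomial is assumed here.

/* GF(2^8) multiply, reducing by x^8 + x^4 + x^3 + x^2 + 1 (0x11d assumed). */
static unsigned char gf_mul8(unsigned char a, unsigned char b)
{
        unsigned char p = 0;
        for (int i = 0; i < 8; i++) {
                if (b & 1)
                        p ^= a;            /* add (XOR) a if this bit of b is set */
                int carry = a & 0x80;
                a <<= 1;
                if (carry)
                        a ^= 0x1d;         /* reduce modulo the field polynomial */
                b >>= 1;
        }
        return p;
}

/* Scalar reference for a 5-destination multiply-accumulate (sketch only). */
static void gf_5vect_mad_ref(int len, const unsigned char coef[5],
                             const unsigned char *src, unsigned char **dest)
{
        for (int j = 0; j < 5; j++)
                for (int i = 0; i < len; i++)
                        dest[j][i] ^= gf_mul8(coef[j], src[i]);
}

The SIMD variants perform the same per-byte update 16, 32 or 64 bytes at a time, depending on the instruction set.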
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_5vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" +%include "memcpy.asm" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 + %define tmp4 r13 + %define func(x) x: endbranch + %define stack_size 2*8 + %macro FUNC_SAVE 0 + sub rsp, stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + %endmacro + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + add rsp, stack_size + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r13 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r14 + %define tmp4 r15 + %define stack_size 16*10 + 5*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 + mov [rsp + 10*16 + 0*8], r12 + mov [rsp + 10*16 + 1*8], r13 + mov [rsp + 10*16 + 2*8], r14 + mov [rsp + 10*16 + 3*8], r15 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + vmovdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 
1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + add rsp, stack_size + %endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 mul_array +%define dest3 vec_i +%define dest4 tmp3 +%define dest5 tmp4 + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 ymm0 +%define xd1 ymm1 +%define xd2 ymm2 +%define xd3 ymm3 +%define xd4 ymm4 +%define xd5 ymm5 +%define xgft1 ymm6 +%define xgft2 ymm7 +%define xgft3 ymm8 +%define xgft4 ymm9 +%define xgft5 ymm10 +%define xret1 ymm11 +%define xret2 ymm12 +%define xret3 ymm13 +%define xret4 ymm14 +%define xret5 ymm15 + +;; +;; Encodes 32 bytes of a single source into 5x 32 bytes (parity disks) +;; +%macro ENCODE_32B_5 0 + ;; get next source vector + XLDR x0, [src + pos] + ;; get next dest vectors + XLDR xd1, [dest1 + pos] + XLDR xd2, [dest2 + pos] + XLDR xd3, [dest3 + pos] + XLDR xd4, [dest4 + pos] + XLDR xd5, [dest5 + pos] + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, \ + xgft3, xret3, xd3, xgft4, xret4, xd4, xgft5, xret5, xd5 + + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 + XSTR [dest3 + pos], xd3 + XSTR [dest4 + pos], xd4 + XSTR [dest5 + pos], xd5 +%endmacro + +;; +;; Encodes less than 32 bytes of a single source into 5x parity disks +;; +%macro ENCODE_LT_32B_5 1 +%define %%LEN %1 + ;; get next source vector + simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2 + ;; get next dest vectors + simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd2, dest2 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd3, dest3 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd4, dest4 + pos, %%LEN, tmp, tmp2 + simd_load_avx2 xd5, dest5 + pos, %%LEN, tmp, tmp2 + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, \ + xgft3, xret3, xd3, xgft4, xret4, xd4, xgft5, xret5, xd5 + + lea dest1, [dest1 + pos] + simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2 + lea dest2, [dest2 + pos] + simd_store_avx2 dest2, xd2, %%LEN, tmp, tmp2 + lea dest3, [dest3 + pos] + simd_store_avx2 dest3, xd3, %%LEN, tmp, tmp2 + lea dest4, [dest4 + pos] + simd_store_avx2 dest4, xd4, %%LEN, tmp, tmp2 + lea dest5, [dest5 + pos] + simd_store_avx2 dest5, xd5, %%LEN, tmp, tmp2 +%endmacro + +align 16 +global gf_5vect_mad_avx2_gfni, function +func(gf_5vect_mad_avx2_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, [mul_array + vec_i] + lea tmp2, [vec*3] + vbroadcastsd xgft1, [tmp] + vbroadcastsd xgft2, [tmp + vec] + vbroadcastsd xgft3, [tmp + vec*2] + vbroadcastsd xgft4, [tmp + tmp2] + vbroadcastsd xgft5, [tmp + vec*4] + mov dest2, [dest1 + 1*8] ; reuse mul_array + mov dest3, [dest1 + 2*8] ; reuse vec_i + mov dest4, [dest1 + 3*8] + mov dest5, [dest1 + 4*8] + mov dest1, [dest1] + + cmp len, 32 + jl .len_lt_32 + +.loop32: + ENCODE_32B_5 ;; loop on 32 bytes at a time + + add pos, 32 + sub len, 32 + cmp len, 32 + jge .loop32 + +.len_lt_32: + cmp len, 0 + jle .exit + + ENCODE_LT_32B_5 len ;; encode final bytes + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512.asm 
new file mode 100644 index 0000000000..26f0964b94 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512.asm @@ -0,0 +1,287 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_5vect_mad_avx512(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define return rax + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define return rax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp+16*0],xmm6 + vmovdqa [rsp+16*1],xmm7 + vmovdqa [rsp+16*2],xmm8 + vmovdqa [rsp+16*3],xmm9 + vmovdqa [rsp+16*4],xmm10 + vmovdqa [rsp+16*5],xmm11 + vmovdqa [rsp+16*6],xmm12 + vmovdqa [rsp+16*7],xmm13 + vmovdqa [rsp+16*8],xmm14 + vmovdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r15, 10*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp+16*0] + vmovdqa xmm7, [rsp+16*1] + vmovdqa xmm8, [rsp+16*2] + vmovdqa xmm9, [rsp+16*3] + vmovdqa xmm10, [rsp+16*4] + vmovdqa xmm11, [rsp+16*5] + vmovdqa xmm12, [rsp+16*6] + vmovdqa xmm13, [rsp+16*7] + vmovdqa xmm14, [rsp+16*8] + vmovdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r15, [rsp + 10*16 + 1*8] + add rsp, stack_size +%endmacro +%endif + +%define PS 8 +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 
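Worth noting while reading the register setup: the coefficient layout differs between the two kernel families, and that is what the vec/vec_i scaling at the top of each function encodes. The table-lookup (pshufb) kernels expect 32 bytes per (source, parity) coefficient as expanded by gf_vect_mul_init, hence the shift-by-5; the GFNI kernels expect one 8-byte affine matrix per coefficient, hence the shift-by-3. A small sketch of the resulting offsets (helper names are illustrative only):

/* Byte offset of the expanded coefficient for parity row r and source i,
 * with 'srcs' sources in total. Matches &g_tbls[32*srcs*r + 32*i] in the
 * unit tests for the pshufb kernels; the GFNI kernels use an 8-byte stride. */
static size_t tbl_off_pshufb(int srcs, int r, int i)
{
        return 32u * (size_t)srcs * (size_t)r + 32u * (size_t)i;
}

static size_t tbl_off_gfni(int srcs, int r, int i)
{
        return 8u * (size_t)srcs * (size_t)r + 8u * (size_t)i;
}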
+%define pos return +%define dest2 tmp2 +%define dest3 mul_array +%define dest4 vec +%define dest5 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 zmm0 +%define xtmpa zmm1 +%define xtmpl1 zmm2 +%define xtmph1 zmm3 +%define xtmph2 zmm4 +%define xtmph3 zmm5 +%define xgft1_hi zmm6 +%define xgft1_lo zmm7 +%define xgft1_loy ymm7 +%define xgft2_hi zmm8 +%define xgft2_lo zmm9 +%define xgft2_loy ymm9 +%define xgft3_hi zmm10 +%define xgft3_lo zmm11 +%define xgft3_loy ymm11 +%define xgft4_hi zmm12 +%define xgft4_lo zmm13 +%define xgft4_loy ymm13 +%define xgft5_hi zmm14 +%define xgft5_lo zmm15 +%define xgft5_loy ymm15 +%define xd1 zmm16 +%define xd2 zmm17 +%define xd3 zmm18 +%define xd4 zmm19 +%define xd5 zmm20 +%define xmask0f zmm21 +%define xtmpl2 zmm22 +%define xtmpl3 zmm23 +%define xtmpl4 zmm24 +%define xtmpl5 zmm25 +%define xtmph4 zmm26 +%define xtmph5 zmm27 + +align 16 +global gf_5vect_mad_avx512, function +func(gf_5vect_mad_avx512) + FUNC_SAVE + sub len, 64 + jl .return_fail + xor pos, pos + mov tmp, 0x0f + vpbroadcastb xmask0f, tmp ;Construct mask 0x0f0f0f... + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 ;Multiply by 32 + lea tmp, [mul_array + vec_i] + vmovdqu xgft1_loy, [tmp] ;Load array Ax{00}..{0f}, Ax{00}..{f0} + vmovdqu xgft2_loy, [tmp+vec] ;Load array Bx{00}..{0f}, Bx{00}..{f0} + vmovdqu xgft3_loy, [tmp+2*vec] ;Load array Cx{00}..{0f}, Cx{00}..{f0} + vmovdqu xgft5_loy, [tmp+4*vec] ;Load array Ex{00}..{0f}, Ex{00}..{f0} + add tmp, vec + vmovdqu xgft4_loy, [tmp+2*vec] ;Load array Dx{00}..{0f}, Dx{00}..{f0} + vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55 + vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 + vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55 + vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 + vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55 + vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 + vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55 + vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00 + vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55 + vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00 + mov dest2, [dest1+PS] + mov dest3, [dest1+2*PS] ; reuse mul_array + mov dest4, [dest1+3*PS] ; reuse vec + mov dest5, [dest1+4*PS] ; reuse vec_i + mov dest1, [dest1] + mov tmp, -1 + kmovq k1, tmp + +.loop64: + XLDR x0, [src+pos] ;Get next source vector + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + XLDR xd3, [dest3+pos] ;Get next dest vector + XLDR xd4, [dest4+pos] ;reuse xtmpl1. 
Get next dest vector + XLDR xd5, [dest5+pos] ;Get next dest vector + + vpandq xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpandq x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1 {k1}{z}, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1 {k1}{z}, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxorq xd1, xd1, xtmph1 ;xd1 += partial + + ; dest2 + vpshufb xtmph2 {k1}{z}, xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2 {k1}{z}, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpxorq xd2, xd2, xtmph2 ;xd2 += partial + + ; dest3 + vpshufb xtmph3 {k1}{z}, xgft3_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3 {k1}{z}, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph3, xtmph3, xtmpl3 ;GF add high and low partials + vpxorq xd3, xd3, xtmph3 ;xd2 += partial + + ; dest4 + vpshufb xtmph4 {k1}{z}, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl4 {k1}{z}, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph4, xtmph4, xtmpl4 ;GF add high and low partials + vpxorq xd4, xd4, xtmph4 ;xd2 += partial + + ; dest5 + vpshufb xtmph5 {k1}{z}, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl5 {k1}{z}, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph5, xtmph5, xtmpl5 ;GF add high and low partials + vpxorq xd5, xd5, xtmph5 ;xd2 += partial + + XSTR [dest1+pos], xd1 + XSTR [dest2+pos], xd2 + XSTR [dest3+pos], xd3 + XSTR [dest4+pos], xd4 + XSTR [dest5+pos], xd5 + + add pos, 64 ;Loop on 64 bytes at a time + cmp pos, len + jle .loop64 + + lea tmp, [len + 64] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, (1 << 63) + lea tmp, [len + 64 - 1] + and tmp, 63 + sarx pos, pos, tmp + kmovq k1, pos + mov pos, len ;Overlapped offset length-64 + jmp .loop64 ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_gf_5vect_mad_avx512 +no_gf_5vect_mad_avx512: +%endif +%endif ; ifdef HAVE_AS_KNOWS_AVX512 diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512_gfni.asm new file mode 100644 index 0000000000..d89ecca970 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512_gfni.asm @@ -0,0 +1,240 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
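The vpshufb loop in gf_5vect_mad_avx512 above is the classic split-nibble lookup: the 32-byte table for a coefficient c holds c*{0x00..0x0f} followed by c*{0x00,0x10,..,0xf0} (see the "Load array Ax{00}..{0f}, Ax{00}..{f0}" comments), so one GF(2^8) product per byte costs two 16-entry lookups and an XOR. A byte-at-a-time C sketch of the same idea, with an illustrative helper name:

/* One GF(2^8) product via the 32-byte split-nibble table for a coefficient:
 * tbl[0..15] holds the low-nibble products, tbl[16..31] the high-nibble ones. */
static unsigned char gf_mul_tbl(const unsigned char tbl[32], unsigned char x)
{
        return tbl[x & 0x0f] ^ tbl[16 + (x >> 4)];
}

In the assembly, vpandq/vpsraw produce the two nibble vectors and a single vpshufb performs the per-byte lookups for a whole register at once.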
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_5vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r13 + %define tmp r11 + %define tmp2 r10 + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + vmovdqa [rsp + 16*7], xmm13 + vmovdqa [rsp + 16*8], xmm14 + vmovdqa [rsp + 16*9], xmm15 + mov [rsp + 10*16 + 0*8], r12 + mov [rsp + 10*16 + 1*8], r13 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + vmovdqa xmm13, [rsp + 16*7] + vmovdqa xmm14, [rsp + 16*8] + vmovdqa xmm15, [rsp + 16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + add rsp, stack_size +%endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 tmp2 +%define dest3 mul_array +%define dest4 vec +%define dest5 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 zmm0 +%define xd1 zmm1 +%define xd2 zmm2 +%define xd3 zmm3 +%define xd4 zmm4 +%define xd5 zmm5 + +%define xgft1 zmm6 +%define xgft2 zmm7 +%define xgft3 zmm8 +%define xgft4 zmm9 +%define xgft5 zmm10 + +%define xret1 zmm11 +%define xret2 zmm12 +%define xret3 zmm13 +%define xret4 zmm14 +%define xret5 zmm15 + +;; +;; Encodes 64 bytes of a single source into 5x 64 bytes (parity disks) +;; +%macro ENCODE_64B_5 0-1 +%define %%KMASK %1 + +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector 
+ vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector + vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector + vmovdqu8 xd3{%%KMASK}, [dest3 + pos] ;Get next dest vector + vmovdqu8 xd4{%%KMASK}, [dest4 + pos] ;Get next dest vector + vmovdqu8 xd5{%%KMASK}, [dest5 + pos] ;Get next dest vector +%else + XLDR x0, [src + pos] ;Get next source vector + XLDR xd1, [dest1 + pos] ;Get next dest vector + XLDR xd2, [dest2 + pos] ;Get next dest vector + XLDR xd3, [dest3 + pos] ;Get next dest vector + XLDR xd4, [dest4 + pos] ;Get next dest vector + XLDR xd5, [dest5 + pos] ;Get next dest vector +%endif + + GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3, \ + xgft4, xret4, xd4, xgft5, xret5, xd5 + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xd2 + vmovdqu8 [dest3 + pos]{%%KMASK}, xd3 + vmovdqu8 [dest4 + pos]{%%KMASK}, xd4 + vmovdqu8 [dest5 + pos]{%%KMASK}, xd5 +%else + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 + XSTR [dest3 + pos], xd3 + XSTR [dest4 + pos], xd4 + XSTR [dest5 + pos], xd5 +%endif +%endmacro +align 16 +global gf_5vect_mad_avx512_gfni, function +func(gf_5vect_mad_avx512_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, [mul_array + vec_i] + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + vbroadcastf32x2 xgft3, [tmp + vec*2] + vbroadcastf32x2 xgft5, [tmp + vec*4] + add tmp, vec + vbroadcastf32x2 xgft4, [tmp + vec*2] + mov dest2, [dest1 + 8] + mov dest3, [dest1 + 2*8] ; reuse mul_array + mov dest4, [dest1 + 3*8] ; reuse vec + mov dest5, [dest1 + 4*8] ; reuse vec_i + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 +.loop64: + ENCODE_64B_5 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_5 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm index 15c96bf4dc..072c2b5632 100644 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm @@ -107,7 +107,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -177,13 +177,8 @@ section .text align 16 -global gf_5vect_mad_sse:ISAL_SYM_TYPE_FUNCTION +global gf_5vect_mad_sse, function func(gf_5vect_mad_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_5vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_5vect_mad_sse) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -373,6 +368,3 @@ mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f constip16: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 - -;;; func core, ver, snum -slversion gf_5vect_mad_sse, 00, 01, 020c diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.patch deleted file mode 100644 index 35d5094ffe..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -182,186d181 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_5vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_5vect_mad_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm index f12798edec..86082e75af 100644 --- 
a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm @@ -51,7 +51,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -89,16 +89,16 @@ %define func(x) proc_frame x %macro FUNC_SAVE 0 alloc_stack stack_size - save_xmm128 xmm6, 0*16 - save_xmm128 xmm7, 1*16 - save_xmm128 xmm8, 2*16 - save_xmm128 xmm9, 3*16 - save_xmm128 xmm10, 4*16 - save_xmm128 xmm11, 5*16 - save_xmm128 xmm12, 6*16 - save_xmm128 xmm13, 7*16 - save_xmm128 xmm14, 8*16 - save_xmm128 xmm15, 9*16 + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 save_reg r12, 10*16 + 0*8 save_reg r13, 10*16 + 1*8 save_reg r14, 10*16 + 2*8 @@ -182,13 +182,8 @@ section .text %define xp6 xmm7 align 16 -global gf_6vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION +global gf_6vect_dot_prod_avx, function func(gf_6vect_dot_prod_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_6vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_6vect_dot_prod_avx) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -315,6 +310,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_6vect_dot_prod_avx, 02, 04, 0195 diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.patch deleted file mode 100644 index 42b790fcab..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -187,191d186 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_6vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_6vect_dot_prod_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm index d5b2543225..ee2d92665e 100644 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm @@ -53,7 +53,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -187,13 +187,8 @@ section .text %define xp6 ymm7 align 16 -global gf_6vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_6vect_dot_prod_avx2, function func(gf_6vect_dot_prod_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_6vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_6vect_dot_prod_avx2) -%endif - FUNC_SAVE sub len, 32 jl .return_fail @@ -324,8 +319,3 @@ func(_gf_6vect_dot_prod_avx2) ret endproc_frame - -section .data - -;;; func core, ver, snum -slversion gf_6vect_dot_prod_avx2, 04, 04, 019a diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.patch deleted file mode 100644 index 531cd8cdda..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -192,196d191 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_6vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_6vect_dot_prod_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512.asm new file mode 100644 index 0000000000..531dce90d1 --- /dev/null +++ 
b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512.asm @@ -0,0 +1,353 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_6vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r12 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define tmp7 rbp ; must be saved and restored + %define tmp8 rbx ; must be saved and restored + %define return rax + %define PS 8 + %define LOG_PS 3 + %define stack_size 6*8 + + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + sub rsp, stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + mov [rsp + 2*8], r14 + mov [rsp + 3*8], r15 + mov [rsp + 4*8], rbp + mov [rsp + 5*8], rbx + %endmacro + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + mov r14, [rsp + 2*8] + mov r15, [rsp + 3*8] + mov rbp, [rsp + 4*8] + mov rbx, [rsp + 5*8] + add rsp, stack_size + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r14 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define tmp7 rbp ; must be saved and restored + %define tmp8 rbx ; must be saved and restored + %define return rax + %define PS 8 + %define LOG_PS 3 + %define stack_size 10*16 + 9*8 ; must be an odd 
multiple of 8 + %define arg(x) [rsp + stack_size + PS + PS*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + save_reg rbp, 10*16 + 6*8 + save_reg rbx, 10*16 + 7*8 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + vmovdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + mov rbp, [rsp + 10*16 + 6*8] + mov rbx, [rsp + 10*16 + 7*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define dest3 tmp4 +%define dest4 tmp5 +%define vskip3 tmp6 +%define dest5 tmp7 +%define vskip1 tmp8 +%define pos return + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define xmask0f zmm20 +%define xgft1_lo zmm19 +%define xgft1_loy ymm19 +%define xgft1_hi zmm18 +%define xgft2_lo zmm17 +%define xgft2_loy ymm17 +%define xgft2_hi zmm16 +%define xgft3_lo zmm15 +%define xgft3_loy ymm15 +%define xgft3_hi zmm14 +%define xgft4_lo zmm13 +%define xgft4_loy ymm13 +%define xgft4_hi zmm12 +%define xgft5_lo zmm11 +%define xgft5_loy ymm11 +%define xgft5_hi zmm10 +%define xgft6_lo zmm9 +%define xgft6_loy ymm9 +%define xgft6_hi zmm8 + +%define x0 zmm0 +%define xtmpa zmm1 +%define xp1 zmm2 +%define xp2 zmm3 +%define xp3 zmm4 +%define xp4 zmm5 +%define xp5 zmm6 +%define xp6 zmm7 + +default rel +[bits 64] + +section .text + +align 16 +global gf_6vect_dot_prod_avx512, function +func(gf_6vect_dot_prod_avx512) + FUNC_SAVE + sub len, 64 + jl .return_fail + + xor pos, pos + mov tmp, 0x0f + vpbroadcastb xmask0f, tmp ;Construct mask 0x0f0f0f... + mov vskip1, vec + imul vskip1, 32 + mov vskip3, vec + imul vskip3, 96 + sal vec, LOG_PS ;vec *= PS. 
Make vec_i count by PS + mov dest2, [dest1+PS] + mov dest3, [dest1+2*PS] + mov dest4, [dest1+3*PS] + mov dest5, [dest1+4*PS] + +.loop64: + vpxorq xp1, xp1, xp1 + vpxorq xp2, xp2, xp2 + vpxorq xp3, xp3, xp3 + vpxorq xp4, xp4, xp4 + vpxorq xp5, xp5, xp5 + vpxorq xp6, xp6, xp6 + mov tmp, mul_array + xor vec_i, vec_i + +.next_vect: + mov ptr, [src+vec_i] + XLDR x0, [ptr+pos] ;Get next source vector + add vec_i, PS + + vpandq xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpandq x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + vmovdqu8 xgft1_loy, [tmp] ;Load array Ax{00}..{0f}, Ax{00}..{f0} + vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)] ;Load array Bx{00}..{0f}, Bx{00}..{f0} + vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)] ;Load array Cx{00}..{0f}, Cx{00}..{f0} + vmovdqu8 xgft4_loy, [tmp+vskip3] ;Load array Dx{00}..{0f}, Dx{00}..{f0} + vmovdqu8 xgft5_loy, [tmp+vskip1*4] ;Load array Ex{00}..{0f}, Ex{00}..{f0} + lea ptr, [vskip1 + vskip1*4] ;ptr = vskip5 + vmovdqu8 xgft6_loy, [tmp+ptr] ;Load array Fx{00}..{0f}, Fx{00}..{f0} + add tmp, 32 + + vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55 + vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 + vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55 + vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 + + vpshufb xgft1_hi, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft1_hi, xgft1_hi, xgft1_lo ;GF add high and low partials + vpxorq xp1, xp1, xgft1_hi ;xp1 += partial + + vpshufb xgft2_hi, xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft2_hi, xgft2_hi, xgft2_lo ;GF add high and low partials + vpxorq xp2, xp2, xgft2_hi ;xp2 += partial + + vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55 + vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 + vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55 + vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00 + + vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft3_hi, xgft3_hi, xgft3_lo ;GF add high and low partials + vpxorq xp3, xp3, xgft3_hi ;xp3 += partial + + vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials + vpxorq xp4, xp4, xgft4_hi ;xp4 += partial + + vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55 + vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00 + + vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials + vpxorq xp5, xp5, xgft5_hi ;xp5 += partial + + vshufi64x2 xgft6_hi, xgft6_lo, xgft6_lo, 0x55 + vshufi64x2 xgft6_lo, xgft6_lo, xgft6_lo, 0x00 + + vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials + vpxorq xp6, xp6, xgft6_hi ;x6 += partial + + cmp vec_i, vec + jl .next_vect + + mov ptr, [dest1] ;reuse ptr + mov tmp, [dest1+5*PS] ;reuse tmp + + XSTR [dest2+pos], xp2 + XSTR [dest3+pos], xp3 + XSTR [dest4+pos], xp4 + XSTR [dest5+pos], xp5 + + XSTR [ptr+pos], xp1 + XSTR [tmp+pos], xp6 + + add pos, 64 ;Loop on 64 bytes at a time + cmp pos, len + jle .loop64 + + lea tmp, [len + 64] + cmp pos, tmp + je .return_pass + + 
;; Tail len + mov pos, len ;Overlapped offset length-64 + jmp .loop64 ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_gf_6vect_dot_prod_avx512 +no_gf_6vect_dot_prod_avx512: +%endif +%endif ; ifdef HAVE_AS_KNOWS_AVX512 diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512_gfni.asm new file mode 100644 index 0000000000..b4b8c654bb --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512_gfni.asm @@ -0,0 +1,292 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_6vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r12 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define tmp7 rbp ; must be saved and restored + %define tmp8 rbx ; must be saved and restored + + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + push rbp + push rbx + %endmacro + %macro FUNC_RESTORE 0 + pop rbx + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define tmp4 r14 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define tmp7 rbp ; must be saved and restored + %define tmp8 rbx ; must be saved and restored + %define stack_size 7*16 + 9*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + mov [rsp + 7*16 + 0*8], r12 + mov [rsp + 7*16 + 1*8], r13 + mov [rsp + 7*16 + 2*8], r14 + mov [rsp + 7*16 + 3*8], r15 + mov [rsp + 7*16 + 4*8], rdi + mov [rsp + 7*16 + 5*8], rsi + mov [rsp + 7*16 + 6*8], rbp + mov [rsp + 7*16 + 7*8], rbx + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + mov r12, [rsp + 7*16 + 0*8] + mov r13, [rsp + 7*16 + 1*8] + mov r14, [rsp + 7*16 + 2*8] + mov r15, [rsp + 7*16 + 3*8] + mov rdi, [rsp + 7*16 + 4*8] + mov rsi, [rsp + 7*16 + 5*8] + mov rbp, [rsp + 7*16 + 6*8] + mov rbx, [rsp + 7*16 + 7*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define dest2 tmp3 +%define dest3 tmp4 +%define dest4 tmp5 +%define vskip3 tmp6 +%define dest5 tmp7 +%define vskip5 tmp8 +%define pos rax + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define xgft1 zmm7 +%define xgft2 zmm8 +%define xgft3 zmm9 +%define xgft4 zmm10 +%define xgft5 zmm11 +%define xgft6 zmm12 + +%define x0 zmm0 +%define xp1 zmm1 +%define xp2 zmm2 +%define xp3 zmm3 +%define xp4 zmm4 +%define xp5 zmm5 +%define xp6 zmm6 + +default rel +[bits 64] + +section .text + 
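Before the macro below: a 6vect dot product produces six parity blocks in one pass, each output byte being the GF(2^8) sum over all sources of a per-(row, source) coefficient times the corresponding source byte — the same result gf_vect_dot_prod_base computes one destination at a time in the unit tests. A scalar sketch, reusing the gf_mul8 helper from the earlier note and raw coefficients in place of the expanded tables:

/* Scalar reference for a 6-destination GF(2^8) dot product (sketch only).
 * coef is row-major: coef[r*srcs + k] scales source k into parity row r.
 * gf_mul8() is the bitwise GF(2^8) multiply sketched earlier. */
static void gf_6vect_dot_prod_ref(int len, int srcs, const unsigned char *coef,
                                  unsigned char **src, unsigned char **dest)
{
        for (int r = 0; r < 6; r++) {
                for (int i = 0; i < len; i++) {
                        unsigned char p = 0;
                        for (int k = 0; k < srcs; k++)
                                p ^= gf_mul8(coef[r * srcs + k], src[k][i]);
                        dest[r][i] = p;
                }
        }
}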
+;; +;; Encodes 64 bytes of all "k" sources into 6x 64 bytes (parity disks) +;; +%macro ENCODE_64B_6 0-1 +%define %%KMASK %1 + + vpxorq xp1, xp1, xp1 + vpxorq xp2, xp2, xp2 + vpxorq xp3, xp3, xp3 + vpxorq xp4, xp4, xp4 + vpxorq xp5, xp5, xp5 + vpxorq xp6, xp6, xp6 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes) +%else + XLDR x0, [ptr + pos] ;Get next source vector (64 bytes) +%endif + add vec_i, 8 + + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + vbroadcastf32x2 xgft3, [tmp + vec*2] + vbroadcastf32x2 xgft4, [tmp + vskip3] + vbroadcastf32x2 xgft5, [tmp + vec*4] + vbroadcastf32x2 xgft6, [tmp + vskip5] + add tmp, 8 + + GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \ + xgft4, xgft4, xp4, xgft5, xgft5, xp5, xgft6, xgft6, xp6 + + cmp vec_i, vec + jl %%next_vect + + mov ptr, [dest1] ;reuse ptr + mov tmp, [dest1 + 5*8] ;reuse tmp + +%if %0 == 1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xp2 + vmovdqu8 [dest3 + pos]{%%KMASK}, xp3 + vmovdqu8 [dest4 + pos]{%%KMASK}, xp4 + vmovdqu8 [dest5 + pos]{%%KMASK}, xp5 + vmovdqu8 [ptr + pos]{%%KMASK}, xp1 ; dest 1 + vmovdqu8 [tmp + pos]{%%KMASK}, xp6 ; dest 6 +%else + XSTR [dest2 + pos], xp2 + XSTR [dest3 + pos], xp3 + XSTR [dest4 + pos], xp4 + XSTR [dest5 + pos], xp5 + XSTR [ptr + pos], xp1 ; dest 1 + XSTR [tmp + pos], xp6 ; dest 6 +%endif +%endmacro + +align 16 +global gf_6vect_dot_prod_avx512_gfni, function +func(gf_6vect_dot_prod_avx512_gfni) + FUNC_SAVE + + xor pos, pos + mov vskip3, vec + imul vskip3, 3*8 + mov vskip5, vec + imul vskip5, 5*8 + shl vec, 3 ;vec *= 8. Make vec_i count by 8 + mov dest2, [dest1 + 8] + mov dest3, [dest1 + 2*8] + mov dest4, [dest1 + 3*8] + mov dest5, [dest1 + 4*8] ;dest1 and dest6 are calculated later + + cmp len, 64 + jl .len_lt_64 + +.loop64: + + ENCODE_64B_6 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_6 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm index 5dea0be18e..b877411da8 100644 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm @@ -51,7 +51,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -182,13 +182,8 @@ section .text %define xp6 xmm13 align 16 -global gf_6vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION +global gf_6vect_dot_prod_sse, function func(gf_6vect_dot_prod_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_6vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_6vect_dot_prod_sse) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -315,6 +310,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_6vect_dot_prod_sse, 00, 05, 0066 diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.patch deleted file mode 100644 index 1255245edf..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -187,191d186 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global 
_gf_6vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_6vect_dot_prod_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.c deleted file mode 100644 index f0075a00e8..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.c +++ /dev/null @@ -1,911 +0,0 @@ -/********************************************************************** - Copyright(c) 2011-2015 Intel Corporation All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> // for memset, memcmp -#include "erasure_code.h" -// #include "types.h" - -#ifndef FUNCTION_UNDER_TEST -# define FUNCTION_UNDER_TEST gf_6vect_dot_prod_sse -#endif -#ifndef TEST_MIN_SIZE -# define TEST_MIN_SIZE 16 -#endif - -#define str(s) #s -#define xstr(s) str(s) - -#define TEST_LEN 2048 -#define TEST_SIZE (TEST_LEN/2) -#define TEST_MEM TEST_SIZE -#define TEST_LOOPS 1000 -#define TEST_TYPE_STR "" - -#ifndef TEST_SOURCES -# define TEST_SOURCES 16 -#endif -#ifndef RANDOMS -# define RANDOMS 20 -#endif - -#ifdef EC_ALIGNED_ADDR -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 0 -# define LEN_ALIGN_CHK_B 0 // 0 for aligned only -#else -// Define power of 2 range to check ptr, len alignment -# define PTR_ALIGN_CHK_B 32 -# define LEN_ALIGN_CHK_B 32 // 0 for aligned only -#endif - -typedef unsigned char u8; - -void dump(unsigned char *buf, int len) -{ - int i; - for (i = 0; i < len;) { - printf(" %2x", 0xff & buf[i++]); - if (i % 32 == 0) - printf("\n"); - } - printf("\n"); -} - -void dump_matrix(unsigned char **s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", s[i][j]); - } - printf("\n"); - } - printf("\n"); -} - -void dump_u8xu8(unsigned char *s, int k, int m) -{ - int i, j; - for (i = 0; i < k; i++) { - for (j = 0; j < m; j++) { - printf(" %2x", 0xff & s[j + (i * m)]); - } - printf("\n"); - } - printf("\n"); -} - -int main(int argc, char *argv[]) -{ - int i, j, rtest, srcs; - void *buf; - u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES]; - u8 g4[TEST_SOURCES], g5[TEST_SOURCES], g6[TEST_SOURCES], *g_tbls; - u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest6, *dest_ref1; - u8 *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5, *dest_ref6; - u8 *dest_ptrs[6], *buffs[TEST_SOURCES]; - - int align, size; - unsigned char *efence_buffs[TEST_SOURCES]; - unsigned int offset; - u8 *ubuffs[TEST_SOURCES]; - u8 *udest_ptrs[6]; - printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN); - - // Allocate the arrays - for (i = 0; i < TEST_SOURCES; i++) { - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - buffs[i] = buf; - } - - if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) { - printf("alloc error: Fail"); - return -1; - } - g_tbls = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest3 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest4 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest5 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest6 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref1 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref2 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref3 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref4 = 
buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref5 = buf; - - if (posix_memalign(&buf, 64, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; - } - dest_ref6 = buf; - - dest_ptrs[0] = dest1; - dest_ptrs[1] = dest2; - dest_ptrs[2] = dest3; - dest_ptrs[3] = dest4; - dest_ptrs[4] = dest5; - dest_ptrs[5] = dest6; - - // Test of all zeros - for (i = 0; i < TEST_SOURCES; i++) - memset(buffs[i], 0, TEST_LEN); - - memset(dest1, 0, TEST_LEN); - memset(dest2, 0, TEST_LEN); - memset(dest3, 0, TEST_LEN); - memset(dest4, 0, TEST_LEN); - memset(dest5, 0, TEST_LEN); - memset(dest6, 0, TEST_LEN); - memset(dest_ref1, 0, TEST_LEN); - memset(dest_ref2, 0, TEST_LEN); - memset(dest_ref3, 0, TEST_LEN); - memset(dest_ref4, 0, TEST_LEN); - memset(dest_ref5, 0, TEST_LEN); - memset(dest_ref6, 0, TEST_LEN); - memset(g1, 2, TEST_SOURCES); - memset(g2, 1, TEST_SOURCES); - memset(g3, 7, TEST_SOURCES); - memset(g4, 9, TEST_SOURCES); - memset(g5, 4, TEST_SOURCES); - memset(g6, 0xe6, TEST_SOURCES); - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]); - gf_vect_mul_init(g6[i], &g_tbls[160 * TEST_SOURCES + i * 32]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs, - dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs, - dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs, - dest_ref4); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs, - dest_ref5); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], buffs, - dest_ref6); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(dest5, 25); - return -1; - } - if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) { - printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test6\n"); - dump_matrix(buffs, 5, TEST_SOURCES); - 
printf("dprod_base:"); - dump(dest_ref6, 25); - printf("dprod_dut:"); - dump(dest6, 25); - return -1; - } - putchar('.'); - - // Rand data test - - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - g6[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - buffs, dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], - buffs, dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], - buffs, dest_ref4); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], - buffs, dest_ref5); - gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], - buffs, dest_ref6); - - FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(dest5, 25); - return -1; - } - if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref6, 25); - printf("dprod_dut:"); - dump(dest6, 25); - return -1; - } - - putchar('.'); - } - - // Rand data test with varied parameters - for (rtest = 0; rtest < RANDOMS; rtest++) { - for (srcs = TEST_SOURCES; srcs > 0; srcs--) { - for (i = 0; i < srcs; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - g6[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - 
gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]); - gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs, - dest_ref2); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs, - dest_ref3); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs, - dest_ref4); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs, - dest_ref5); - gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[160 * srcs], buffs, - dest_ref6); - - FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test1 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest1, 25); - return -1; - } - if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test2 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest2, 25); - return -1; - } - if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test3 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest3, 25); - return -1; - } - if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test4 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest4, 25); - return -1; - } - if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test5 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(dest5, 25); - return -1; - } - if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) - " test6 srcs=%d\n", srcs); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref6, 25); - printf("dprod_dut:"); - dump(dest6, 25); - return -1; - } - - putchar('.'); - } - } - - // Run tests at end of buffer for Electric Fence - align = (LEN_ALIGN_CHK_B != 0) ? 
1 : 16; - for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) { - for (i = 0; i < TEST_SOURCES; i++) - for (j = 0; j < TEST_LEN; j++) - buffs[i][j] = rand(); - - for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end - efence_buffs[i] = buffs[i] + TEST_LEN - size; - - for (i = 0; i < TEST_SOURCES; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - g6[i] = rand(); - } - - for (i = 0; i < TEST_SOURCES; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]); - gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], - efence_buffs, dest_ref2); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], - efence_buffs, dest_ref3); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], - efence_buffs, dest_ref4); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], - efence_buffs, dest_ref5); - gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], - efence_buffs, dest_ref6); - - FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs); - - if (0 != memcmp(dest_ref1, dest1, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, align); - printf("dprod_dut:"); - dump(dest1, align); - return -1; - } - - if (0 != memcmp(dest_ref2, dest2, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, align); - printf("dprod_dut:"); - dump(dest2, align); - return -1; - } - - if (0 != memcmp(dest_ref3, dest3, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, align); - printf("dprod_dut:"); - dump(dest3, align); - return -1; - } - - if (0 != memcmp(dest_ref4, dest4, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, align); - printf("dprod_dut:"); - dump(dest4, align); - return -1; - } - - if (0 != memcmp(dest_ref5, dest5, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, align); - printf("dprod_dut:"); - dump(dest5, align); - return -1; - } - - if (0 != memcmp(dest_ref6, dest6, size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest); - dump_matrix(efence_buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref6, align); - printf("dprod_dut:"); - dump(dest6, align); - return -1; - } - - putchar('.'); - } - - // Test rand ptr alignment if available - - for (rtest = 0; rtest < RANDOMS; rtest++) { - size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1); - srcs = rand() % TEST_SOURCES; - if (srcs == 0) - continue; - - offset = (PTR_ALIGN_CHK_B != 0) ? 
1 : PTR_ALIGN_CHK_B; - // Add random offsets - for (i = 0; i < srcs; i++) - ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset)); - - udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset)); - udest_ptrs[5] = dest6 + (rand() & (PTR_ALIGN_CHK_B - offset)); - - memset(dest1, 0, TEST_LEN); // zero pad to check write-over - memset(dest2, 0, TEST_LEN); - memset(dest3, 0, TEST_LEN); - memset(dest4, 0, TEST_LEN); - memset(dest5, 0, TEST_LEN); - memset(dest6, 0, TEST_LEN); - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - ubuffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - g6[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]); - gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2); - gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3); - gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4); - gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5); - gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], ubuffs, dest_ref6); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs); - - if (memcmp(dest_ref1, udest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(udest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, udest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(udest_ptrs[1], 25); - return -1; - } - if (memcmp(dest_ref3, udest_ptrs[2], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(udest_ptrs[2], 25); - return -1; - } - if (memcmp(dest_ref4, udest_ptrs[3], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(udest_ptrs[3], 25); - return -1; - } - if (memcmp(dest_ref5, udest_ptrs[4], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(udest_ptrs[4], 25); - return -1; - } - if (memcmp(dest_ref6, udest_ptrs[5], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n", - srcs); - dump_matrix(ubuffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref6, 25); - printf("dprod_dut:"); 
- dump(udest_ptrs[5], 25); - return -1; - } - // Confirm that padding around dests is unchanged - memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff - offset = udest_ptrs[0] - dest1; - - if (memcmp(dest1, dest_ref1, offset)) { - printf("Fail rand ualign pad1 start\n"); - return -1; - } - if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad1 end\n"); - return -1; - } - - offset = udest_ptrs[1] - dest2; - if (memcmp(dest2, dest_ref1, offset)) { - printf("Fail rand ualign pad2 start\n"); - return -1; - } - if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad2 end\n"); - return -1; - } - - offset = udest_ptrs[2] - dest3; - if (memcmp(dest3, dest_ref1, offset)) { - printf("Fail rand ualign pad3 start\n"); - return -1; - } - if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad3 end\n"); - return -1; - } - - offset = udest_ptrs[3] - dest4; - if (memcmp(dest4, dest_ref1, offset)) { - printf("Fail rand ualign pad4 start\n"); - return -1; - } - if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad4 end\n"); - return -1; - } - - offset = udest_ptrs[4] - dest5; - if (memcmp(dest5, dest_ref1, offset)) { - printf("Fail rand ualign pad5 start\n"); - return -1; - } - if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad5 end\n"); - return -1; - } - - offset = udest_ptrs[5] - dest6; - if (memcmp(dest6, dest_ref1, offset)) { - printf("Fail rand ualign pad6 start\n"); - return -1; - } - if (memcmp(dest6 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) { - printf("Fail rand ualign pad6 end\n"); - return -1; - } - - putchar('.'); - } - - // Test all size alignment - align = (LEN_ALIGN_CHK_B != 0) ? 
1 : 16; - - for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) { - srcs = TEST_SOURCES; - - for (i = 0; i < srcs; i++) - for (j = 0; j < size; j++) - buffs[i][j] = rand(); - - for (i = 0; i < srcs; i++) { - g1[i] = rand(); - g2[i] = rand(); - g3[i] = rand(); - g4[i] = rand(); - g5[i] = rand(); - g6[i] = rand(); - } - - for (i = 0; i < srcs; i++) { - gf_vect_mul_init(g1[i], &g_tbls[i * 32]); - gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]); - gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]); - gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]); - gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]); - gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]); - } - - gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1); - gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2); - gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3); - gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4); - gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5); - gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], buffs, dest_ref6); - - FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs); - - if (memcmp(dest_ref1, dest_ptrs[0], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref1, 25); - printf("dprod_dut:"); - dump(dest_ptrs[0], 25); - return -1; - } - if (memcmp(dest_ref2, dest_ptrs[1], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref2, 25); - printf("dprod_dut:"); - dump(dest_ptrs[1], 25); - return -1; - } - if (memcmp(dest_ref3, dest_ptrs[2], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref3, 25); - printf("dprod_dut:"); - dump(dest_ptrs[2], 25); - return -1; - } - if (memcmp(dest_ref4, dest_ptrs[3], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref4, 25); - printf("dprod_dut:"); - dump(dest_ptrs[3], 25); - return -1; - } - if (memcmp(dest_ref5, dest_ptrs[4], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref5, 25); - printf("dprod_dut:"); - dump(dest_ptrs[4], 25); - return -1; - } - if (memcmp(dest_ref6, dest_ptrs[5], size)) { - printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n", - size); - dump_matrix(buffs, 5, TEST_SOURCES); - printf("dprod_base:"); - dump(dest_ref6, 25); - printf("dprod_dut:"); - dump(dest_ptrs[5], 25); - return -1; - } - } - - printf("Pass\n"); - return 0; - -} diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.patch deleted file mode 100644 index 21bbfaa667..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm index c9d7e57472..13a9af79c2 100644 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm +++ 
b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm @@ -111,7 +111,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -184,13 +184,8 @@ section .text align 16 -global gf_6vect_mad_avx:ISAL_SYM_TYPE_FUNCTION +global gf_6vect_mad_avx, function func(gf_6vect_mad_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_6vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_6vect_mad_avx) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -394,6 +389,3 @@ align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f constip16: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 - -;;; func core, ver, snum -slversion gf_6vect_mad_avx, 02, 01, 0210 diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.patch deleted file mode 100644 index d5afa0f167..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -189,193d188 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_6vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_6vect_mad_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm index 2b6babcba5..5f0b3477bb 100644 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm @@ -107,7 +107,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 %endmacro @@ -177,13 +177,8 @@ section .text %define xd6 xd1 align 16 -global gf_6vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_6vect_mad_avx2, function func(gf_6vect_mad_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_6vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_6vect_mad_avx2) -%endif - FUNC_SAVE sub len, 32 jl .return_fail @@ -400,6 +395,3 @@ align 32 constip32: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 dq 0xe8e9eaebecedeeef, 0xe0e1e2e3e4e5e6e7 - -;;; func core, ver, snum -slversion gf_6vect_mad_avx2, 04, 01, 0211 diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.patch deleted file mode 100644 index add3a8e3a2..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -182,186d181 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_6vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_6vect_mad_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512.asm new file mode 100644 index 0000000000..2d18c529be --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512.asm @@ -0,0 +1,321 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_6vect_mad_avx512(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 ;must be saved and restored + %define return rax + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + push r12 + %endmacro + %macro FUNC_RESTORE 0 + pop r12 + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 + %define return rax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp+16*0],xmm6 + vmovdqa [rsp+16*1],xmm7 + vmovdqa [rsp+16*2],xmm8 + vmovdqa [rsp+16*3],xmm9 + vmovdqa [rsp+16*4],xmm10 + vmovdqa [rsp+16*5],xmm11 + vmovdqa [rsp+16*6],xmm12 + vmovdqa [rsp+16*7],xmm13 + vmovdqa [rsp+16*8],xmm14 + vmovdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r15, 10*16 + 1*8 + save_reg r13, 10*16 + 2*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp+16*0] + vmovdqa xmm7, [rsp+16*1] + vmovdqa xmm8, [rsp+16*2] + vmovdqa xmm9, [rsp+16*3] + vmovdqa xmm10, [rsp+16*4] + vmovdqa xmm11, [rsp+16*5] + vmovdqa xmm12, [rsp+16*6] + vmovdqa xmm13, [rsp+16*7] + vmovdqa xmm14, [rsp+16*8] + vmovdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r15, [rsp + 10*16 + 1*8] + mov r13, [rsp + 10*16 + 2*8] + add rsp, stack_size +%endmacro +%endif + +%define PS 8 +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define dest2 tmp3 +%define dest3 tmp2 +%define dest4 mul_array +%define dest5 vec +%define dest6 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 zmm0 +%define xtmpa zmm1 +%define xtmpl1 zmm2 +%define xtmph1 zmm3 +%define xgft1_hi zmm4 +%define xgft1_lo zmm5 +%define xgft1_loy ymm5 +%define 
xgft2_hi zmm6 +%define xgft2_lo zmm7 +%define xgft2_loy ymm7 +%define xgft3_hi zmm8 +%define xgft3_lo zmm9 +%define xgft3_loy ymm9 +%define xgft4_hi zmm10 +%define xgft4_lo zmm11 +%define xgft4_loy ymm11 +%define xgft5_hi zmm12 +%define xgft5_lo zmm13 +%define xgft5_loy ymm13 +%define xgft6_hi zmm14 +%define xgft6_lo zmm15 +%define xgft6_loy ymm15 +%define xd1 zmm16 +%define xd2 zmm17 +%define xd3 zmm18 +%define xd4 zmm19 +%define xd5 zmm20 +%define xd6 zmm21 +%define xmask0f zmm22 +%define xtmpl2 zmm23 +%define xtmpl3 zmm24 +%define xtmpl4 zmm25 +%define xtmpl5 zmm26 +%define xtmph2 zmm27 +%define xtmph3 zmm28 +%define xtmph4 zmm29 +%define xtmph5 zmm30 +%define xtmph6 zmm31 + +align 16 +global gf_6vect_mad_avx512, function +func(gf_6vect_mad_avx512) + FUNC_SAVE + sub len, 64 + jl .return_fail + xor pos, pos + mov tmp, 0x0f + vpbroadcastb xmask0f, tmp ;Construct mask 0x0f0f0f... + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 ;Multiply by 32 + lea tmp, [mul_array + vec_i] + mov vec_i, vec + mov mul_array, vec + sal vec_i, 1 ;vec_i=vec*64 + sal mul_array, 1 ;mul_array=vec*64 + add vec_i, vec ;vec_i=vec*96 + add mul_array, vec_i ;vec_i=vec*160 + + vmovdqu xgft1_loy, [tmp] ;Load array Ax{00}..{0f}, Ax{00}..{f0} + vmovdqu xgft2_loy, [tmp+vec] ;Load array Bx{00}..{0f}, Bx{00}..{f0} + vmovdqu xgft3_loy, [tmp+2*vec] ;Load array Cx{00}..{0f}, Cx{00}..{f0} + vmovdqu xgft4_loy, [tmp+vec_i] ;Load array Dx{00}..{0f}, Dx{00}..{f0} + vmovdqu xgft5_loy, [tmp+4*vec] ;Load array Ex{00}..{0f}, Ex{00}..{f0} + vmovdqu xgft6_loy, [tmp+mul_array] ;Load array Fx{00}..{0f}, Fx{00}..{f0} + + vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55 + vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 + vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55 + vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 + vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55 + vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 + vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55 + vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00 + vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55 + vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00 + vshufi64x2 xgft6_hi, xgft6_lo, xgft6_lo, 0x55 + vshufi64x2 xgft6_lo, xgft6_lo, xgft6_lo, 0x00 + + mov dest2, [dest1+PS] + mov dest3, [dest1+2*PS] + mov dest4, [dest1+3*PS] ; reuse mul_array + mov dest5, [dest1+4*PS] ; reuse vec + mov dest6, [dest1+5*PS] ; reuse vec_i + mov dest1, [dest1] + mov tmp, -1 + kmovq k1, tmp + +.loop64: + XLDR x0, [src+pos] ;Get next source vector + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + XLDR xd3, [dest3+pos] ;Get next dest vector + XLDR xd4, [dest4+pos] ;Get next dest vector + XLDR xd5, [dest5+pos] ;Get next dest vector + XLDR xd6, [dest6+pos] ;Get next dest vector + + vpandq xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpandq x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1 {k1}{z}, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1 {k1}{z}, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxorq xd1, xd1, xtmph1 ;xd1 += partial + + ; dest2 + vpshufb xtmph2 {k1}{z}, xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2 {k1}{z}, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpxorq xd2, xd2, xtmph2 ;xd2 += partial + + ; dest3 + vpshufb xtmph3 {k1}{z}, xgft3_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3 {k1}{z}, xgft3_lo, xtmpa 
;Lookup mul table of low nibble + vpxorq xtmph3, xtmph3, xtmpl3 ;GF add high and low partials + vpxorq xd3, xd3, xtmph3 ;xd3 += partial + + ; dest4 + vpshufb xtmph4 {k1}{z}, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl4 {k1}{z}, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph4, xtmph4, xtmpl4 ;GF add high and low partials + vpxorq xd4, xd4, xtmph4 ;xd4 += partial + + ; dest5 + vpshufb xtmph5 {k1}{z}, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl5 {k1}{z}, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxorq xtmph5, xtmph5, xtmpl5 ;GF add high and low partials + vpxorq xd5, xd5, xtmph5 ;xd5 += partial + + ; dest6 + vpshufb xtmph6 {k1}{z}, xgft6_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl5 {k1}{z}, xgft6_lo, xtmpa ;Lookup mul table of low nibble. Reuse xtmpl5 + vpxorq xtmph6, xtmph6, xtmpl5 ;GF add high and low partials. + vpxorq xd6, xd6, xtmph6 ;xd6 += partial + + XSTR [dest1+pos], xd1 + XSTR [dest2+pos], xd2 + XSTR [dest3+pos], xd3 + XSTR [dest4+pos], xd4 + XSTR [dest5+pos], xd5 + XSTR [dest6+pos], xd6 + + add pos, 64 ;Loop on 64 bytes at a time + cmp pos, len + jle .loop64 + + lea tmp, [len + 64] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, (1 << 63) + lea tmp, [len + 64 - 1] + and tmp, 63 + sarx pos, pos, tmp + kmovq k1, pos + mov pos, len ;Overlapped offset length-64 + jmp .loop64 ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_gf_6vect_mad_avx512 +no_gf_6vect_mad_avx512: +%endif +%endif ; ifdef HAVE_AS_KNOWS_AVX512 diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512_gfni.asm new file mode 100644 index 0000000000..b1853b65fd --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512_gfni.asm @@ -0,0 +1,259 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_6vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 ;must be saved and restored + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + push r12 + %endmacro + %macro FUNC_RESTORE 0 + pop r12 + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r14 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + vmovdqa [rsp + 16*7], xmm13 + vmovdqa [rsp + 16*8], xmm14 + vmovdqa [rsp + 16*9], xmm15 + mov [rsp + 10*16 + 0*8], r12 + mov [rsp + 10*16 + 1*8], r13 + mov [rsp + 10*16 + 2*8], r14 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + vmovdqa xmm13, [rsp + 16*7] + vmovdqa xmm14, [rsp + 16*8] + vmovdqa xmm15, [rsp + 16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + add rsp, stack_size +%endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax +%define dest2 tmp3 +%define dest3 tmp2 +%define dest4 mul_array +%define dest5 vec +%define dest6 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0 zmm0 +%define xd1 zmm1 +%define xd2 zmm2 +%define xd3 zmm3 +%define xd4 zmm4 +%define xd5 zmm5 +%define xd6 zmm6 + +%define xgft1 zmm7 +%define xgft2 zmm8 +%define xgft3 zmm9 +%define xgft4 zmm10 +%define xgft5 zmm11 +%define xgft6 zmm12 + +%define xret1 zmm13 +%define xret2 zmm14 +%define xret3 zmm15 +%define xret4 zmm16 +%define xret5 zmm17 +%define xret6 zmm18 + +;; +;; Encodes 64 bytes of a single source into 6x 64 bytes (parity disks) +;; +%macro ENCODE_64B_6 0-1 +%define %%KMASK %1 + +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector 
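+	;; These {k}-masked vmovdqu8 loads and stores touch only the bytes
+	;; selected by %%KMASK, so the final partial block (fewer than 64
+	;; bytes) is read and written without accessing memory past the end
+	;; of the buffers; k1 is set to (1 << len) - 1 for the remaining
+	;; len (< 64) bytes before the tail invocation of this macro.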
+ vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector + vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector + vmovdqu8 xd3{%%KMASK}, [dest3 + pos] ;Get next dest vector + vmovdqu8 xd4{%%KMASK}, [dest4 + pos] ;Get next dest vector + vmovdqu8 xd5{%%KMASK}, [dest5 + pos] ;Get next dest vector + vmovdqu8 xd6{%%KMASK}, [dest6 + pos] ;Get next dest vector +%else + XLDR x0, [src + pos] ;Get next source vector + XLDR xd1, [dest1 + pos] ;Get next dest vector + XLDR xd2, [dest2 + pos] ;Get next dest vector + XLDR xd3, [dest3 + pos] ;Get next dest vector + XLDR xd4, [dest4 + pos] ;Get next dest vector + XLDR xd5, [dest5 + pos] ;Get next dest vector + XLDR xd6, [dest6 + pos] ;Get next dest vector +%endif + + GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3, \ + xgft4, xret4, xd4, xgft5, xret5, xd5, xgft6, xret6, xd6 + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 + vmovdqu8 [dest2 + pos]{%%KMASK}, xd2 + vmovdqu8 [dest3 + pos]{%%KMASK}, xd3 + vmovdqu8 [dest4 + pos]{%%KMASK}, xd4 + vmovdqu8 [dest5 + pos]{%%KMASK}, xd5 + vmovdqu8 [dest6 + pos]{%%KMASK}, xd6 +%else + XSTR [dest1 + pos], xd1 + XSTR [dest2 + pos], xd2 + XSTR [dest3 + pos], xd3 + XSTR [dest4 + pos], xd4 + XSTR [dest5 + pos], xd5 + XSTR [dest6 + pos], xd6 +%endif +%endmacro + +align 16 +global gf_6vect_mad_avx512_gfni, function +func(gf_6vect_mad_avx512_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, [mul_array + vec_i] + vbroadcastf32x2 xgft1, [tmp] + vbroadcastf32x2 xgft2, [tmp + vec] + vbroadcastf32x2 xgft3, [tmp + vec*2] + vbroadcastf32x2 xgft5, [tmp + vec*4] + add tmp, vec + vbroadcastf32x2 xgft4, [tmp + vec*2] + vbroadcastf32x2 xgft6, [tmp + vec*4] + mov dest2, [dest1 + 8] + mov dest3, [dest1 + 2*8] + mov dest4, [dest1 + 3*8] ; reuse mul_array + mov dest5, [dest1 + 4*8] ; reuse vec + mov dest6, [dest1 + 5*8] ; reuse vec_i + mov dest1, [dest1] + + cmp len, 64 + jl .len_lt_64 +.loop64: + ENCODE_64B_6 + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B_6 k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm index 8e0fc0e0a4..a816f8bbf4 100644 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm @@ -113,7 +113,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %macro FUNC_SAVE 0 push r12 push r13 @@ -185,13 +185,8 @@ section .text align 16 -global gf_6vect_mad_sse:ISAL_SYM_TYPE_FUNCTION +global gf_6vect_mad_sse, function func(gf_6vect_mad_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_6vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_6vect_mad_sse) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -406,6 +401,3 @@ align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f constip16: dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7 - -;;; func core, ver, snum -slversion gf_6vect_mad_sse, 00, 01, 020f diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.patch deleted file mode 100644 index 619072af53..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -190,194d189 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global 
_gf_6vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_6vect_mad_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_inverse_test.c b/contrib/libs/isa-l/erasure_code/gf_inverse_test.c index fe2006eeb7..54e70bb46e 100644 --- a/contrib/libs/isa-l/erasure_code/gf_inverse_test.c +++ b/contrib/libs/isa-l/erasure_code/gf_inverse_test.c @@ -111,7 +111,9 @@ int inv_test(u8 * in, u8 * inv, u8 * sav, int n) print_matrix(in, n); return -1; } +#ifdef TEST_VERBOSE putchar('.'); +#endif return 0; } @@ -119,7 +121,8 @@ int inv_test(u8 * in, u8 * inv, u8 * sav, int n) int main(int argc, char *argv[]) { int i, k, t; - u8 *test_mat, *save_mat, *invr_mat; + u8 *test_mat = NULL, *save_mat = NULL, *invr_mat = NULL; + int ret = -1; u8 test1[] = { 1, 1, 6, 1, 1, 1, @@ -149,25 +152,25 @@ int main(int argc, char *argv[]) invr_mat = malloc(KMAX * KMAX); if (NULL == test_mat || NULL == save_mat || NULL == invr_mat) - return -1; + goto exit; // Test with lots of leading 1's k = 3; memcpy(test_mat, test1, k * k); if (inv_test(test_mat, invr_mat, save_mat, k)) - return -1; + goto exit; // Test with leading zeros k = 3; memcpy(test_mat, test2, k * k); if (inv_test(test_mat, invr_mat, save_mat, k)) - return -1; + goto exit; // Test 3 k = 3; memcpy(test_mat, test3, k * k); if (inv_test(test_mat, invr_mat, save_mat, k)) - return -1; + goto exit; // Test 4 - try a singular matrix k = 4; @@ -175,7 +178,7 @@ int main(int argc, char *argv[]) if (!gf_invert_matrix(test_mat, invr_mat, k)) { printf("Fail: didn't catch singular matrix\n"); print_matrix(test4, 4); - return -1; + goto exit; } // Do random test of size KMAX k = KMAX; @@ -185,7 +188,7 @@ int main(int argc, char *argv[]) if (gf_invert_matrix(test_mat, invr_mat, k)) { printf("rand picked a singular matrix, try again\n"); - return -1; + goto exit; } matrix_mult(invr_mat, save_mat, test_mat, k); @@ -195,7 +198,7 @@ int main(int argc, char *argv[]) print_matrix(save_mat, k); print_matrix(invr_mat, k); print_matrix(test_mat, k); - return -1; + goto exit; } // Do Randoms. 
Random size and coefficients for (t = 0; t < RANDOMS; t++) { @@ -214,12 +217,22 @@ int main(int argc, char *argv[]) print_matrix(save_mat, k); print_matrix(invr_mat, k); print_matrix(test_mat, k); - return -1; + goto exit; } +#ifdef TEST_VERBOSE if (0 == (t % 8)) putchar('.'); +#endif } printf(" Pass\n"); - return 0; + + ret = 0; + + exit: + free(test_mat); + free(save_mat); + free(invr_mat); + + return ret; } diff --git a/contrib/libs/isa-l/erasure_code/gf_inverse_test.patch b/contrib/libs/isa-l/erasure_code/gf_inverse_test.patch deleted file mode 100644 index b91cf7f348..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_inverse_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -59c59 -< d ^= gf_mul_erasure(a[n * i + k], b[n * k + j]); ---- -> d ^= gf_mul(a[n * i + k], b[n * k + j]); diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c index d2959c3c51..bd202fdcf1 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c @@ -33,22 +33,22 @@ #include "test.h" #include "erasure_code.h" -//#define CACHED_TEST -#ifdef CACHED_TEST +#ifndef GT_L3_CACHE +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +#endif + +#if !defined(COLD_TEST) && !defined(TEST_CUSTOM) // Cached test, loop many times over small dataset # define TEST_SOURCES 10 # define TEST_LEN 8*1024 # define TEST_TYPE_STR "_warm" -#else -# ifndef TEST_CUSTOM +#elif defined (COLD_TEST) // Uncached test. Pull from large mem base. -# define TEST_SOURCES 10 -# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ -# define TEST_LEN GT_L3_CACHE / TEST_SOURCES -# define TEST_TYPE_STR "_cold" -# else -# define TEST_TYPE_STR "_cus" -# endif +# define TEST_SOURCES 10 +# define TEST_LEN (GT_L3_CACHE / TEST_SOURCES) +# define TEST_TYPE_STR "_cold" +#elif defined (TEST_CUSTOM) +# define TEST_TYPE_STR "_cus" #endif typedef unsigned char u8; @@ -111,10 +111,20 @@ void gf_vect_dot_prod_mult(int len, int vlen, u8 * v, u8 ** src, u8 * dest) int main(void) { int i, j; - u8 vec[TEST_SOURCES], dest1[TEST_LEN], dest2[TEST_LEN]; + u8 vec[TEST_SOURCES], *dest1, *dest2; u8 *matrix[TEST_SOURCES]; struct perf start; + dest1 = (u8 *) malloc(TEST_LEN); + dest2 = (u8 *) malloc(TEST_LEN); + + if (NULL == dest1 || NULL == dest2) { + printf("buffer alloc error\n"); + return -1; + } + memset(dest1, 0xfe, TEST_LEN); + memset(dest2, 0xfe, TEST_LEN); + mk_gf_field(); mk_gf_mul_table(gf_mul_table); diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.patch deleted file mode 100644 index 5ea8eab7d0..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.patch +++ /dev/null @@ -1,8 +0,0 @@ -81c81 -< table[i * 256 + j] = gf_mul_erasure(i, j); ---- -> table[i * 256 + j] = gf_mul(i, j); -91c91 -< s ^= gf_mul_erasure(src[j][i], v[j]); ---- -> s ^= gf_mul(src[j][i], v[j]); diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm index dc1eebb972..37915c873b 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm @@ -48,7 +48,7 @@ %endmacro %define SSTR SLDR %define PS 8 - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -106,7 +106,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch 
%define arg(x) [ebp + PS*2 + PS*x] %define trans ecx ;trans is for the variables in stack @@ -194,13 +194,8 @@ section .text %define xp xmm2 align 16 -global gf_vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION +global gf_vect_dot_prod_avx, function func(gf_vect_dot_prod_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_dot_prod_avx) -%endif - FUNC_SAVE SLDR len, len_m sub len, 16 @@ -271,6 +266,3 @@ align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_vect_dot_prod_avx, 02, 05, 0061 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.patch deleted file mode 100644 index 30bdc75785..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -199,203d198 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_dot_prod_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm index 986fd28a4e..fb9b022975 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm @@ -51,7 +51,7 @@ %endmacro %define SSTR SLDR %define PS 8 - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -111,7 +111,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define trans ecx ;trans is for the variables in stack @@ -202,13 +202,8 @@ section .text %define xp ymm2 align 16 -global gf_vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_vect_dot_prod_avx2, function func(gf_vect_dot_prod_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_dot_prod_avx2) -%endif - FUNC_SAVE SLDR len, len_m sub len, 32 @@ -278,8 +273,3 @@ func(_gf_vect_dot_prod_avx2) ret endproc_frame - -section .data - -;;; func core, ver, snum -slversion gf_vect_dot_prod_avx2, 04, 05, 0190 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.patch deleted file mode 100644 index c2890dbc39..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -207,211d206 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_dot_prod_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2_gfni.asm new file mode 100644 index 0000000000..c084894c7b --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2_gfni.asm @@ -0,0 +1,318 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_vect_dot_prod_avx2_gfni(len, vec, *g_tbls, **buffs, *dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" +%include "memcpy.asm" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 ; must be saved and restored + + %define stack_size 1*8 + %define func(x) x: endbranch + %macro FUNC_SAVE 0 + sub rsp, stack_size + mov [rsp + 0*8], r12 + %endmacro + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + add rsp, stack_size + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 ; must be saved and restored + %define stack_size 4*16 + 3*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + mov [rsp + 4*16 + 0*8], r12 + mov [rsp + 4*16 + 1*8], r13 + mov [rsp + 4*16 + 2*8], r15 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + mov r12, [rsp + 4*16 + 0*8] + mov r13, [rsp + 4*16 + 1*8] + mov r15, [rsp + 4*16 + 2*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define pos rax + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define x0l ymm0 +%define x0h ymm1 +%define x0x ymm2 + +%define xp1l ymm3 +%define xp1h ymm4 +%define xp1x ymm5 + +%define xgft1 ymm6 +%define xgft2 ymm7 +%define xgft3 ymm8 + +%define xtmp1 ymm9 + +%define x0 x0l +%define xp1 xp1l +%define xp2 xp2l +%define xp3 xp3l + +default rel +[bits 64] + +section .text + +;; +;; Encodes 96 bytes of all "k" sources into 96 bytes (single parity 
disk) +;; +%macro ENCODE_96B 0 + vpxor xp1l, xp1l, xp1l + vpxor xp1h, xp1h, xp1h + vpxor xp1x, xp1x, xp1x + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + ;; load next source vector + mov ptr, [src + vec_i] + XLDR x0l, [ptr + pos] + XLDR x0h, [ptr + pos + 32] + XLDR x0x, [ptr + pos + 64] + add vec_i, 8 + + vbroadcastsd xgft1, [tmp] + add tmp, 8 + + GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l + GF_MUL_XOR VEX, x0h, xgft1, xtmp1, xp1h + GF_MUL_XOR VEX, x0x, xgft1, xtmp1, xp1x + + cmp vec_i, vec + jl %%next_vect + + XSTR [dest1 + pos], xp1l + XSTR [dest1 + pos + 32], xp1h + XSTR [dest1 + pos + 64], xp1x +%endmacro + +;; +;; Encodes 64 bytes of all "k" sources into 64 bytes (single parity disk) +;; +%macro ENCODE_64B 0 + vpxor xp1l, xp1l, xp1l + vpxor xp1h, xp1h, xp1h + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + ;; load next source vector + mov ptr, [src + vec_i] + XLDR x0l, [ptr + pos] + XLDR x0h, [ptr + pos + 32] + add vec_i, 8 + + vbroadcastsd xgft1, [tmp] + add tmp, 8 + + GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l + GF_MUL_XOR VEX, x0h, xgft1, xgft1, xp1h + + cmp vec_i, vec + jl %%next_vect + + XSTR [dest1 + pos], xp1l + XSTR [dest1 + pos + 32], xp1h +%endmacro + +;; +;; Encodes 32 bytes of all "k" sources into 32 bytes (single parity disks) +;; +%macro ENCODE_32B 0 + vpxor xp1, xp1, xp1 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + ;; load next source vector + mov ptr, [src + vec_i] + XLDR x0, [ptr + pos] + add vec_i, 8 + + vbroadcastsd xgft1, [tmp] + add tmp, 8 + + GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1 + + cmp vec_i, vec + jl %%next_vect + + XSTR [dest1 + pos], xp1 +%endmacro + +;; +;; Encodes less than 32 bytes of all "k" sources into single parity disks +;; +%macro ENCODE_LT_32B 1 +%define %%LEN %1 + + vpxor xp1, xp1, xp1 + xor vec_i, vec_i + +%%next_vect: + ; get next source vector + mov ptr, [src + vec_i] + simd_load_avx2 x0, ptr + pos, %%LEN, tmp, tmp3 + add vec_i, 8 + + vbroadcastsd xgft1, [mul_array] + add mul_array, 8 + + GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1 + + cmp vec_i, vec + jl %%next_vect + + ;; Store updated encoded data + lea ptr, [dest1 + pos] + simd_store_avx2 ptr, xp1, %%LEN, tmp, vec_i +%endmacro + +align 16 +global gf_vect_dot_prod_avx2_gfni, function +func(gf_vect_dot_prod_avx2_gfni) + FUNC_SAVE + + xor pos, pos + shl vec, 3 ;; vec *= 8. 
Make vec_i count by 8 + + cmp len, 96 + jl .len_lt_96 + +.loop96: + ENCODE_96B + + add pos, 96 ;; Loop on 96 bytes at a time first + sub len, 96 + cmp len, 96 + jge .loop96 + +.len_lt_96: + cmp len, 64 + jl .len_lt_64 + + ENCODE_64B + + add pos, 64 ;; encode next 64 bytes + sub len, 64 + +.len_lt_64: + cmp len, 32 + jl .len_lt_32 + + ENCODE_32B + + add pos, 32 ;; encode next 32 bytes + sub len, 32 + +.len_lt_32: + cmp len, 0 + jle .exit + + ENCODE_LT_32B len ;; encode final bytes + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm index 405c1e48e2..b5fbf42498 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm @@ -49,7 +49,7 @@ %define PS 8 %define LOG_PS 3 - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -73,15 +73,15 @@ %define func(x) proc_frame x %macro FUNC_SAVE 0 alloc_stack stack_size - save_reg r12, 9*16 + 0*8 - save_reg r15, 9*16 + 3*8 + save_reg r12, 0*8 + save_reg r15, 1*8 end_prolog mov arg4, arg(4) %endmacro %macro FUNC_RESTORE 0 - mov r12, [rsp + 9*16 + 0*8] - mov r15, [rsp + 9*16 + 3*8] + mov r12, [rsp + 0*8] + mov r15, [rsp + 1*8] add rsp, stack_size %endmacro %endif @@ -104,8 +104,8 @@ %else ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST - %define XLDR vmovdqa - %define XSTR vmovdqa + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 %else %define XLDR vmovntdqa %define XSTR vmovntdq @@ -128,13 +128,8 @@ default rel section .text align 16 -global gf_vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION +global gf_vect_dot_prod_avx512, function func(gf_vect_dot_prod_avx512) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_dot_prod_avx512) -%endif - FUNC_SAVE xor pos, pos mov tmp, 0x0f diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.patch deleted file mode 100644 index 61be77efd3..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.patch +++ /dev/null @@ -1,6 +0,0 @@ -133,137d132 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_dot_prod_avx512) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512_gfni.asm new file mode 100644 index 0000000000..b8fc778a88 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512_gfni.asm @@ -0,0 +1,190 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, *dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp2 r10 + + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r13 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define stack_size 0*16 + 3*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + add rsp, stack_size + %endmacro +%endif + + +%define len arg0 +%define vec arg1 +%define mul_array arg2 +%define src arg3 +%define dest1 arg4 +%define ptr arg5 +%define vec_i tmp2 +%define pos rax + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +%define xgft1 zmm2 + +%define x0 zmm0 +%define xp1 zmm1 + +default rel +[bits 64] +section .text + +;; +;; Encodes 64 bytes of all "k" sources into 64 bytes (single parity disk) +;; +%macro ENCODE_64B 0-1 +%define %%KMASK %1 + + vpxorq xp1, xp1, xp1 + mov tmp, mul_array + xor vec_i, vec_i + +%%next_vect: + mov ptr, [src + vec_i] +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes) +%else + XLDR x0, [ptr + pos] ;Get next source vector (64 bytes) +%endif + add vec_i, 8 + + vbroadcastf32x2 xgft1, [tmp] + add tmp, 8 + + GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1 + + cmp vec_i, vec + jl %%next_vect + +%if %0 == 1 + vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 +%else + XSTR [dest1 + pos], xp1 +%endif +%endmacro + +align 16 +global gf_vect_dot_prod_avx512_gfni, function +func(gf_vect_dot_prod_avx512_gfni) + FUNC_SAVE + xor pos, pos + shl vec, 3 ;vec *= 8. 
Make vec_i count by 8 + + cmp len, 64 + jl .len_lt_64 + +.loop64: + + ENCODE_64B + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c index b2601226e9..0cfd444413 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c @@ -30,10 +30,11 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> // for memset, memcmp +#include <assert.h> #include "erasure_code.h" -// #include "types.h" +#include "test.h" -#define TEST_LEN 2048 +#define TEST_LEN 8192 #define TEST_SIZE (TEST_LEN/2) #ifndef TEST_SOURCES @@ -134,8 +135,7 @@ int main(int argc, char *argv[]) // Pick a first test m = 9; k = 5; - if (m > MMAX || k > KMAX) - return -1; + assert(!(m > MMAX || k > KMAX)); gf_gen_cauchy1_matrix(a, m, k); @@ -282,7 +282,9 @@ int main(int argc, char *argv[]) return -1; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } printf("done all: Pass\n"); diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.patch deleted file mode 100644 index 21bbfaa667..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c index bd2b555b0a..3b051c67a4 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c @@ -40,22 +40,22 @@ #define str(s) #s #define xstr(s) str(s) -//#define CACHED_TEST -#ifdef CACHED_TEST +#ifndef GT_L3_CACHE +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +#endif + +#if !defined(COLD_TEST) && !defined(TEST_CUSTOM) // Cached test, loop many times over small dataset # define TEST_SOURCES 10 # define TEST_LEN 8*1024 # define TEST_TYPE_STR "_warm" -#else -# ifndef TEST_CUSTOM +#elif defined (COLD_TEST) // Uncached test. Pull from large mem base. 
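Aside on the new gf_vect_dot_prod_avx512_gfni.asm kernel shown above: each source block is multiplied by an 8-byte GF(2^8) constant (broadcast with vbroadcastf32x2 and applied through the GF_MUL_XOR helper, which wraps vgf2p8affineqb), and the sub-64-byte tail is handled through a k-mask built by the xor/bts/dec/kmovq sequence. A rough C-intrinsics sketch of what one tail iteration computes, assuming 0 < len < 64; the helper name and compile flags are illustrative, not part of the library:

#include <immintrin.h>
#include <stdint.h>

/* Requires AVX-512F/BW and GFNI, e.g. gcc -O2 -mavx512bw -mgfni */
static void dot_prod_tail_sketch(int len, int vec, const uint64_t *gf_tbl,
                                 unsigned char **src, unsigned char *dest)
{
        __mmask64 k = (__mmask64)((1ULL << len) - 1);   /* same value as xor/bts/dec/kmovq */
        __m512i acc = _mm512_setzero_si512();

        for (int j = 0; j < vec; j++) {
                __m512i x = _mm512_maskz_loadu_epi8(k, src[j]);       /* masked source load */
                __m512i m = _mm512_set1_epi64((long long)gf_tbl[j]);  /* 8-byte table entry, like vbroadcastf32x2 */
                __m512i p = _mm512_gf2p8affine_epi64_epi8(x, m, 0);   /* GF(2^8) multiply by the constant */
                acc = _mm512_xor_si512(acc, p);                       /* accumulate the parity */
        }
        _mm512_mask_storeu_epi8(dest, k, acc);                        /* masked parity store */
}

The full-width .loop64 path above is the same computation without the mask; the gf_vect_mad_*_gfni kernels later in this change reuse the identical mask trick for their tails.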
-# define TEST_SOURCES 10 -# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ -# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1)) -# define TEST_TYPE_STR "_cold" -# else -# define TEST_TYPE_STR "_cus" -# endif +# define TEST_SOURCES 10 +# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1)) +# define TEST_TYPE_STR "_cold" +#elif defined (TEST_CUSTOM) +# define TEST_TYPE_STR "_cus" #endif typedef unsigned char u8; diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm index 67f4a1a329..ef245b4961 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm @@ -48,7 +48,7 @@ %endmacro %define SSTR SLDR %define PS 8 - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -106,7 +106,7 @@ %define PS 4 %define LOG_PS 2 - %define func(x) x: + %define func(x) x: endbranch %define arg(x) [ebp + PS*2 + PS*x] %define trans ecx ;trans is for the variables in stack @@ -194,13 +194,8 @@ section .text %define xp xmm2 align 16 -global gf_vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION +global gf_vect_dot_prod_sse, function func(gf_vect_dot_prod_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_dot_prod_sse) -%endif - FUNC_SAVE SLDR len, len_m sub len, 16 @@ -271,6 +266,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_vect_dot_prod_sse, 00, 05, 0060 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.patch deleted file mode 100644 index ab47fc7a53..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -199,203d198 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_dot_prod_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c index dbfc2da045..8300fbd70d 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c +++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c @@ -31,7 +31,7 @@ #include <stdlib.h> #include <string.h> // for memset, memcmp #include "erasure_code.h" -// #include "types.h" +#include "test.h" #ifndef FUNCTION_UNDER_TEST # define FUNCTION_UNDER_TEST gf_vect_dot_prod @@ -43,7 +43,7 @@ #define str(s) #s #define xstr(s) str(s) -#define TEST_LEN 2048 +#define TEST_LEN 8192 #define TEST_SIZE (TEST_LEN/2) #ifndef TEST_SOURCES @@ -171,8 +171,11 @@ int main(int argc, char *argv[]) printf("dprod:"); dump(dest, 25); return -1; - } else + } +#ifdef TEST_VERBOSE + else putchar('.'); +#endif // Rand data test for (rtest = 0; rtest < RANDOMS; rtest++) { @@ -199,7 +202,9 @@ int main(int argc, char *argv[]) return -1; } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Rand data test with varied parameters @@ -228,7 +233,9 @@ int main(int argc, char *argv[]) return -1; } +#ifdef TEST_VERBOSE putchar('.'); +#endif } } @@ -396,7 +403,9 @@ int main(int argc, char *argv[]) return -1; } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Run tests at end of buffer for Electric Fence @@ -428,7 +437,9 @@ int main(int argc, char *argv[]) return -1; } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Test rand ptr alignment if available @@ -485,7 +496,9 @@ int main(int argc, char *argv[]) 
return -1; } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Test all size alignment diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.patch deleted file mode 100644 index 21bbfaa667..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_gfni.inc b/contrib/libs/isa-l/erasure_code/gf_vect_gfni.inc new file mode 100644 index 0000000000..83d362bdae --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_vect_gfni.inc @@ -0,0 +1,72 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Multiply 1 source register to up to 6 different GF table registers +; and XOR the results to partial registers +; +%macro GF_MUL_XOR 5-20 +%define %%ENCODING %1 +%define %%SRC %2 +%define %%GFTABLE1 %3 +%define %%TMP1 %4 +%define %%PARTIAL1 %5 +%define %%GFTABLE2 %6 +%define %%TMP2 %7 +%define %%PARTIAL2 %8 +%define %%GFTABLE3 %9 +%define %%TMP3 %10 +%define %%PARTIAL3 %11 +%define %%GFTABLE4 %12 +%define %%TMP4 %13 +%define %%PARTIAL4 %14 +%define %%GFTABLE5 %15 +%define %%TMP5 %16 +%define %%PARTIAL5 %17 +%define %%GFTABLE6 %18 +%define %%TMP6 %19 +%define %%PARTIAL6 %20 + +%define %%N_BLOCKS ((%0 - 2) / 3) + +%assign %%I 1 +%rep %%N_BLOCKS + vgf2p8affineqb %%TMP %+ %%I, %%SRC, %%GFTABLE %+ %%I, 0x00 +%assign %%I (%%I + 1) +%endrep +%assign %%I 1 +%rep %%N_BLOCKS +%ifidn %%ENCODING, VEX + vpxor %%PARTIAL %+ %%I, %%TMP %+ %%I +%else + vpxorq %%PARTIAL %+ %%I, %%TMP %+ %%I +%endif +%assign %%I (%%I + 1) +%endrep +%endmacro diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm index 1a252c474f..20a44d7aa3 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm @@ -82,7 +82,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -131,13 +131,8 @@ section .text %define xtmpd xmm5 align 16 -global gf_vect_mad_avx:ISAL_SYM_TYPE_FUNCTION +global gf_vect_mad_avx, function func(gf_vect_mad_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_mad_avx) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -196,6 +191,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_vect_mad_avx, 02, 01, 0201 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.patch deleted file mode 100644 index e3932a80b5..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.patch +++ /dev/null @@ -1,6 +0,0 @@ -136,140d135 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_mad_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_mad_avx) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm index 9b24c6e62a..c833f5e98c 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm @@ -88,7 +88,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -139,13 +139,8 @@ section .text %define xtmpd ymm5 align 16 -global gf_vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION +global gf_vect_mad_avx2, function func(gf_vect_mad_avx2) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_mad_avx2) -%endif - FUNC_SAVE sub len, 32 jl .return_fail @@ -201,8 +196,3 @@ func(_gf_vect_mad_avx2) ret endproc_frame - -section .data - -;;; func core, ver, snum -slversion gf_vect_mad_avx2, 04, 01, 0202 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.patch deleted file mode 100644 index 9d37d75b8d..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.patch +++ /dev/null @@ -1,6 +0,0 @@ -144,148d143 -< %ifidn __OUTPUT_FORMAT__, 
macho64 -< global _gf_vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_mad_avx2) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2_gfni.asm new file mode 100644 index 0000000000..29af12a0fc --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2_gfni.asm @@ -0,0 +1,255 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" +%include "memcpy.asm" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r13 ; must be saved and restored + %define tmp r11 + %define tmp2 r10 + %define stack_size 16*3 + 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + mov [rsp + 3*16 + 0*8], r12 + mov [rsp + 3*16 + 1*8], r13 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + mov r12, [rsp + 3*16 + 0*8] + mov r13, [rsp + 3*16 + 1*8] + add rsp, stack_size + %endmacro +%endif + +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos rax + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel +[bits 64] +section .text + +%define x0l ymm0 +%define x0h ymm1 +%define x0x ymm2 + +%define xgft1 ymm3 + +%define xd1l ymm4 +%define xd1h ymm5 +%define xd1x ymm6 + +%define xret1l ymm7 +%define xret1h ymm8 + +%define x0 x0l +%define xd1 xd1l +%define xret1 xret1l + +;; +;; Encodes 96 bytes of a single source and updates a single parity disk +;; +%macro ENCODE_96B 0 + ;; get next source vector + XLDR x0l, [src + pos] + XLDR x0h, [src + pos + 32] + XLDR x0x, [src + pos + 64] + ;; get next dest vector + XLDR xd1l, [dest1 + pos] + XLDR xd1h, [dest1 + pos + 32] + XLDR xd1x, [dest1 + pos + 64] + + GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l + GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h + GF_MUL_XOR VEX, x0x, xgft1, xret1l, xd1x + + XSTR [dest1 + pos], xd1l + XSTR [dest1 + pos + 32], xd1h + XSTR [dest1 + pos + 64], xd1x +%endmacro + +;; +;; Encodes 64 bytes of a single source and updates a single parity disk +;; +%macro ENCODE_64B 0 + ;; get next source vector + XLDR x0l, [src + pos] + XLDR x0h, [src + pos + 32] + ;; get next dest vector + XLDR xd1l, [dest1 + pos] + XLDR xd1h, [dest1 + pos + 32] + + GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l + GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h + + XSTR [dest1 + pos], xd1l + XSTR [dest1 + pos + 32], xd1h +%endmacro + +;; +;; Encodes 32 bytes of a single source and updates single parity disk +;; +%macro ENCODE_32B 0 + ;; get next source vector + XLDR x0, [src + pos] + ;; get next dest vector + XLDR xd1, [dest1 + pos] + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1 + + XSTR [dest1 + pos], xd1 +%endmacro + +;; +;; Encodes less than 32 bytes of a single source and updates parity disk +;; +%macro ENCODE_LT_32B 1 +%define %%LEN %1 + ;; get next source vector + simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2 + ;; get 
next dest vector + simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2 + + GF_MUL_XOR VEX, x0, xgft1, xret1, xd1 + + lea dest1, [dest1 + pos] + simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2 +%endmacro + +align 16 +global gf_vect_mad_avx2_gfni, function +func(gf_vect_mad_avx2_gfni) + FUNC_SAVE + + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + shl vec, 3 ;Multiply by 8 + lea tmp, [mul_array + vec_i] + vbroadcastsd xgft1, [tmp] + + cmp len, 96 + jl .len_lt_96 + +.loop96: + ENCODE_96B ;; loop on 96 bytes at a time + add pos, 96 + sub len, 96 + cmp len, 96 + jge .loop96 + +.len_lt_96: + cmp len, 64 + jl .len_lt_64 + + ENCODE_64B ;; encode next 64 bytes + + add pos, 64 + sub len, 64 + +.len_lt_64: + cmp len, 32 + jl .len_lt_32 + + ENCODE_32B ;; encode next 32 bytes + + add pos, 32 + sub len, 32 + +.len_lt_32: + cmp len, 0 + jle .exit + + ENCODE_LT_32B len ;; encode final bytes + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm index adc2acf3e8..6f1bf35197 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm @@ -44,7 +44,7 @@ %define arg5 r9 %define tmp r11 %define return rax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -101,8 +101,8 @@ %else ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST - %define XLDR vmovdqa - %define XSTR vmovdqa + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 %else %define XLDR vmovntdqa %define XSTR vmovntdq @@ -127,13 +127,8 @@ section .text %define xmask0f zmm8 align 16 -global gf_vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION +global gf_vect_mad_avx512, function func(gf_vect_mad_avx512) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_mad_avx512) -%endif - FUNC_SAVE sub len, 64 jl .return_fail diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.patch deleted file mode 100644 index 9a20fa281a..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.patch +++ /dev/null @@ -1,6 +0,0 @@ -132,136d131 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_mad_avx512) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512_gfni.asm new file mode 100644 index 0000000000..09073ce157 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512_gfni.asm @@ -0,0 +1,175 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2023 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest); +;;; + +%include "reg_sizes.asm" +%include "gf_vect_gfni.inc" + +%if AS_FEATURE_LEVEL >= 10 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved and loaded + %define arg5 r13 + %define tmp r11 + %define stack_size 3*8 + %define arg(x) [rsp + stack_size + 8 + 8*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] + add rsp, stack_size + %endmacro +%endif + +;;; gf_vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest arg5 +%define pos rax + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa64 + %define XSTR vmovdqa64 + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define x0 zmm0 +%define xd zmm1 +%define xgft1 zmm2 +%define xret1 zmm3 + +;; +;; Encodes 64 bytes of a single source into 64 bytes (single parity disk) +;; +%macro ENCODE_64B 0-1 +%define %%KMASK %1 + +%if %0 == 1 + vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector + vmovdqu8 xd{%%KMASK}, [dest + pos] ;Get next dest vector +%else + XLDR x0, [src + pos] ;Get next source vector + XLDR xd, [dest + pos] ;Get next dest vector +%endif + + GF_MUL_XOR EVEX, x0, xgft1, xret1, xd + +%if %0 == 1 + vmovdqu8 [dest + pos]{%%KMASK}, xd +%else + XSTR [dest + pos], xd +%endif +%endmacro + +align 16 +global gf_vect_mad_avx512_gfni, function +func(gf_vect_mad_avx512_gfni) + FUNC_SAVE + xor pos, pos + shl vec_i, 3 ;Multiply by 8 + + vbroadcastf32x2 xgft1, [vec_i + mul_array] + + cmp len, 64 + jl .len_lt_64 +.loop64: + ENCODE_64B + + add pos, 64 ;Loop on 64 bytes at a time + sub len, 64 + cmp len, 64 + jge .loop64 + +.len_lt_64: + cmp len, 0 + jle .exit + + xor tmp, tmp + bts tmp, len + dec tmp + kmovq k1, tmp + + ENCODE_64B k1 + +.exit: + vzeroupper + + FUNC_RESTORE + ret + +endproc_frame +%endif ; if AS_FEATURE_LEVEL >= 10 diff --git 
a/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm index ea48612324..c3afe72041 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm @@ -82,7 +82,7 @@ %define return rax %define return.w eax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE %endif @@ -131,13 +131,8 @@ section .text align 16 -global gf_vect_mad_sse:ISAL_SYM_TYPE_FUNCTION +global gf_vect_mad_sse, function func(gf_vect_mad_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_mad_sse) -%endif - FUNC_SAVE sub len, 16 jl .return_fail @@ -197,6 +192,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_vect_mad_sse, 00, 01, 0200 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.patch deleted file mode 100644 index 9af95a1e02..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -136,140d135 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_mad_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_mad_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c index e2cbc1063d..3a552b2c08 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c @@ -31,7 +31,7 @@ #include <stdlib.h> #include <string.h> // for memset, memcmp #include "erasure_code.h" -// #include "types.h" +#include "test.h" #ifndef ALIGN_SIZE # define ALIGN_SIZE 32 @@ -51,7 +51,7 @@ #define str(s) #s #define xstr(s) str(s) -#define TEST_LEN 2048 +#define TEST_LEN 8192 #define TEST_SIZE (TEST_LEN/2) #define TEST_MEM TEST_SIZE #define TEST_LOOPS 20000 @@ -249,7 +249,9 @@ int main(int argc, char *argv[]) } } +#ifdef TEST_VERBOSE putchar('.'); +#endif // Rand data test @@ -294,7 +296,9 @@ int main(int argc, char *argv[]) } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Rand data test with varied parameters @@ -340,7 +344,9 @@ int main(int argc, char *argv[]) } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } } @@ -390,7 +396,9 @@ int main(int argc, char *argv[]) } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Test rand ptr alignment if available @@ -462,7 +470,9 @@ int main(int argc, char *argv[]) } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Test all size alignment @@ -509,7 +519,9 @@ int main(int argc, char *argv[]) } } +#ifdef TEST_VERBOSE putchar('.'); +#endif } diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.patch deleted file mode 100644 index 21bbfaa667..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.patch +++ /dev/null @@ -1,4 +0,0 @@ -34c34 -< // #include "types.h" ---- -> #include "types.h" diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm index 86121b298a..d8d8c4c050 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm @@ -38,11 +38,8 @@ %define arg1 rsi %define arg2 rdx %define arg3 rcx - %define arg4 r8 - %define arg5 r9 - %define tmp r11 %define return rax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE @@ -56,11 +53,11 @@ %define func(x) proc_frame x 
%macro FUNC_SAVE 0 alloc_stack stack_size - save_xmm128 xmm6, 0*16 - save_xmm128 xmm7, 1*16 - save_xmm128 xmm13, 2*16 - save_xmm128 xmm14, 3*16 - save_xmm128 xmm15, 4*16 + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm13 + vmovdqa [rsp + 3*16], xmm14 + vmovdqa [rsp + 4*16], xmm15 end_prolog %endmacro @@ -81,6 +78,7 @@ %define src arg2 %define dest arg3 %define pos return +%define tmp r11 ;;; Use Non-temporal load/stor @@ -111,13 +109,16 @@ section .text %define xtmp2c xmm7 align 16 -global gf_vect_mul_avx:ISAL_SYM_TYPE_FUNCTION +global gf_vect_mul_avx, function func(gf_vect_mul_avx) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_mul_avx:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_mul_avx) -%endif + + ; Check if length is multiple of 32 bytes + mov tmp, len + and tmp, 0x1f + jnz return_fail + FUNC_SAVE + mov pos, 0 vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte vmovdqu xgft_lo, [mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ... @@ -144,14 +145,13 @@ loop32: XSTR [dest+pos-16], xtmp2b ;Store +16B result jl loop32 + FUNC_RESTORE return_pass: - FUNC_RESTORE - sub pos, len + xor return, return ret return_fail: - FUNC_RESTORE mov return, 1 ret @@ -163,6 +163,3 @@ align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_vect_mul_avx, 01, 03, 0036 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.patch deleted file mode 100644 index 99d4bd2d35..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.patch +++ /dev/null @@ -1,5 +0,0 @@ -116,119d115 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_mul_avx:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_mul_avx) -< %endif diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c index 81f04c4443..5ac40cd079 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c @@ -63,7 +63,10 @@ int main(int argc, char *argv[]) for (i = 0; i < TEST_SIZE; i++) buff1[i] = rand(); - gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2); + if (gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2) != 0) { + printf("fail fill with rand data\n"); + return 1; + } for (i = 0; i < TEST_SIZE; i++) if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { @@ -72,8 +75,10 @@ int main(int argc, char *argv[]) return 1; } - gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3); - + if (gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3) != 0) { + printf("fail fill with rand data for buff1\n"); + return -1; + } // Check reference function for (i = 0; i < TEST_SIZE; i++) if (buff2[i] != buff3[i]) { @@ -89,7 +94,10 @@ int main(int argc, char *argv[]) printf("Random tests "); for (a = 0; a != 255; a++) { gf_vect_mul_init(a, gf_const_tbl); - gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2); + if (gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2) != 0) { + printf("fail random tests\n"); + return 1; + } for (i = 0; i < TEST_SIZE; i++) if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { @@ -97,7 +105,9 @@ int main(int argc, char *argv[]) i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i])); return 1; } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Run tests at end of buffer for Electric Fence @@ -110,7 +120,11 @@ int main(int argc, char *argv[]) efence_buff1 = buff1 + size; efence_buff2 = buff2 + size; - gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, 
efence_buff1, efence_buff2); + if (gf_vect_mul_base + (TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2) != 0) { + printf("fail tests at end of buffer\n"); + return -1; + } for (i = 0; i < TEST_SIZE - size; i++) if (gf_mul_erasure(a, efence_buff1[i]) != efence_buff2[i]) { @@ -121,7 +135,9 @@ int main(int argc, char *argv[]) return 1; } +#ifdef TEST_VERBOSE putchar('.'); +#endif } printf(" done: Pass\n"); diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.patch deleted file mode 100644 index 84c965985f..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.patch +++ /dev/null @@ -1,28 +0,0 @@ -69c69 -< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { ---- -> if (gf_mul(a, buff1[i]) != buff2[i]) { -71c71 -< gf_mul_erasure(2, buff1[i])); ---- -> gf_mul(2, buff1[i])); -81c81 -< i, a, buff1[i], buff2[i], gf_mul_erasure(a, buff1[i])); ---- -> i, a, buff1[i], buff2[i], gf_mul(a, buff1[i])); -95c95 -< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { ---- -> if (gf_mul(a, buff1[i]) != buff2[i]) { -97c97 -< i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i])); ---- -> i, a, buff1[i], buff2[i], gf_mul(2, buff1[i])); -116c116 -< if (gf_mul_erasure(a, efence_buff1[i]) != efence_buff2[i]) { ---- -> if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) { -118c118 -< i, efence_buff1[i], efence_buff2[i], gf_mul_erasure(2, ---- -> i, efence_buff1[i], efence_buff2[i], gf_mul(2, diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c b/contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c index 58194ccebc..ae41cee43e 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c @@ -33,21 +33,22 @@ #include "erasure_code.h" #include "test.h" -//#define CACHED_TEST -#ifdef CACHED_TEST +#ifndef GT_L3_CACHE +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +#endif + +#if !defined(COLD_TEST) && !defined(TEST_CUSTOM) // Cached test, loop many times over small dataset +# define TEST_SOURCES 10 # define TEST_LEN 8*1024 # define TEST_TYPE_STR "_warm" -#else -# ifndef TEST_CUSTOM +#elif defined (COLD_TEST) // Uncached test. Pull from large mem base. 
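As the gf_vect_mul_sse/avx hunks in this change show, the kernels now fail fast (return nonzero) when len is not a multiple of 32 bytes, and the reworked tests check every return code instead of ignoring it. A hedged usage sketch assembled only from calls that appear in these tests; buffer sizes and error handling are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include "erasure_code.h"

int main(void)
{
        int len = 32 * 1024;                   /* must stay a multiple of 32 bytes */
        unsigned char gf_const_tbl[64], a = 2;
        unsigned char *src = malloc(len), *dest = malloc(len);

        if (src == NULL || dest == NULL)
                return 1;

        for (int i = 0; i < len; i++)
                src[i] = rand();

        gf_vect_mul_init(a, gf_const_tbl);     /* expand the constant into the table */
        if (gf_vect_mul(len, gf_const_tbl, src, dest) != 0) {
                printf("gf_vect_mul failed (len not a multiple of 32?)\n");
                free(src);
                free(dest);
                return 1;
        }

        /* dest[i] now holds gf_mul_erasure(a, src[i]) for every byte */
        free(src);
        free(dest);
        return 0;
}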
-# define TEST_SOURCES 10 -# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ -# define TEST_LEN GT_L3_CACHE / 2 -# define TEST_TYPE_STR "_cold" -# else -# define TEST_TYPE_STR "_cus" -# endif +# define TEST_SOURCES 10 +# define TEST_LEN (GT_L3_CACHE / 2) +# define TEST_TYPE_STR "_cold" +#elif defined (TEST_CUSTOM) +# define TEST_TYPE_STR "_cus" #endif #define TEST_MEM (2 * TEST_LEN) diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm index 01a3269d65..ddccb129e1 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm @@ -38,11 +38,8 @@ %define arg1 rsi %define arg2 rdx %define arg3 rcx - %define arg4 r8 - %define arg5 r9 - %define tmp r11 %define return rax - %define func(x) x: + %define func(x) x: endbranch %define FUNC_SAVE %define FUNC_RESTORE @@ -81,6 +78,7 @@ %define src arg2 %define dest arg3 %define pos return +%define tmp r11 ;;; Use Non-temporal load/stor @@ -112,14 +110,15 @@ section .text align 16 -global gf_vect_mul_sse:ISAL_SYM_TYPE_FUNCTION +global gf_vect_mul_sse, function func(gf_vect_mul_sse) -%ifidn __OUTPUT_FORMAT__, macho64 -global _gf_vect_mul_sse:ISAL_SYM_TYPE_FUNCTION -func(_gf_vect_mul_sse) -%endif + ; Check if length is multiple of 32 bytes + mov tmp, len + and tmp, 0x1f + jnz return_fail FUNC_SAVE + mov pos, 0 movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte movdqu xgft_lo, [mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ... @@ -152,15 +151,14 @@ loop32: cmp pos, len jl loop32 + FUNC_RESTORE return_pass: - sub pos, len - FUNC_RESTORE + xor return, return ret return_fail: mov return, 1 - FUNC_RESTORE ret endproc_frame @@ -170,6 +168,3 @@ section .data align 16 mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f - -;;; func core, ver, snum -slversion gf_vect_mul_sse, 00, 03, 0034 diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.patch deleted file mode 100644 index 93027e3cf7..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.patch +++ /dev/null @@ -1,6 +0,0 @@ -117,121d116 -< %ifidn __OUTPUT_FORMAT__, macho64 -< global _gf_vect_mul_sse:ISAL_SYM_TYPE_FUNCTION -< func(_gf_vect_mul_sse) -< %endif -< diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c index c34e88b889..7e6457879c 100644 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c +++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c @@ -31,14 +31,14 @@ #include <stdlib.h> #include "erasure_code.h" -#define TEST_SIZE (64*1024) +#define TEST_SIZE (128*1024) typedef unsigned char u8; int main(int argc, char *argv[]) { - int i; - u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2; + int i, ret = -1; + u8 *buff1 = NULL, *buff2 = NULL, *buff3 = NULL, gf_const_tbl[64], a = 2; int tsize; int align, size; unsigned char *efence_buff1; @@ -55,30 +55,35 @@ int main(int argc, char *argv[]) if (NULL == buff1 || NULL == buff2 || NULL == buff3) { printf("buffer alloc error\n"); - return -1; + goto exit; } // Fill with rand data for (i = 0; i < TEST_SIZE; i++) buff1[i] = rand(); - gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2); + if (gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2) != 0) { + printf("fail creating buff2\n"); + goto exit; + } for (i = 0; i < TEST_SIZE; i++) { if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i], 
gf_mul_erasure(2, buff1[i])); - return -1; + goto exit; } } - gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3); - + if (gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3) != 0) { + printf("fail fill with rand data\n"); + goto exit; + } // Check reference function for (i = 0; i < TEST_SIZE; i++) { if (buff2[i] != buff3[i]) { printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n", i, a, buff1[i], buff2[i], gf_mul_erasure(a, buff1[i])); - return -1; + goto exit; } } @@ -88,33 +93,43 @@ int main(int argc, char *argv[]) // Check each possible constant for (a = 0; a != 255; a++) { gf_vect_mul_init(a, gf_const_tbl); - gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2); + if (gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2) != 0) { + printf("fail creating buff2\n"); + goto exit; + } for (i = 0; i < TEST_SIZE; i++) if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n", i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i])); - return -1; + goto exit; } +#ifdef TEST_VERBOSE putchar('.'); +#endif } // Check buffer len for (tsize = TEST_SIZE; tsize > 0; tsize -= 32) { a = rand(); gf_vect_mul_init(a, gf_const_tbl); - gf_vect_mul(tsize, gf_const_tbl, buff1, buff2); + if (gf_vect_mul(tsize, gf_const_tbl, buff1, buff2) != 0) { + printf("fail creating buff2 (len %d)\n", tsize); + goto exit; + } for (i = 0; i < tsize; i++) if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n", i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i])); - return -1; + goto exit; } +#ifdef TEST_VERBOSE if (0 == tsize % (32 * 8)) { putchar('.'); fflush(0); } +#endif } // Run tests at end of buffer for Electric Fence @@ -135,24 +150,46 @@ int main(int argc, char *argv[]) printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, efence_buff1[i], efence_buff2[i], gf_mul_erasure(2, efence_buff1[i])); - return 1; + goto exit; } - gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3); - + if (gf_vect_mul_base + (TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3) != 0) { + printf("fail line up TEST_SIZE from end\n"); + goto exit; + } // Check reference function for (i = 0; i < TEST_SIZE - size; i++) if (efence_buff2[i] != efence_buff3[i]) { printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n", i, a, efence_buff2[i], efence_buff3[i], gf_mul_erasure(2, efence_buff1[i])); - return 1; + goto exit; } - +#ifdef TEST_VERBOSE putchar('.'); +#endif + } + + // Test all unsupported sizes up to TEST_SIZE + for (size = 0; size < TEST_SIZE; size++) { + if (size % align != 0 && gf_vect_mul(size, gf_const_tbl, buff1, buff2) == 0) { + printf + ("fail expecting nonzero return code for unaligned size param (%d)\n", + size); + goto exit; + } } printf(" done: Pass\n"); fflush(0); - return 0; + + ret = 0; + exit: + + free(buff1); + free(buff2); + free(buff3); + + return ret; } diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.patch deleted file mode 100644 index 4f5b5b575d..0000000000 --- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.patch +++ /dev/null @@ -1,40 +0,0 @@ -67c67 -< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { ---- -> if (gf_mul(a, buff1[i]) != buff2[i]) { -69c69 -< buff1[i], buff2[i], gf_mul_erasure(2, buff1[i])); ---- -> buff1[i], buff2[i], gf_mul(2, buff1[i])); -80c80 -< i, a, buff1[i], buff2[i], gf_mul_erasure(a, buff1[i])); ---- -> i, a, buff1[i], buff2[i], gf_mul(a, buff1[i])); -94c94 -< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { ---- -> if (gf_mul(a, 
buff1[i]) != buff2[i]) { -96c96 -< i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i])); ---- -> i, a, buff1[i], buff2[i], gf_mul(2, buff1[i])); -109c109 -< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) { ---- -> if (gf_mul(a, buff1[i]) != buff2[i]) { -111c111 -< i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i])); ---- -> i, a, buff1[i], buff2[i], gf_mul(2, buff1[i])); -134c134 -< if (gf_mul_erasure(a, efence_buff1[i]) != efence_buff2[i]) { ---- -> if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) { -137c137 -< gf_mul_erasure(2, efence_buff1[i])); ---- -> gf_mul(2, efence_buff1[i])); -148c148 -< gf_mul_erasure(2, efence_buff1[i])); ---- -> gf_mul(2, efence_buff1[i])); diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/Makefile.am b/contrib/libs/isa-l/erasure_code/ppc64le/Makefile.am new file mode 100644 index 0000000000..9d263ac22f --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/Makefile.am @@ -0,0 +1,15 @@ +lsrc_ppc64le += erasure_code/ppc64le/ec_base_vsx.c \ + erasure_code/ppc64le/gf_vect_mul_vsx.c \ + erasure_code/ppc64le/gf_vect_dot_prod_vsx.c \ + erasure_code/ppc64le/gf_vect_mad_vsx.c \ + erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c \ + erasure_code/ppc64le/gf_2vect_mad_vsx.c \ + erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c \ + erasure_code/ppc64le/gf_3vect_mad_vsx.c \ + erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c \ + erasure_code/ppc64le/gf_4vect_mad_vsx.c \ + erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c \ + erasure_code/ppc64le/gf_5vect_mad_vsx.c \ + erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c \ + erasure_code/ppc64le/gf_6vect_mad_vsx.c + diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.c new file mode 100644 index 0000000000..c3163a58ff --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.c @@ -0,0 +1,106 @@ +#include "erasure_code.h" +#include "ec_base_vsx.h" + +void gf_vect_dot_prod(int len, int vlen, unsigned char *v, + unsigned char **src, unsigned char *dest) +{ + gf_vect_dot_prod_vsx(len, vlen, v, src, dest); +} + +void gf_vect_mad(int len, int vec, int vec_i, unsigned char *v, + unsigned char *src, unsigned char *dest) +{ + gf_vect_mad_vsx(len, vec, vec_i, v, src, dest); + +} + +void ec_encode_data(int len, int srcs, int dests, unsigned char *v, + unsigned char **src, unsigned char **dest) +{ + if (len < 64) { + ec_encode_data_base(len, srcs, dests, v, src, dest); + return; + } + + while (dests >= 6) { + gf_6vect_dot_prod_vsx(len, srcs, v, src, dest); + v += 6 * srcs * 32; + dest += 6; + dests -= 6; + } + switch (dests) { + case 6: + gf_6vect_dot_prod_vsx(len, srcs, v, src, dest); + break; + case 5: + gf_5vect_dot_prod_vsx(len, srcs, v, src, dest); + break; + case 4: + gf_4vect_dot_prod_vsx(len, srcs, v, src, dest); + break; + case 3: + gf_3vect_dot_prod_vsx(len, srcs, v, src, dest); + break; + case 2: + gf_2vect_dot_prod_vsx(len, srcs, v, src, dest); + break; + case 1: + gf_vect_dot_prod_vsx(len, srcs, v, src, *dest); + break; + case 0: + break; + } +} + +void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *v, + unsigned char *data, unsigned char **dest) +{ + if (len < 64) { + ec_encode_data_update_base(len, k, rows, vec_i, v, data, dest); + return; + } + + while (rows >= 6) { + gf_6vect_mad_vsx(len, k, vec_i, v, data, dest); + v += 6 * k * 32; + dest += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_vsx(len, k, vec_i, v, data, dest); + break; + case 5: + gf_5vect_mad_vsx(len, k, vec_i, v, data, dest); + break; + case 4: 
+ gf_4vect_mad_vsx(len, k, vec_i, v, data, dest); + break; + case 3: + gf_3vect_mad_vsx(len, k, vec_i, v, data, dest); + break; + case 2: + gf_2vect_mad_vsx(len, k, vec_i, v, data, dest); + break; + case 1: + gf_vect_mad_vsx(len, k, vec_i, v, data, *dest); + break; + case 0: + break; + } +} + +int gf_vect_mul(int len, unsigned char *a, void *src, void *dest) +{ + /* Size must be aligned to 32 bytes */ + if ((len % 32) != 0) + return -1; + + gf_vect_mul_vsx(len, a, (unsigned char *)src, (unsigned char *)dest); + return 0; +} + +void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls) +{ + return ec_init_tables_base(k, rows, a, g_tbls); +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.h b/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.h new file mode 100644 index 0000000000..c808629a95 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.h @@ -0,0 +1,338 @@ +#ifndef _ERASURE_CODE_PPC64LE_H_ +#define _ERASURE_CODE_PPC64LE_H_ + +#include "erasure_code.h" +#include <altivec.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__ibmxl__) +#define EC_vec_xl(a, b) vec_xl_be(a, b) +#define EC_vec_permxor(va, vb, vc) __vpermxor(va, vb, vc) +#elif defined __GNUC__ && __GNUC__ >= 8 +#define EC_vec_xl(a, b) vec_xl_be(a, b) +#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vc) +#elif defined __GNUC__ && __GNUC__ >= 7 +#if defined _ARCH_PWR9 +#define EC_vec_xl(a, b) vec_vsx_ld(a, b) +#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc)) +#else +inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) { + vector unsigned char vc; + __asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "r" (off), "r" (ptr)); + return vc; +} +#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc)) +#endif +#else +#if defined _ARCH_PWR8 +inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) { + vector unsigned char vc; + __asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "r" (off), "r" (ptr)); + return vc; +} +#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc)) +#else +#error "This code is only supported on ppc64le." +#endif +#endif + +/** + * @brief GF(2^8) vector multiply. VSX version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constant and save to destination array. Can be used for erasure coding encode + * and decode update when only one source is available at a time. Function + * requires pre-calculation of a 32 byte constant array based on the input + * coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_mul_vsx(int len, unsigned char *gftbls, unsigned char *src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product. VSX version. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. 
+ * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product with two outputs. VSX version. + * + * Vector dot product optimized to calculate two outputs at a time. Does two + * GF(2^8) dot products across each byte of the input array and two constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 2*32*vlen byte constant array based on the two sets of input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with three outputs. VSX version. + * + * Vector dot product optimized to calculate three outputs at a time. Does three + * GF(2^8) dot products across each byte of the input array and three constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 3*32*vlen byte constant array based on the three sets of input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with four outputs. VSX version. + * + * Vector dot product optimized to calculate four outputs at a time. Does four + * GF(2^8) dot products across each byte of the input array and four constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 4*32*vlen byte constant array based on the four sets of input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with five outputs. VSX version. + * + * Vector dot product optimized to calculate five outputs at a time. 
Does five + * GF(2^8) dot products across each byte of the input array and five constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 5*32*vlen byte constant array based on the five sets of input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. Must >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with six outputs. VSX version. + * + * Vector dot product optimized to calculate six outputs at a time. Does six + * GF(2^8) dot products across each byte of the input array and six constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 6*32*vlen byte constant array based on the six sets of input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply accumulate. VSX version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constant and add to destination array. Can be used for erasure coding encode + * and decode update when only one source is available at a time. Function + * requires pre-calculation of a 32*vec byte constant array based on the input + * coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); +/** + * @brief GF(2^8) vector multiply with 2 accumulate. VSX version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. 
+ * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. VSX version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 4 accumulate. VSX version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 5 accumulate. VSX version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. 
+ * @returns none + */ +void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 6 accumulate. VSX version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires VSX + * + * @param len Length of each vector in bytes. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ +void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +#ifdef __cplusplus +} +#endif + +#endif //_ERASURE_CODE_PPC64LE_H_ diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c new file mode 100644 index 0000000000..3cb269ccef --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c @@ -0,0 +1,83 @@ +#include "ec_base_vsx.h" + +void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4; + vector unsigned char vYD, vYE, vYF, vYG; + vector unsigned char vhi0, vlo0, vhi1, vlo1; + int i, j, head; + + if (vlen < 128) { + gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]); + gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]); + + for (j = 1; j < vlen; j++) { + gf_2vect_mad_vsx(len, vlen, j, gftbls, src[j], dest); + } + return; + } + + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + + head = len % 64; + if (head != 0) { + gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0); + gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1); + } + + for (i = head; i < len - 63; i += 64) { + vY1 = vY1 ^ vY1; + vY2 = vY2 ^ vY2; + vY3 = vY3 ^ vY3; + vY4 = vY4 ^ vY4; + + vYD = vYD ^ vYD; + vYE = vYE ^ vYE; + vYF = vYF ^ vYF; + vYG = vYG ^ vYG; + + unsigned char *g0 = &gftbls[0 * 32 * vlen]; + unsigned char *g1 = &gftbls[1 * 32 * vlen]; + + for (j = 0; j < vlen; j++) { + s = (unsigned char *)src[j]; + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vlo0 = EC_vec_xl(0, g0); + vhi0 = EC_vec_xl(16, g0); + vlo1 = EC_vec_xl(0, g1); + vhi1 = EC_vec_xl(16, g1); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + g0 += 32; + g1 += 32; + } + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); 
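		/* Each outer iteration produces a 64-byte block per output:
		 * vY1/vY2 and vY3/vY4 hold bytes 0..31 of dest[0] and dest[1],
		 * and vYD..vYG, stored next, hold bytes 32..63. */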
+ + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c new file mode 100644 index 0000000000..621684a5fb --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c @@ -0,0 +1,65 @@ +#include "ec_base_vsx.h" + +void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4; + vector unsigned char vYD, vYE, vYF, vYG; + vector unsigned char vhi0, vlo0, vhi1, vlo1; + int i, head; + + s = (unsigned char *)src; + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + + head = len % 64; + if (head != 0) { + gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0); + gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1); + } + + vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + + for (i = head; i < len - 63; i += 64) { + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vY1 = vec_xl(0, t0 + i); + vY2 = vec_xl(16, t0 + i); + vYD = vec_xl(32, t0 + i); + vYE = vec_xl(48, t0 + i); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vY3 = vec_xl(0, t1 + i); + vY4 = vec_xl(16, t1 + i); + vYF = vec_xl(32, t1 + i); + vYG = vec_xl(48, t1 + i); + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c new file mode 100644 index 0000000000..23b72dc4ba --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c @@ -0,0 +1,104 @@ +#include "ec_base_vsx.h" + +void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1, *t2; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6; + vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI; + vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2; + int i, j, head; + + if (vlen < 128) { + gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]); + gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]); + gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]); + + for (j = 1; j < vlen; j++) { + gf_3vect_mad_vsx(len, vlen, j, gftbls, src[j], dest); + } + return; + } + + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + t2 = (unsigned char *)dest[2]; + + head = len % 64; + if (head != 0) { + 
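		/* The leading len % 64 bytes are computed by the scalar base
		 * routines, so the vector loop below can start at offset head
		 * and advance in whole 64-byte blocks. */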
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0); + gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1); + gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2); + } + + for (i = head; i < len - 63; i += 64) { + vY1 = vY1 ^ vY1; + vY2 = vY2 ^ vY2; + vY3 = vY3 ^ vY3; + vY4 = vY4 ^ vY4; + vY5 = vY5 ^ vY5; + vY6 = vY6 ^ vY6; + + vYD = vYD ^ vYD; + vYE = vYE ^ vYE; + vYF = vYF ^ vYF; + vYG = vYG ^ vYG; + vYH = vYH ^ vYH; + vYI = vYI ^ vYI; + + unsigned char *g0 = &gftbls[0 * 32 * vlen]; + unsigned char *g1 = &gftbls[1 * 32 * vlen]; + unsigned char *g2 = &gftbls[2 * 32 * vlen]; + + for (j = 0; j < vlen; j++) { + s = (unsigned char *)src[j]; + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vlo0 = EC_vec_xl(0, g0); + vhi0 = EC_vec_xl(16, g0); + vlo1 = EC_vec_xl(0, g1); + vhi1 = EC_vec_xl(16, g1); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vlo2 = vec_xl(0, g2); + vhi2 = vec_xl(16, g2); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1); + vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2); + vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3); + vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4); + + g0 += 32; + g1 += 32; + g2 += 32; + } + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vY5, 0, t2 + i); + vec_xst(vY6, 16, t2 + i); + + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + vec_xst(vYH, 32, t2 + i); + vec_xst(vYI, 48, t2 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c new file mode 100644 index 0000000000..ba90c1fdbf --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c @@ -0,0 +1,84 @@ +#include "ec_base_vsx.h" + +void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1, *t2; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6; + vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI; + vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2; + int i, head; + + s = (unsigned char *)src; + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + t2 = (unsigned char *)dest[2]; + + head = len % 64; + if (head != 0) { + gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0); + gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1); + gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2); + } + + vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5))); + vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5))); + + for (i = head; i < len - 63; i += 64) { + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); 
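		/* 64 bytes of the single source are loaded as four 16-byte
		 * vectors, then multiplied through the pre-loaded table halves
		 * (vlo0..vlo2, vhi0..vhi2) and XOR-accumulated into each of
		 * the three destinations. */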
+ + vY1 = vec_xl(0, t0 + i); + vY2 = vec_xl(16, t0 + i); + vYD = vec_xl(32, t0 + i); + vYE = vec_xl(48, t0 + i); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vY3 = vec_xl(0, t1 + i); + vY4 = vec_xl(16, t1 + i); + vYF = vec_xl(32, t1 + i); + vYG = vec_xl(48, t1 + i); + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vY5 = vec_xl(0, t2 + i); + vY6 = vec_xl(16, t2 + i); + vYH = vec_xl(32, t2 + i); + vYI = vec_xl(48, t2 + i); + + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + + vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1); + vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2); + vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3); + vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4); + + vec_xst(vY5, 0, t2 + i); + vec_xst(vY6, 16, t2 + i); + vec_xst(vYH, 32, t2 + i); + vec_xst(vYI, 48, t2 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c new file mode 100644 index 0000000000..e656544530 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c @@ -0,0 +1,124 @@ +#include "ec_base_vsx.h" + +void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1, *t2, *t3; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8; + vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK; + vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3; + int i, j, head; + + if (vlen < 128) { + gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]); + gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]); + gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]); + gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]); + + for (j = 1; j < vlen; j++) { + gf_4vect_mad_vsx(len, vlen, j, gftbls, src[j], dest); + } + return; + } + + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + t2 = (unsigned char *)dest[2]; + t3 = (unsigned char *)dest[3]; + + head = len % 64; + if (head != 0) { + gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0); + gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1); + gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2); + gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3); + } + + for (i = head; i < len - 63; i += 64) { + vY1 = vY1 ^ vY1; + vY2 = vY2 ^ vY2; + vY3 = vY3 ^ vY3; + vY4 = vY4 ^ vY4; + vY5 = vY5 ^ vY5; + vY6 = vY6 ^ vY6; + vY7 = vY7 ^ vY7; + vY8 = vY8 ^ vY8; + + vYD = vYD ^ vYD; + vYE = vYE ^ vYE; + vYF = vYF ^ vYF; + vYG = vYG ^ vYG; + vYH = vYH ^ vYH; + vYI = vYI ^ vYI; + vYJ = vYJ ^ vYJ; + vYK = vYK ^ vYK; + + unsigned char *g0 = &gftbls[0 * 32 * vlen]; + unsigned char *g1 = &gftbls[1 * 32 * vlen]; + unsigned char *g2 = &gftbls[2 * 32 * vlen]; + unsigned char *g3 = &gftbls[3 * 32 * vlen]; + + for (j = 0; j < vlen; j++) { + s = (unsigned char *)src[j]; + vX1 = vec_xl(0, s + i); + vX2 = 
vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vlo0 = EC_vec_xl(0, g0); + vhi0 = EC_vec_xl(16, g0); + vlo1 = EC_vec_xl(0, g1); + vhi1 = EC_vec_xl(16, g1); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vlo2 = vec_xl(0, g2); + vhi2 = vec_xl(16, g2); + vlo3 = vec_xl(0, g3); + vhi3 = vec_xl(16, g3); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1); + vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2); + vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3); + vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4); + + vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1); + vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2); + vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3); + vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4); + + g0 += 32; + g1 += 32; + g2 += 32; + g3 += 32; + } + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vY5, 0, t2 + i); + vec_xst(vY6, 16, t2 + i); + vec_xst(vY7, 0, t3 + i); + vec_xst(vY8, 16, t3 + i); + + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + vec_xst(vYH, 32, t2 + i); + vec_xst(vYI, 48, t2 + i); + vec_xst(vYJ, 32, t3 + i); + vec_xst(vYK, 48, t3 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c new file mode 100644 index 0000000000..7b236b6f81 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c @@ -0,0 +1,103 @@ +#include "ec_base_vsx.h" + +void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1, *t2, *t3; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8; + vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK; + vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3; + int i, head; + + s = (unsigned char *)src; + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + t2 = (unsigned char *)dest[2]; + t3 = (unsigned char *)dest[3]; + + head = len % 64; + if (head != 0) { + gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0); + gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1); + gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2); + gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3); + } + + vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5))); + vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5))); + vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5))); + vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5))); + + for (i = head; i < len - 63; i += 64) { + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vY1 = vec_xl(0, t0 + i); + vY2 = vec_xl(16, t0 + i); + vYD = vec_xl(32, t0 + i); + 
vYE = vec_xl(48, t0 + i); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vY3 = vec_xl(0, t1 + i); + vY4 = vec_xl(16, t1 + i); + vYF = vec_xl(32, t1 + i); + vYG = vec_xl(48, t1 + i); + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vY5 = vec_xl(0, t2 + i); + vY6 = vec_xl(16, t2 + i); + vYH = vec_xl(32, t2 + i); + vYI = vec_xl(48, t2 + i); + + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + + vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1); + vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2); + vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3); + vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4); + + vY7 = vec_xl(0, t3 + i); + vY8 = vec_xl(16, t3 + i); + vYJ = vec_xl(32, t3 + i); + vYK = vec_xl(48, t3 + i); + + vec_xst(vY5, 0, t2 + i); + vec_xst(vY6, 16, t2 + i); + vec_xst(vYH, 32, t2 + i); + vec_xst(vYI, 48, t2 + i); + + vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1); + vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2); + vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3); + vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4); + + vec_xst(vY7, 0, t3 + i); + vec_xst(vY8, 16, t3 + i); + vec_xst(vYJ, 32, t3 + i); + vec_xst(vYK, 48, t3 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c new file mode 100644 index 0000000000..e9eef0e638 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c @@ -0,0 +1,145 @@ +#include "ec_base_vsx.h" + +void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1, *t2, *t3, *t4; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA; + vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM; + vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4; + int i, j, head; + + if (vlen < 128) { + gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]); + gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]); + gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]); + gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]); + gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]); + + for (j = 1; j < vlen; j++) { + gf_5vect_mad_vsx(len, vlen, j, gftbls, src[j], dest); + } + return; + } + + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + t2 = (unsigned char *)dest[2]; + t3 = (unsigned char *)dest[3]; + t4 = (unsigned char *)dest[4]; + + head = len % 64; + if (head != 0) { + gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0); + gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1); + gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2); + gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3); + gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4); + } + + for (i = head; i < len - 63; i += 64) { + vY1 = vY1 ^ vY1; + vY2 = vY2 ^ vY2; + 
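		/* XORing a vector register with itself zeroes the accumulators
		 * for the next 64-byte block without loading a zero constant. */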
vY3 = vY3 ^ vY3; + vY4 = vY4 ^ vY4; + vY5 = vY5 ^ vY5; + vY6 = vY6 ^ vY6; + vY7 = vY7 ^ vY7; + vY8 = vY8 ^ vY8; + vY9 = vY9 ^ vY9; + vYA = vYA ^ vYA; + + vYD = vYD ^ vYD; + vYE = vYE ^ vYE; + vYF = vYF ^ vYF; + vYG = vYG ^ vYG; + vYH = vYH ^ vYH; + vYI = vYI ^ vYI; + vYJ = vYJ ^ vYJ; + vYK = vYK ^ vYK; + vYL = vYL ^ vYL; + vYM = vYM ^ vYM; + + unsigned char *g0 = &gftbls[0 * 32 * vlen]; + unsigned char *g1 = &gftbls[1 * 32 * vlen]; + unsigned char *g2 = &gftbls[2 * 32 * vlen]; + unsigned char *g3 = &gftbls[3 * 32 * vlen]; + unsigned char *g4 = &gftbls[4 * 32 * vlen]; + + for (j = 0; j < vlen; j++) { + s = (unsigned char *)src[j]; + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vlo0 = EC_vec_xl(0, g0); + vhi0 = EC_vec_xl(16, g0); + vlo1 = EC_vec_xl(0, g1); + vhi1 = EC_vec_xl(16, g1); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vlo2 = vec_xl(0, g2); + vhi2 = vec_xl(16, g2); + vlo3 = vec_xl(0, g3); + vhi3 = vec_xl(16, g3); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vlo4 = vec_xl(0, g4); + vhi4 = vec_xl(16, g4); + + vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1); + vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2); + vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3); + vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4); + + vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1); + vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2); + vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3); + vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4); + + vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1); + vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2); + vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3); + vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4); + + g0 += 32; + g1 += 32; + g2 += 32; + g3 += 32; + g4 += 32; + } + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vY5, 0, t2 + i); + vec_xst(vY6, 16, t2 + i); + vec_xst(vY7, 0, t3 + i); + vec_xst(vY8, 16, t3 + i); + vec_xst(vY9, 0, t4 + i); + vec_xst(vYA, 16, t4 + i); + + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + vec_xst(vYH, 32, t2 + i); + vec_xst(vYI, 48, t2 + i); + vec_xst(vYJ, 32, t3 + i); + vec_xst(vYK, 48, t3 + i); + vec_xst(vYL, 32, t4 + i); + vec_xst(vYM, 48, t4 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c new file mode 100644 index 0000000000..7bb7bb2115 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c @@ -0,0 +1,122 @@ +#include "ec_base_vsx.h" + +void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1, *t2, *t3, *t4; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA; + vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM; + vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4; + int i, head; + + s = (unsigned char *)src; + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + t2 = (unsigned char *)dest[2]; + t3 = (unsigned char *)dest[3]; + t4 = (unsigned char 
*)dest[4]; + + head = len % 64; + if (head != 0) { + gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0); + gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1); + gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2); + gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3); + gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4); + } + + vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5))); + vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5))); + vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5))); + vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5))); + vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5))); + vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5))); + + for (i = head; i < len - 63; i += 64) { + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vY1 = vec_xl(0, t0 + i); + vY2 = vec_xl(16, t0 + i); + vYD = vec_xl(32, t0 + i); + vYE = vec_xl(48, t0 + i); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vY3 = vec_xl(0, t1 + i); + vY4 = vec_xl(16, t1 + i); + vYF = vec_xl(32, t1 + i); + vYG = vec_xl(48, t1 + i); + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vY5 = vec_xl(0, t2 + i); + vY6 = vec_xl(16, t2 + i); + vYH = vec_xl(32, t2 + i); + vYI = vec_xl(48, t2 + i); + + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + + vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1); + vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2); + vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3); + vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4); + + vY7 = vec_xl(0, t3 + i); + vY8 = vec_xl(16, t3 + i); + vYJ = vec_xl(32, t3 + i); + vYK = vec_xl(48, t3 + i); + + vec_xst(vY5, 0, t2 + i); + vec_xst(vY6, 16, t2 + i); + vec_xst(vYH, 32, t2 + i); + vec_xst(vYI, 48, t2 + i); + + vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1); + vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2); + vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3); + vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4); + + vY9 = vec_xl(0, t4 + i); + vYA = vec_xl(16, t4 + i); + vYL = vec_xl(32, t4 + i); + vYM = vec_xl(48, t4 + i); + + vec_xst(vY7, 0, t3 + i); + vec_xst(vY8, 16, t3 + i); + vec_xst(vYJ, 32, t3 + i); + vec_xst(vYK, 48, t3 + i); + + vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1); + vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2); + vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3); + vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4); + + vec_xst(vY9, 0, t4 + i); + vec_xst(vYA, 16, t4 + i); + vec_xst(vYL, 32, t4 + i); + vec_xst(vYM, 48, t4 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c new file mode 100644 index 0000000000..ac918bd493 --- /dev/null +++ 
b/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c @@ -0,0 +1,166 @@ +#include "ec_base_vsx.h" + +void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC; + vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO; + vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2; + vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5; + int i, j, head; + + if (vlen < 128) { + gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]); + gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]); + gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]); + gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]); + gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]); + gf_vect_mul_vsx(len, &gftbls[5 * 32 * vlen], src[0], (unsigned char *)dest[5]); + + for (j = 1; j < vlen; j++) { + gf_6vect_mad_vsx(len, vlen, j, gftbls, src[j], dest); + } + return; + } + + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + t2 = (unsigned char *)dest[2]; + t3 = (unsigned char *)dest[3]; + t4 = (unsigned char *)dest[4]; + t5 = (unsigned char *)dest[5]; + + head = len % 64; + if (head != 0) { + gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0); + gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1); + gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2); + gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3); + gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4); + gf_vect_dot_prod_base(head, vlen, &gftbls[5 * 32 * vlen], src, t5); + } + + for (i = head; i < len - 63; i += 64) { + vY1 = vY1 ^ vY1; + vY2 = vY2 ^ vY2; + vY3 = vY3 ^ vY3; + vY4 = vY4 ^ vY4; + vY5 = vY5 ^ vY5; + vY6 = vY6 ^ vY6; + vY7 = vY7 ^ vY7; + vY8 = vY8 ^ vY8; + vY9 = vY9 ^ vY9; + vYA = vYA ^ vYA; + vYB = vYB ^ vYB; + vYC = vYC ^ vYC; + + vYD = vYD ^ vYD; + vYE = vYE ^ vYE; + vYF = vYF ^ vYF; + vYG = vYG ^ vYG; + vYH = vYH ^ vYH; + vYI = vYI ^ vYI; + vYJ = vYJ ^ vYJ; + vYK = vYK ^ vYK; + vYL = vYL ^ vYL; + vYM = vYM ^ vYM; + vYN = vYN ^ vYN; + vYO = vYO ^ vYO; + + unsigned char *g0 = &gftbls[0 * 32 * vlen]; + unsigned char *g1 = &gftbls[1 * 32 * vlen]; + unsigned char *g2 = &gftbls[2 * 32 * vlen]; + unsigned char *g3 = &gftbls[3 * 32 * vlen]; + unsigned char *g4 = &gftbls[4 * 32 * vlen]; + unsigned char *g5 = &gftbls[5 * 32 * vlen]; + + for (j = 0; j < vlen; j++) { + s = (unsigned char *)src[j]; + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vlo0 = EC_vec_xl(0, g0); + vhi0 = EC_vec_xl(16, g0); + vlo1 = EC_vec_xl(0, g1); + vhi1 = EC_vec_xl(16, g1); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vlo2 = EC_vec_xl(0, g2); + vhi2 = EC_vec_xl(16, g2); + vlo3 = EC_vec_xl(0, g3); + vhi3 = EC_vec_xl(16, g3); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vlo4 = EC_vec_xl(0, g4); + vhi4 = EC_vec_xl(16, g4); + vlo5 = EC_vec_xl(0, g5); + 
vhi5 = EC_vec_xl(16, g5); + + vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1); + vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2); + vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3); + vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4); + + vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1); + vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2); + vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3); + vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4); + + vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1); + vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2); + vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3); + vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4); + + vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1); + vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2); + vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3); + vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4); + + g0 += 32; + g1 += 32; + g2 += 32; + g3 += 32; + g4 += 32; + g5 += 32; + } + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vY5, 0, t2 + i); + vec_xst(vY6, 16, t2 + i); + vec_xst(vY7, 0, t3 + i); + vec_xst(vY8, 16, t3 + i); + vec_xst(vY9, 0, t4 + i); + vec_xst(vYA, 16, t4 + i); + vec_xst(vYB, 0, t5 + i); + vec_xst(vYC, 16, t5 + i); + + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + vec_xst(vYH, 32, t2 + i); + vec_xst(vYI, 48, t2 + i); + vec_xst(vYJ, 32, t3 + i); + vec_xst(vYK, 48, t3 + i); + vec_xst(vYL, 32, t4 + i); + vec_xst(vYM, 48, t4 + i); + vec_xst(vYN, 32, t5 + i); + vec_xst(vYO, 48, t5 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c new file mode 100644 index 0000000000..43ea6c6966 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c @@ -0,0 +1,142 @@ +#include "ec_base_vsx.h" + +void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest) +{ + unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5; + vector unsigned char vX1, vX2, vX3, vX4; + vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC; + vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO; + vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2; + vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5; + int i, head; + + s = (unsigned char *)src; + t0 = (unsigned char *)dest[0]; + t1 = (unsigned char *)dest[1]; + t2 = (unsigned char *)dest[2]; + t3 = (unsigned char *)dest[3]; + t4 = (unsigned char *)dest[4]; + t5 = (unsigned char *)dest[5]; + + head = len % 64; + if (head != 0) { + gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0); + gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1); + gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2); + gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3); + gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4); + gf_vect_mad_base(head, vec, vec_i, &gftbls[5 * 32 * vec], src, t5); + } + + vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5))); + vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5))); + vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5))); + vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5))); + vhi3 = 
EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5))); + vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5))); + vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5))); + vlo5 = EC_vec_xl(0, gftbls + (((5 * vec) << 5) + (vec_i << 5))); + vhi5 = EC_vec_xl(16, gftbls + (((5 * vec) << 5) + (vec_i << 5))); + + for (i = head; i < len - 63; i += 64) { + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vY1 = vec_xl(0, t0 + i); + vY2 = vec_xl(16, t0 + i); + vYD = vec_xl(32, t0 + i); + vYE = vec_xl(48, t0 + i); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3); + vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vYD, 32, t0 + i); + vec_xst(vYE, 48, t0 + i); + + vY3 = vec_xl(0, t1 + i); + vY4 = vec_xl(16, t1 + i); + vYF = vec_xl(32, t1 + i); + vYG = vec_xl(48, t1 + i); + + vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1); + vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2); + vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3); + vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4); + + vec_xst(vY3, 0, t1 + i); + vec_xst(vY4, 16, t1 + i); + vec_xst(vYF, 32, t1 + i); + vec_xst(vYG, 48, t1 + i); + + vY5 = vec_xl(0, t2 + i); + vY6 = vec_xl(16, t2 + i); + vYH = vec_xl(32, t2 + i); + vYI = vec_xl(48, t2 + i); + + vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1); + vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2); + vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3); + vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4); + + vY7 = vec_xl(0, t3 + i); + vY8 = vec_xl(16, t3 + i); + vYJ = vec_xl(32, t3 + i); + vYK = vec_xl(48, t3 + i); + + vec_xst(vY5, 0, t2 + i); + vec_xst(vY6, 16, t2 + i); + vec_xst(vYH, 32, t2 + i); + vec_xst(vYI, 48, t2 + i); + + vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1); + vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2); + vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3); + vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4); + + vY9 = vec_xl(0, t4 + i); + vYA = vec_xl(16, t4 + i); + vYL = vec_xl(32, t4 + i); + vYM = vec_xl(48, t4 + i); + + vec_xst(vY7, 0, t3 + i); + vec_xst(vY8, 16, t3 + i); + vec_xst(vYJ, 32, t3 + i); + vec_xst(vYK, 48, t3 + i); + + vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1); + vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2); + vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3); + vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4); + + vYB = vec_xl(0, t5 + i); + vYC = vec_xl(16, t5 + i); + vYN = vec_xl(32, t5 + i); + vYO = vec_xl(48, t5 + i); + + vec_xst(vY9, 0, t4 + i); + vec_xst(vYA, 16, t4 + i); + vec_xst(vYL, 32, t4 + i); + vec_xst(vYM, 48, t4 + i); + + vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1); + vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2); + vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3); + vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4); + + vec_xst(vYB, 0, t5 + i); + vec_xst(vYC, 16, t5 + i); + vec_xst(vYN, 32, t5 + i); + vec_xst(vYO, 48, t5 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c new file mode 100644 index 0000000000..2f97e3421f --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c @@ -0,0 +1,85 @@ +#include "ec_base_vsx.h" + +void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest) +{ + unsigned char *s, *t0; + vector unsigned char vX1, vY1; + vector unsigned char vX2, vY2; + vector unsigned char vX3, vY3; + 
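	/* Eight vX/vY pairs: the single-output variant processes 128 bytes
	 * per iteration, twice the block size of the multi-output variants. */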
vector unsigned char vX4, vY4; + vector unsigned char vX5, vY5; + vector unsigned char vX6, vY6; + vector unsigned char vX7, vY7; + vector unsigned char vX8, vY8; + vector unsigned char vhi0, vlo0; + int i, j, head; + + if (vlen < 128) { + gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest); + + for (j = 1; j < vlen; j++) { + gf_vect_mad_vsx(len, vlen, j, gftbls, src[j], dest); + } + return; + } + + t0 = (unsigned char *)dest; + + head = len % 128; + if (head != 0) { + gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0); + } + + for (i = head; i < len - 127; i += 128) { + vY1 = vY1 ^ vY1; + vY2 = vY2 ^ vY2; + vY3 = vY3 ^ vY3; + vY4 = vY4 ^ vY4; + + vY5 = vY5 ^ vY5; + vY6 = vY6 ^ vY6; + vY7 = vY7 ^ vY7; + vY8 = vY8 ^ vY8; + + unsigned char *g0 = &gftbls[0 * 32 * vlen]; + + for (j = 0; j < vlen; j++) { + s = (unsigned char *)src[j]; + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vlo0 = EC_vec_xl(0, g0); + vhi0 = EC_vec_xl(16, g0); + + vX5 = vec_xl(64, s + i); + vX6 = vec_xl(80, s + i); + vX7 = vec_xl(96, s + i); + vX8 = vec_xl(112, s + i); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3); + vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vY5 = vY5 ^ EC_vec_permxor(vhi0, vlo0, vX5); + vY6 = vY6 ^ EC_vec_permxor(vhi0, vlo0, vX6); + vY7 = vY7 ^ EC_vec_permxor(vhi0, vlo0, vX7); + vY8 = vY8 ^ EC_vec_permxor(vhi0, vlo0, vX8); + + g0 += 32; + } + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vY3, 32, t0 + i); + vec_xst(vY4, 48, t0 + i); + + vec_xst(vY5, 64, t0 + i); + vec_xst(vY6, 80, t0 + i); + vec_xst(vY7, 96, t0 + i); + vec_xst(vY8, 112, t0 + i); + } + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c new file mode 100644 index 0000000000..a4810b96db --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c @@ -0,0 +1,48 @@ +#include "ec_base_vsx.h" + +void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest) +{ + unsigned char *s, *t0; + vector unsigned char vX1, vY1; + vector unsigned char vX2, vY2; + vector unsigned char vX3, vY3; + vector unsigned char vX4, vY4; + vector unsigned char vhi0, vlo0; + int i, head; + + s = (unsigned char *)src; + t0 = (unsigned char *)dest; + + head = len % 64; + if (head != 0) { + gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, dest); + } + + vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5))); + + for (i = head; i < len - 63; i += 64) { + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vY1 = vec_xl(0, t0 + i); + vY2 = vec_xl(16, t0 + i); + vY3 = vec_xl(32, t0 + i); + vY4 = vec_xl(48, t0 + i); + + vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2); + vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3); + vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4); + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vY3, 32, t0 + i); + vec_xst(vY4, 48, t0 + i); + } + + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c new file mode 100644 index 0000000000..812eb83d82 --- /dev/null +++ 
b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c @@ -0,0 +1,75 @@ +#include "ec_base_vsx.h" + +/* + * Same as gf_vect_mul_base in "ec_base.h" but without the size restriction. + */ +static void _gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, + unsigned char *dest) +{ + //2nd element of table array is ref value used to fill it in + unsigned char c = a[1]; + + while (len-- > 0) + *dest++ = gf_mul_erasure(c, *src++); + return 0; +} + +void gf_vect_mul_vsx(int len, unsigned char *gftbl, unsigned char *src, unsigned char *dest) +{ + unsigned char *s, *t0; + vector unsigned char vX1, vY1; + vector unsigned char vX2, vY2; + vector unsigned char vX3, vY3; + vector unsigned char vX4, vY4; + vector unsigned char vX5, vY5; + vector unsigned char vX6, vY6; + vector unsigned char vX7, vY7; + vector unsigned char vX8, vY8; + vector unsigned char vhi0, vlo0; + int i, head; + + s = (unsigned char *)src; + t0 = (unsigned char *)dest; + + head = len % 128; + if (head != 0) { + _gf_vect_mul_base(head, gftbl, src, dest); + } + + vlo0 = EC_vec_xl(0, gftbl); + vhi0 = EC_vec_xl(16, gftbl); + + for (i = head; i < len - 127; i += 128) { + vX1 = vec_xl(0, s + i); + vX2 = vec_xl(16, s + i); + vX3 = vec_xl(32, s + i); + vX4 = vec_xl(48, s + i); + + vX5 = vec_xl(64, s + i); + vX6 = vec_xl(80, s + i); + vX7 = vec_xl(96, s + i); + vX8 = vec_xl(112, s + i); + + vY1 = EC_vec_permxor(vhi0, vlo0, vX1); + vY2 = EC_vec_permxor(vhi0, vlo0, vX2); + vY3 = EC_vec_permxor(vhi0, vlo0, vX3); + vY4 = EC_vec_permxor(vhi0, vlo0, vX4); + + vY5 = EC_vec_permxor(vhi0, vlo0, vX5); + vY6 = EC_vec_permxor(vhi0, vlo0, vX6); + vY7 = EC_vec_permxor(vhi0, vlo0, vX7); + vY8 = EC_vec_permxor(vhi0, vlo0, vX8); + + vec_xst(vY1, 0, t0 + i); + vec_xst(vY2, 16, t0 + i); + vec_xst(vY3, 32, t0 + i); + vec_xst(vY4, 48, t0 + i); + + vec_xst(vY5, 64, t0 + i); + vec_xst(vY6, 80, t0 + i); + vec_xst(vY7, 96, t0 + i); + vec_xst(vY8, 112, t0 + i); + } + + return; +} diff --git a/contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make index 28acf0f7da..ad59a89cd1 100644 --- a/contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make index 1efbc5231f..4daab8f3ef 100644 --- a/contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make index 2b9e11fcaa..1fa89c9034 100644 --- a/contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/ya.make deleted file mode 100644 index 2a0c6baf2c..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/ya.make +++ /dev/null @@ -1,29 +0,0 @@ -PROGRAM() - -VERSION(2.28) - -LICENSE(BSD-3-Clause) - 
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - -NO_UTIL() - -SUBSCRIBER( - akozhikhov - g:base - g:yt -) - -ADDINCL(contrib/libs/isa-l/include) - -NO_COMPILER_WARNINGS() - -SRCS( - ../../gf_2vect_dot_prod_sse_test.c -) - -PEERDIR( - contrib/libs/isa-l/erasure_code -) - -END() diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt deleted file mode 100644 index 8f218b47cb..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt +++ /dev/null @@ -1,164 +0,0 @@ -====================BSD-3-Clause==================== - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -====================BSD-3-Clause==================== -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Arm Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -ISA-L is licensed using a BSD 3-clause [license]. All code submitted to - - -====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause==================== - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Arm Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================COPYRIGHT==================== - Copyright(c) 2011-2013 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== - Copyright(c) 2011-2017 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2015 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2016 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2018 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2019 Arm Corporation All rights reserved. - - -====================COPYRIGHT==================== -; Copyright(c) 2011-2019 Intel Corporation All rights reserved. 
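For orientation: the VSX kernels introduced earlier in this change (gf_vect_mul_vsx, gf_vect_dot_prod_vsx, gf_vect_mad_vsx) are not normally called directly; callers go through the public API declared in include/erasure_code.h, which dispatches to an architecture-specific implementation at run time. A minimal encode sketch under that assumption, with purely illustrative fragment counts and sizes:

    #include <stdio.h>
    #include <stdlib.h>
    #include "erasure_code.h"   /* from contrib/libs/isa-l/include */

    /* Illustrative geometry: K data fragments, P parity fragments, LEN bytes each. */
    enum { K = 10, P = 4, LEN = 4096 };

    int main(void)
    {
        unsigned char *frag[K + P];               /* frag[0..K-1] data, frag[K..K+P-1] parity */
        unsigned char encode_matrix[(K + P) * K];
        unsigned char gftbls[K * P * 32];         /* 32 bytes of expanded tables per matrix entry */
        int i;

        for (i = 0; i < K + P; i++)
            frag[i] = calloc(1, LEN);             /* fill frag[0..K-1] with real data in practice */

        gf_gen_cauchy1_matrix(encode_matrix, K + P, K);
        ec_init_tables(K, P, &encode_matrix[K * K], gftbls);

        /* Run-time dispatch selects a per-architecture kernel,
           e.g. the VSX dot-product/mad routines on ppc64le. */
        ec_encode_data(LEN, K, P, gftbls, frag, &frag[K]);

        printf("encoded %d data + %d parity fragments of %d bytes\n", K, P, LEN);
        for (i = 0; i < K + P; i++)
            free(frag[i]);
        return 0;
    }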
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/ya.make deleted file mode 100644 index c4b11b139e..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/ya.make +++ /dev/null @@ -1,29 +0,0 @@ -PROGRAM() - -VERSION(2.28) - -LICENSE(BSD-3-Clause) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - -NO_UTIL() - -SUBSCRIBER( - akozhikhov - g:base - g:yt -) - -ADDINCL(contrib/libs/isa-l/include) - -NO_COMPILER_WARNINGS() - -SRCS( - ../../gf_3vect_dot_prod_sse_test.c -) - -PEERDIR( - contrib/libs/isa-l/erasure_code -) - -END() diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt deleted file mode 100644 index 8f218b47cb..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt +++ /dev/null @@ -1,164 +0,0 @@ -====================BSD-3-Clause==================== - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -====================BSD-3-Clause==================== -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Arm Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -ISA-L is licensed using a BSD 3-clause [license]. All code submitted to - - -====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause==================== - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Arm Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================COPYRIGHT==================== - Copyright(c) 2011-2013 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== - Copyright(c) 2011-2017 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2015 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2016 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2018 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2019 Arm Corporation All rights reserved. - - -====================COPYRIGHT==================== -; Copyright(c) 2011-2019 Intel Corporation All rights reserved. 
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/ya.make deleted file mode 100644 index 758e7463a1..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/ya.make +++ /dev/null @@ -1,29 +0,0 @@ -PROGRAM() - -VERSION(2.28) - -LICENSE(BSD-3-Clause) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - -NO_UTIL() - -SUBSCRIBER( - akozhikhov - g:base - g:yt -) - -ADDINCL(contrib/libs/isa-l/include) - -NO_COMPILER_WARNINGS() - -SRCS( - ../../gf_4vect_dot_prod_sse_test.c -) - -PEERDIR( - contrib/libs/isa-l/erasure_code -) - -END() diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt deleted file mode 100644 index 8f218b47cb..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt +++ /dev/null @@ -1,164 +0,0 @@ -====================BSD-3-Clause==================== - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -====================BSD-3-Clause==================== -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Arm Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -ISA-L is licensed using a BSD 3-clause [license]. All code submitted to - - -====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause==================== - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Arm Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================COPYRIGHT==================== - Copyright(c) 2011-2013 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== - Copyright(c) 2011-2017 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2015 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2016 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2018 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2019 Arm Corporation All rights reserved. - - -====================COPYRIGHT==================== -; Copyright(c) 2011-2019 Intel Corporation All rights reserved. 
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/ya.make deleted file mode 100644 index 4c389b8188..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/ya.make +++ /dev/null @@ -1,29 +0,0 @@ -PROGRAM() - -VERSION(2.28) - -LICENSE(BSD-3-Clause) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - -NO_UTIL() - -SUBSCRIBER( - akozhikhov - g:base - g:yt -) - -ADDINCL(contrib/libs/isa-l/include) - -NO_COMPILER_WARNINGS() - -SRCS( - ../../gf_5vect_dot_prod_sse_test.c -) - -PEERDIR( - contrib/libs/isa-l/erasure_code -) - -END() diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt deleted file mode 100644 index 8f218b47cb..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt +++ /dev/null @@ -1,164 +0,0 @@ -====================BSD-3-Clause==================== - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -====================BSD-3-Clause==================== -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Arm Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================BSD-3-Clause==================== -ISA-L is licensed using a BSD 3-clause [license]. All code submitted to - - -====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause==================== - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Arm Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -====================COPYRIGHT==================== - Copyright(c) 2011-2013 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== - Copyright(c) 2011-2017 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2015 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2016 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2011-2018 Intel Corporation All rights reserved. - - -====================COPYRIGHT==================== -# Copyright(c) 2019 Arm Corporation All rights reserved. - - -====================COPYRIGHT==================== -; Copyright(c) 2011-2019 Intel Corporation All rights reserved. 
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/ya.make deleted file mode 100644 index 09f782f76a..0000000000 --- a/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/ya.make +++ /dev/null @@ -1,29 +0,0 @@ -PROGRAM() - -VERSION(2.28) - -LICENSE(BSD-3-Clause) - -LICENSE_TEXTS(.yandex_meta/licenses.list.txt) - -NO_UTIL() - -SUBSCRIBER( - akozhikhov - g:base - g:yt -) - -ADDINCL(contrib/libs/isa-l/include) - -NO_COMPILER_WARNINGS() - -SRCS( - ../../gf_6vect_dot_prod_sse_test.c -) - -PEERDIR( - contrib/libs/isa-l/erasure_code -) - -END() diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make index dc59e7e3d3..fc897ca8a2 100644 --- a/contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make index 89d263fa19..e396b42fdf 100644 --- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make index 4767c00d3a..20897781ea 100644 --- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make index 2d776e53f4..0bd839954d 100644 --- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make index 2725cbf67e..37f0606713 100644 --- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make index 614ceddd89..c22dd39c6d 100644 --- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make @@ -1,6 +1,6 @@ PROGRAM() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) diff --git a/contrib/libs/isa-l/erasure_code/ut/ya.make b/contrib/libs/isa-l/erasure_code/ut/ya.make index 637eac1966..7bc4eff15e 100644 --- a/contrib/libs/isa-l/erasure_code/ut/ya.make +++ b/contrib/libs/isa-l/erasure_code/ut/ya.make @@ -5,7 +5,7 @@ SUBSCRIBER( EXECTEST() -VERSION(2.28) +VERSION(2.31) LICENSE(BSD-3-Clause) @@ -19,16 +19,6 @@ RUN(erasure_code_base_test) RUN(erasure_code_update_test) -RUN(gf_2vect_dot_prod_sse_test) - -RUN(gf_3vect_dot_prod_sse_test) - -RUN(gf_4vect_dot_prod_sse_test) - -RUN(gf_5vect_dot_prod_sse_test) - 
-RUN(gf_6vect_dot_prod_sse_test) - RUN(gf_inverse_test) RUN(gf_vect_dot_prod_base_test) @@ -45,11 +35,6 @@ DEPENDS( contrib/libs/isa-l/erasure_code/ut/erasure_code_test contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test - contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test - contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test - contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test - contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test - contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test contrib/libs/isa-l/erasure_code/ut/gf_inverse_test contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test @@ -64,11 +49,6 @@ RECURSE_FOR_TESTS( erasure_code_test erasure_code_base_test erasure_code_update_test - gf_2vect_dot_prod_sse_test - gf_3vect_dot_prod_sse_test - gf_4vect_dot_prod_sse_test - gf_5vect_dot_prod_sse_test - gf_6vect_dot_prod_sse_test gf_inverse_test gf_vect_dot_prod_base_test gf_vect_dot_prod_test diff --git a/contrib/libs/isa-l/erasure_code/ya.make b/contrib/libs/isa-l/erasure_code/ya.make index a1c30ae5be..0f2c15a27f 100644 --- a/contrib/libs/isa-l/erasure_code/ya.make +++ b/contrib/libs/isa-l/erasure_code/ya.make @@ -4,7 +4,7 @@ LICENSE(BSD-3-Clause) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -VERSION(2.28) +VERSION(2.31) NO_UTIL() @@ -17,11 +17,6 @@ ADDINCL( contrib/libs/isa-l/include ) -SRCS( - ec_base.c - ec_highlevel_func.c -) - IF (ARCH_X86_64) IF (OS_DARWIN) SRCS( @@ -34,52 +29,88 @@ ELSE() ENDIF() SRCS( - gf_vect_mul_sse.asm - gf_vect_mul_avx.asm - gf_vect_dot_prod_sse.asm - gf_vect_dot_prod_avx.asm - gf_vect_dot_prod_avx2.asm - gf_vect_dot_prod_avx512.asm - gf_2vect_dot_prod_sse.asm - gf_2vect_dot_prod_avx.asm + ec_base.c + ec_highlevel_func.c gf_2vect_dot_prod_avx2.asm + gf_2vect_dot_prod_avx2_gfni.asm gf_2vect_dot_prod_avx512.asm - gf_3vect_dot_prod_sse.asm - gf_3vect_dot_prod_avx.asm - gf_3vect_dot_prod_avx2.asm - gf_3vect_dot_prod_avx512.asm - gf_4vect_dot_prod_sse.asm - gf_4vect_dot_prod_avx.asm - gf_4vect_dot_prod_avx2.asm - gf_4vect_dot_prod_avx512.asm - gf_5vect_dot_prod_sse.asm - gf_5vect_dot_prod_avx.asm - gf_5vect_dot_prod_avx2.asm - gf_6vect_dot_prod_sse.asm - gf_6vect_dot_prod_avx.asm - gf_6vect_dot_prod_avx2.asm - gf_vect_mad_sse.asm - gf_vect_mad_avx.asm - gf_vect_mad_avx2.asm - gf_vect_mad_avx512.asm - gf_2vect_mad_sse.asm - gf_2vect_mad_avx.asm + gf_2vect_dot_prod_avx512_gfni.asm + gf_2vect_dot_prod_avx.asm + gf_2vect_dot_prod_sse.asm gf_2vect_mad_avx2.asm + gf_2vect_mad_avx2_gfni.asm gf_2vect_mad_avx512.asm - gf_3vect_mad_sse.asm - gf_3vect_mad_avx.asm + gf_2vect_mad_avx512_gfni.asm + gf_2vect_mad_avx.asm + gf_2vect_mad_sse.asm + gf_3vect_dot_prod_avx2.asm + gf_3vect_dot_prod_avx2_gfni.asm + gf_3vect_dot_prod_avx512.asm + gf_3vect_dot_prod_avx512_gfni.asm + gf_3vect_dot_prod_avx.asm + gf_3vect_dot_prod_sse.asm gf_3vect_mad_avx2.asm + gf_3vect_mad_avx2_gfni.asm gf_3vect_mad_avx512.asm - gf_4vect_mad_sse.asm - gf_4vect_mad_avx.asm + gf_3vect_mad_avx512_gfni.asm + gf_3vect_mad_avx.asm + gf_3vect_mad_sse.asm + gf_4vect_dot_prod_avx2.asm + gf_4vect_dot_prod_avx512.asm + gf_4vect_dot_prod_avx512_gfni.asm + gf_4vect_dot_prod_avx.asm + gf_4vect_dot_prod_sse.asm gf_4vect_mad_avx2.asm + gf_4vect_mad_avx2_gfni.asm gf_4vect_mad_avx512.asm - gf_5vect_mad_sse.asm - gf_5vect_mad_avx.asm + gf_4vect_mad_avx512_gfni.asm + gf_4vect_mad_avx.asm + gf_4vect_mad_sse.asm + gf_5vect_dot_prod_avx2.asm + 
gf_5vect_dot_prod_avx512.asm + gf_5vect_dot_prod_avx512_gfni.asm + gf_5vect_dot_prod_avx.asm + gf_5vect_dot_prod_sse.asm gf_5vect_mad_avx2.asm - gf_6vect_mad_sse.asm - gf_6vect_mad_avx.asm + gf_5vect_mad_avx2_gfni.asm + gf_5vect_mad_avx512.asm + gf_5vect_mad_avx512_gfni.asm + gf_5vect_mad_avx.asm + gf_5vect_mad_sse.asm + gf_6vect_dot_prod_avx2.asm + gf_6vect_dot_prod_avx512.asm + gf_6vect_dot_prod_avx512_gfni.asm + gf_6vect_dot_prod_avx.asm + gf_6vect_dot_prod_sse.asm gf_6vect_mad_avx2.asm + gf_6vect_mad_avx512.asm + gf_6vect_mad_avx512_gfni.asm + gf_6vect_mad_avx.asm + gf_6vect_mad_sse.asm + gf_vect_dot_prod_avx2.asm + gf_vect_dot_prod_avx2_gfni.asm + gf_vect_dot_prod_avx512.asm + gf_vect_dot_prod_avx512_gfni.asm + gf_vect_dot_prod_avx.asm + gf_vect_dot_prod_sse.asm + gf_vect_mad_avx2.asm + gf_vect_mad_avx2_gfni.asm + gf_vect_mad_avx512.asm + gf_vect_mad_avx512_gfni.asm + gf_vect_mad_avx.asm + gf_vect_mad_sse.asm + gf_vect_mul_avx.asm + gf_vect_mul_sse.asm +) +ELSEIF(ARCH_AARCH64) +SRCS( + ec_base.c + aarch64/ec_aarch64_dispatcher.c + aarch64/ec_aarch64_highlevel_func.c +) + +PEERDIR( + contrib/libs/isa-l/erasure_code/aarch64 ) ENDIF() diff --git a/contrib/libs/isa-l/include/aarch64_label.h b/contrib/libs/isa-l/include/aarch64_label.h new file mode 100644 index 0000000000..a4e6d0609c --- /dev/null +++ b/contrib/libs/isa-l/include/aarch64_label.h @@ -0,0 +1,18 @@ +#ifndef __AARCH64_LABEL_H__ +#define __AARCH64_LABEL_H__ + +#ifdef __USER_LABEL_PREFIX__ +#define CONCAT1(a, b) CONCAT2(a, b) +#define CONCAT2(a, b) a ## b +#define cdecl(x) CONCAT1 (__USER_LABEL_PREFIX__, x) +#else +#define cdecl(x) x +#endif + +#ifdef __APPLE__ +#define ASM_DEF_RODATA .section __TEXT,__const +#else +#define ASM_DEF_RODATA .section .rodata +#endif + +#endif diff --git a/contrib/libs/isa-l/include/aarch64_multibinary.h b/contrib/libs/isa-l/include/aarch64_multibinary.h new file mode 100644 index 0000000000..6c77665fd6 --- /dev/null +++ b/contrib/libs/isa-l/include/aarch64_multibinary.h @@ -0,0 +1,347 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#ifndef __AARCH64_MULTIBINARY_H__ +#define __AARCH64_MULTIBINARY_H__ +#ifndef __aarch64__ +#error "This file is for aarch64 only" +#endif +#include "aarch64_label.h" +#ifdef __ASSEMBLY__ +/** + * # mbin_interface : the wrapper layer for isal-l api + * + * ## references: + * * https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=sysdeps/aarch64/dl-trampoline.S + * * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf + * * https://static.docs.arm.com/ihi0057/b/IHI0057B_aadwarf64.pdf?_ga=2.80574487.1870739014.1564969896-1634778941.1548729310 + * + * ## Usage: + * 1. Define dispather function + * 2. name must be \name\()_dispatcher + * 3. Prototype should be *"void * \name\()_dispatcher"* + * 4. The dispather should return the right function pointer , revision and a string information . + **/ +.macro mbin_interface name:req + .extern cdecl(\name\()_dispatcher) + .data + .balign 8 + .global cdecl(\name\()_dispatcher_info) +#ifndef __APPLE__ + .type \name\()_dispatcher_info,%object +#endif + cdecl(\name\()_dispatcher_info): + .quad \name\()_mbinit //func_entry +#ifndef __APPLE__ + .size \name\()_dispatcher_info,. - \name\()_dispatcher_info +#endif + .balign 8 + .text + \name\()_mbinit: + //save lp fp, sub sp + .cfi_startproc + stp x29, x30, [sp, -224]! 
+ + //add cfi directive to avoid GDB bt cmds error + //set cfi(Call Frame Information) + .cfi_def_cfa_offset 224 + .cfi_offset 29, -224 + .cfi_offset 30, -216 + + //save parameter/result/indirect result registers + stp x8, x9, [sp, 16] + .cfi_offset 8, -208 + .cfi_offset 9, -200 + stp x0, x1, [sp, 32] + .cfi_offset 0, -192 + .cfi_offset 1, -184 + stp x2, x3, [sp, 48] + .cfi_offset 2, -176 + .cfi_offset 3, -168 + stp x4, x5, [sp, 64] + .cfi_offset 4, -160 + .cfi_offset 5, -152 + stp x6, x7, [sp, 80] + .cfi_offset 6, -144 + .cfi_offset 7, -136 + stp q0, q1, [sp, 96] + .cfi_offset 64, -128 + .cfi_offset 65, -112 + stp q2, q3, [sp, 128] + .cfi_offset 66, -96 + .cfi_offset 67, -80 + stp q4, q5, [sp, 160] + .cfi_offset 68, -64 + .cfi_offset 69, -48 + stp q6, q7, [sp, 192] + .cfi_offset 70, -32 + .cfi_offset 71, -16 + + /** + * The dispatcher functions have the following prototype: + * void * function_dispatcher(void) + * As the dispatcher is returning a struct, by the AAPCS, + */ + + + bl cdecl(\name\()_dispatcher) + //restore temp/indirect result registers + ldp x8, x9, [sp, 16] + .cfi_restore 8 + .cfi_restore 9 + + // save function entry + str x0, [x9] + + //restore parameter/result registers + ldp x0, x1, [sp, 32] + .cfi_restore 0 + .cfi_restore 1 + ldp x2, x3, [sp, 48] + .cfi_restore 2 + .cfi_restore 3 + ldp x4, x5, [sp, 64] + .cfi_restore 4 + .cfi_restore 5 + ldp x6, x7, [sp, 80] + .cfi_restore 6 + .cfi_restore 7 + ldp q0, q1, [sp, 96] + .cfi_restore 64 + .cfi_restore 65 + ldp q2, q3, [sp, 128] + .cfi_restore 66 + .cfi_restore 67 + ldp q4, q5, [sp, 160] + .cfi_restore 68 + .cfi_restore 69 + ldp q6, q7, [sp, 192] + .cfi_restore 70 + .cfi_restore 71 + //save lp fp and sp + ldp x29, x30, [sp], 224 + //restore cfi setting + .cfi_restore 30 + .cfi_restore 29 + .cfi_def_cfa_offset 0 + .cfi_endproc + + .global cdecl(\name) +#ifndef __APPLE__ + .type \name,%function +#endif + .align 2 + cdecl(\name\()): +#ifndef __APPLE__ + adrp x9, :got:\name\()_dispatcher_info + ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info] +#else + adrp x9, cdecl(\name\()_dispatcher_info)@GOTPAGE + ldr x9, [x9, #cdecl(\name\()_dispatcher_info)@GOTPAGEOFF] +#endif + ldr x10,[x9] + br x10 +#ifndef __APPLE__ + .size \name,. - \name +#endif +.endm + +/** + * mbin_interface_base is used for the interfaces which have only + * noarch implementation + */ +.macro mbin_interface_base name:req, base:req + .extern \base + .data + .balign 8 + .global cdecl(\name\()_dispatcher_info) +#ifndef __APPLE__ + .type \name\()_dispatcher_info,%object +#endif + cdecl(\name\()_dispatcher_info): + .quad \base //func_entry +#ifndef __APPLE__ + .size \name\()_dispatcher_info,. - \name\()_dispatcher_info +#endif + .balign 8 + .text + .global cdecl(\name) +#ifndef __APPLE__ + .type \name,%function +#endif + .align 2 + cdecl(\name\()): +#ifndef __APPLE__ + adrp x9, :got:cdecl(_\name\()_dispatcher_info) + ldr x9, [x9, #:got_lo12:cdecl(_\name\()_dispatcher_info)] +#else + adrp x9, cdecl(_\name\()_dispatcher_info)@GOTPAGE + ldr x9, [x9, #cdecl(_\name\()_dispatcher_info)@GOTPAGEOFF] +#endif + ldr x10,[x9] + br x10 +#ifndef __APPLE__ + .size \name,. 
- \name +#endif +.endm + +#else /* __ASSEMBLY__ */ +#include <stdint.h> +#if defined(__linux__) +#include <sys/auxv.h> +#include <asm/hwcap.h> +#elif defined(__APPLE__) +#define SYSCTL_PMULL_KEY "hw.optional.arm.FEAT_PMULL" // from macOS 12 FEAT_* sysctl infos are available +#define SYSCTL_CRC32_KEY "hw.optional.armv8_crc32" +#define SYSCTL_SVE_KEY "hw.optional.arm.FEAT_SVE" // this one is just a guess and need to check macOS update +#include <sys/sysctl.h> +#include <stddef.h> +static inline int sysctlEnabled(const char* name){ + int enabled; + size_t size = sizeof(enabled); + int status = sysctlbyname(name, &enabled, &size, NULL, 0); + return status ? 0 : enabled; +} +#endif + + +#define DEFINE_INTERFACE_DISPATCHER(name) \ + void * name##_dispatcher(void) + +#define PROVIDER_BASIC(name) \ + PROVIDER_INFO(name##_base) + +#define DO_DIGNOSTIC(x) _Pragma GCC diagnostic ignored "-W"#x +#define DO_PRAGMA(x) _Pragma (#x) +#define DIGNOSTIC_IGNORE(x) DO_PRAGMA(GCC diagnostic ignored #x) +#define DIGNOSTIC_PUSH() DO_PRAGMA(GCC diagnostic push) +#define DIGNOSTIC_POP() DO_PRAGMA(GCC diagnostic pop) + + +#define PROVIDER_INFO(_func_entry) \ + ({ DIGNOSTIC_PUSH() \ + DIGNOSTIC_IGNORE(-Wnested-externs) \ + extern void _func_entry(void); \ + DIGNOSTIC_POP() \ + _func_entry; \ + }) + +/** + * Micro-Architector definitions + * Reference: https://developer.arm.com/docs/ddi0595/f/aarch64-system-registers/midr_el1 + */ + +#define CPU_IMPLEMENTER_RESERVE 0x00 +#define CPU_IMPLEMENTER_ARM 0x41 + + +#define CPU_PART_CORTEX_A57 0xD07 +#define CPU_PART_CORTEX_A72 0xD08 +#define CPU_PART_NEOVERSE_N1 0xD0C + +#define MICRO_ARCH_ID(imp,part) \ + (((CPU_IMPLEMENTER_##imp&0xff)<<24)|((CPU_PART_##part&0xfff)<<4)) + +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1<<11) +#endif + +/** + * @brief get_micro_arch_id + * + * read micro-architector register instruction if possible.This function + * provides microarchitecture information and make microarchitecture optimization + * possible. + * + * Read system registers(MRS) is forbidden in userspace. If executed, it + * will raise illegal instruction error. Kernel provides a solution for + * this issue. The solution depends on HWCAP_CPUID flags. Reference(1) + * describes how to use it. It provides a "illegal insstruction" handler + * in kernel space, the handler will execute MRS and return the correct + * value to userspace. + * + * To avoid too many kernel trap, this function MUST be only called in + * dispatcher. And HWCAP must be match,That will make sure there are no + * illegal instruction errors. HWCAP_CPUID should be available to get the + * best performance. + * + * NOTICE: + * - HWCAP_CPUID should be available. Otherwise it returns reserve value + * - It MUST be called inside dispather. + * - It MUST meet the HWCAP requirements + * + * Example: + * DEFINE_INTERFACE_DISPATCHER(crc32_iscsi) + * { + * unsigned long auxval = getauxval(AT_HWCAP); + * // MUST do the judgement is MUST. 
+ * if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) { + * switch (get_micro_arch_id()) { + * case MICRO_ARCH_ID(ARM, CORTEX_A57): + * return PROVIDER_INFO(crc32_pmull_crc_for_a57); + * case MICRO_ARCH_ID(ARM, CORTEX_A72): + * return PROVIDER_INFO(crc32_pmull_crc_for_a72); + * case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + * return PROVIDER_INFO(crc32_pmull_crc_for_n1); + * case default: + * return PROVIDER_INFO(crc32_pmull_crc_for_others); + * } + * } + * return PROVIDER_BASIC(crc32_iscsi); + * } + * KNOWN ISSUE: + * On a heterogeneous system (big.LITTLE), it will work but the performance + * might not be the best one as expected. + * + * If this function is called on the big core, it will return the function + * optimized for the big core. + * + * If execution is then scheduled to the little core. It will still work (1), + * but the function won't be optimized for the little core, thus the performance + * won't be as expected. + * + * References: + * - [CPU Feature detection](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm64/cpu-feature-registers.rst?h=v5.5) + * + */ +static inline uint32_t get_micro_arch_id(void) +{ + uint32_t id=CPU_IMPLEMENTER_RESERVE; +#ifndef __APPLE__ + if ((getauxval(AT_HWCAP) & HWCAP_CPUID)) { + /** Here will trap into kernel space */ + asm("mrs %0, MIDR_EL1 " : "=r" (id)); + } +#endif + return id&0xff00fff0; +} + + + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/contrib/libs/isa-l/include/erasure_code.h b/contrib/libs/isa-l/include/erasure_code.h index 04fdfb1bc2..e361d7f4bb 100644 --- a/contrib/libs/isa-l/include/erasure_code.h +++ b/contrib/libs/isa-l/include/erasure_code.h @@ -74,6 +74,14 @@ extern "C" { void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls); /** + * @brief Initialize tables for fast Erasure Code encode and decode, runs baseline version. + * + * Baseline version of ec_encode_data() with same parameters. + */ + +void ec_init_tables_base(int k, int rows, unsigned char* a, unsigned char* gftbls); + +/** * @brief Generate or decode erasure codes on blocks of data, runs appropriate version. * * Given a list of source data blocks, generate one or multiple blocks of @@ -926,7 +934,10 @@ void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k); /** * @brief Invert a matrix in GF(2^8) * - * @param in input matrix + * Attempts to construct an n x n inverse of the input matrix. Returns non-zero + * if singular. Will always destroy input matrix in process. + * + * @param in input matrix, destroyed by invert process * @param out output matrix such that [in] x [out] = [I] - identity matrix * @param n size of matrix [nxn] * @returns 0 successful, other fail on singular input matrix diff --git a/contrib/libs/isa-l/include/gf_vect_mul.h b/contrib/libs/isa-l/include/gf_vect_mul.h index 70a0ab2ed3..7cd954452e 100644 --- a/contrib/libs/isa-l/include/gf_vect_mul.h +++ b/contrib/libs/isa-l/include/gf_vect_mul.h @@ -140,10 +140,11 @@ void gf_vect_mul_init(unsigned char c, unsigned char* gftbl); * only use 2nd element is used. * @param src Pointer to src data array. Must be aligned to 32B. * @param dest Pointer to destination data array. Must be aligned to 32B. 
+ * @returns 0 pass, other fail */ -void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, - unsigned char *dest); +int gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, + unsigned char *dest); #ifdef __cplusplus } diff --git a/contrib/libs/isa-l/include/memcpy.asm b/contrib/libs/isa-l/include/memcpy.asm new file mode 100644 index 0000000000..8ce39cc28b --- /dev/null +++ b/contrib/libs/isa-l/include/memcpy.asm @@ -0,0 +1,769 @@ +;; +;; Copyright (c) 2023, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef __MEMCPY_INC__ +%define __MEMCPY_INC__ + +%include "reg_sizes.asm" + +; This section defines a series of macros to copy small to medium amounts +; of data from memory to memory, where the size is variable but limited. +; +; The macros are all called as: +; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3 +; with the parameters defined as: +; DST : register: pointer to dst (not modified) +; SRC : register: pointer to src (not modified) +; SIZE : register: length in bytes (not modified) +; TMP0 : 64-bit temp GPR (clobbered) +; TMP1 : 64-bit temp GPR (clobbered) +; XTMP0 : temp XMM (clobbered) +; XTMP1 : temp XMM (clobbered) +; XTMP2 : temp XMM (clobbered) +; XTMP3 : temp XMM (clobbered) +; +; The name indicates the options. The name is of the form: +; memcpy_<VEC>_<SZ><ZERO><RET> +; where: +; <VEC> is either "sse" or "avx" or "avx2" +; <SZ> is either "64" or "128" and defines largest value of SIZE +; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0) +; <RET> is blank or "_ret". If blank, the code falls through. If "ret" +; it does a "ret" at the end +; +; For the avx2 versions, the temp XMM registers need to be YMM registers +; If the SZ is 64, then only two YMM temps are needed, i.e. 
it is called as: +; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1 +; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3 +; +; For example: +; memcpy_sse_64 : SSE, 0 <= size < 64, falls through +; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through +; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret +; mempcy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret +; + +%macro memcpy_sse_64 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0 +%endm + +%macro memcpy_sse_64_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0 +%endm + +%macro memcpy_sse_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0 +%endm + +%macro memcpy_sse_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0 +%endm + +%macro memcpy_sse_64_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0 +%endm + +%macro memcpy_sse_64_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0 +%endm + +%macro memcpy_sse_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0 +%endm + +%macro memcpy_sse_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0 +%endm + +%macro memcpy_sse_16 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0 +%endm + +%macro memcpy_sse_16_1 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0 +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro memcpy_avx_64 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1 +%endm + +%macro memcpy_avx_64_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1 +%endm + +%macro memcpy_avx_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1 +%endm + +%macro memcpy_avx_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1 +%endm + +%macro memcpy_avx_64_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1 +%endm + +%macro memcpy_avx_64_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1 +%endm + +%macro memcpy_avx_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1 +%endm + +%macro memcpy_avx_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1 +%endm + +%macro memcpy_avx_16 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1 +%endm + +%macro memcpy_avx_16_1 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1 +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro memcpy_avx2_64 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2 +%endm + +%macro memcpy_avx2_64_1 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2 +%endm + +%macro memcpy_avx2_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2 +%endm + +%macro memcpy_avx2_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2 +%endm + +%macro memcpy_avx2_64_ret 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2 +%endm + +%macro memcpy_avx2_64_1_ret 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2 +%endm + +%macro memcpy_avx2_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2 +%endm + +%macro memcpy_avx2_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro __memcpy_int 13 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: pointer to src (not modified) +%define %%SIZE %3 ; 
register: length in bytes (not modified) +%define %%TMP0 %4 ; 64-bit temp GPR (clobbered) +%define %%TMP1 %5 ; 64-bit temp GPR (clobbered) +%define %%XTMP0 %6 ; temp XMM (clobbered) +%define %%XTMP1 %7 ; temp XMM (clobbered) +%define %%XTMP2 %8 ; temp XMM (clobbered) +%define %%XTMP3 %9 ; temp XMM (clobbered) +%define %%NOT0 %10 ; if not 0, then assume size cannot be zero +%define %%MAXSIZE %11 ; 128, 64, etc +%define %%USERET %12 ; if not 0, use "ret" at end +%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2 + +%if (%%USERET != 0) + %define %%DONE ret +%else + %define %%DONE jmp %%end +%endif + +%if (%%USEAVX != 0) + %define %%MOVDQU vmovdqu +%else + %define %%MOVDQU movdqu +%endif + +%if (%%MAXSIZE >= 128) + test %%SIZE, 64 + jz %%lt64 + %if (%%USEAVX >= 2) + %%MOVDQU %%XTMP0, [%%SRC + 0*32] + %%MOVDQU %%XTMP1, [%%SRC + 1*32] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32] + + %%MOVDQU [%%DST + 0*32], %%XTMP0 + %%MOVDQU [%%DST + 1*32], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3 + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + 1*16] + %%MOVDQU %%XTMP2, [%%SRC + 2*16] + %%MOVDQU %%XTMP3, [%%SRC + 3*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + 1*16], %%XTMP1 + %%MOVDQU [%%DST + 2*16], %%XTMP2 + %%MOVDQU [%%DST + 3*16], %%XTMP3 + + %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 64) +%%lt64: + test %%SIZE, 32 + jz %%lt32 + %if (%%USEAVX >= 2) + %%MOVDQU %%XTMP0, [%%SRC + 0*32] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32] + %%MOVDQU [%%DST + 0*32], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1 + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + 1*16] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + 1*16], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 32) +%%lt32: + test %%SIZE, 16 + jz %%lt16 + %if (%%USEAVX >= 2) + %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16] + %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0) + %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1) + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 16) + test %%SIZE, 16 + jz %%lt16 + mov %%TMP0, [%%SRC] + mov %%TMP1, [%%SRC + 8] + mov [%%DST], %%TMP0 + mov [%%DST + 8], %%TMP1 +%%lt16: + test %%SIZE, 8 + jz %%lt8 + mov %%TMP0, [%%SRC] + mov %%TMP1, [%%SRC + %%SIZE - 8] + mov [%%DST], %%TMP0 + mov [%%DST + %%SIZE - 8], %%TMP1 + %%DONE +%endif + +%if (%%MAXSIZE >= 8) +%%lt8: + test %%SIZE, 4 + jz %%lt4 + mov DWORD(%%TMP0), [%%SRC] + mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4] + mov [%%DST], DWORD(%%TMP0) + mov [%%DST + %%SIZE - 4], DWORD(%%TMP1) + %%DONE +%endif + +%if (%%MAXSIZE >= 4) +%%lt4: + test %%SIZE, 2 + jz %%lt2 + movzx DWORD(%%TMP0), word [%%SRC] + movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1] + mov [%%DST], WORD(%%TMP0) + mov [%%DST + %%SIZE - 1], 
BYTE(%%TMP1) + %%DONE +%endif + +%%lt2: +%if (%%NOT0 == 0) + test %%SIZE, 1 + jz %%end +%endif + movzx DWORD(%%TMP0), byte [%%SRC] + mov [%%DST], BYTE(%%TMP0) +%%end: +%if (%%USERET != 0) + ret +%endif +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Utility macro to assist with SIMD shifting +%macro _PSRLDQ 3 +%define %%VEC %1 +%define %%REG %2 +%define %%IMM %3 + +%ifidn %%VEC, SSE + psrldq %%REG, %%IMM +%else + vpsrldq %%REG, %%REG, %%IMM +%endif +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; This section defines a series of macros to store small to medium amounts +; of data from SIMD registers to memory, where the size is variable but limited. +; +; The macros are all called as: +; memcpy DST, SRC, SIZE, TMP, IDX +; with the parameters defined as: +; DST : register: pointer to dst (not modified) +; SRC : register: src data (clobbered) +; SIZE : register: length in bytes (not modified) +; TMP : 64-bit temp GPR (clobbered) +; IDX : 64-bit GPR to store dst index/offset (clobbered) +; OFFSET ; Offset to be applied to destination pointer (optional) +; +; The name indicates the options. The name is of the form: +; simd_store_<VEC> +; where <VEC> is the SIMD instruction type e.g. "sse" or "avx" + +%macro simd_store_sse 5-6 +%if %0 == 6 + __simd_store %1,%2,%3,%4,%5,SSE,16,%6 +%else + __simd_store %1,%2,%3,%4,%5,SSE,16 +%endif +%endm + +%macro simd_store_avx 5-6 +%if %0 == 6 + __simd_store %1,%2,%3,%4,%5,AVX,16,%6 +%else + __simd_store %1,%2,%3,%4,%5,AVX,16 +%endif +%endm + +%macro simd_store_sse_15 5-6 +%if %0 == 6 + __simd_store %1,%2,%3,%4,%5,SSE,15,%6 +%else + __simd_store %1,%2,%3,%4,%5,SSE,15 +%endif +%endm + +%macro simd_store_avx_15 5-6 +%if %0 == 6 + __simd_store %1,%2,%3,%4,%5,AVX,15,%6 +%else + __simd_store %1,%2,%3,%4,%5,AVX,15 +%endif +%endm + +%macro __simd_store 7-8 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: src data (clobbered) +%define %%SIZE %3 ; register: length in bytes (not modified) +%define %%TMP %4 ; 64-bit temp GPR (clobbered) +%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered) +%define %%SIMDTYPE %6 ; "SSE" or "AVX" +%define %%MAX_LEN %7 ; maximum length to be stored +%define %%OFFSET %8 ; offset to be applied to destination pointer + +%define %%PSRLDQ _PSRLDQ %%SIMDTYPE, + +%ifidn %%SIMDTYPE, SSE + %define %%MOVDQU movdqu + %define %%MOVQ movq +%else + %define %%MOVDQU vmovdqu + %define %%MOVQ vmovq +%endif + +;; determine max byte size for store operation +%assign max_length_to_store %%MAX_LEN + +%if max_length_to_store > 16 +%error "__simd_store macro invoked with MAX_LEN bigger than 16!" 
+%endif + +%if %0 == 8 + mov %%IDX, %%OFFSET +%else + xor %%IDX, %%IDX ; zero idx +%endif + +%if max_length_to_store == 16 + test %%SIZE, 16 + jz %%lt16 + %%MOVDQU [%%DST + %%IDX], %%SRC + jmp %%end +%%lt16: +%endif + +%if max_length_to_store >= 8 + test %%SIZE, 8 + jz %%lt8 + %%MOVQ [%%DST + %%IDX], %%SRC + %%PSRLDQ %%SRC, 8 + add %%IDX, 8 +%%lt8: +%endif + + %%MOVQ %%TMP, %%SRC ; use GPR from now on + +%if max_length_to_store >= 4 + test %%SIZE, 4 + jz %%lt4 + mov [%%DST + %%IDX], DWORD(%%TMP) + shr %%TMP, 32 + add %%IDX, 4 +%%lt4: +%endif + + test %%SIZE, 2 + jz %%lt2 + mov [%%DST + %%IDX], WORD(%%TMP) + shr %%TMP, 16 + add %%IDX, 2 +%%lt2: + test %%SIZE, 1 + jz %%end + mov [%%DST + %%IDX], BYTE(%%TMP) +%%end: +%endm + +; This section defines a series of macros to load small to medium amounts +; (from 0 to 16 bytes) of data from memory to SIMD registers, +; where the size is variable but limited. +; +; The macros are all called as: +; simd_load DST, SRC, SIZE +; with the parameters defined as: +; DST : register: destination XMM register +; SRC : register: pointer to src data (not modified) +; SIZE : register: length in bytes (not modified) +; +; The name indicates the options. The name is of the form: +; simd_load_<VEC>_<SZ><ZERO> +; where: +; <VEC> is either "sse" or "avx" +; <SZ> is either "15" or "16" and defines largest value of SIZE +; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0) +; +; For example: +; simd_load_sse_16 : SSE, 0 <= size <= 16 +; simd_load_avx_15_1 : AVX, 1 <= size <= 15 + +%macro simd_load_sse_15_1 3 + __simd_load %1,%2,%3,0,0,SSE +%endm +%macro simd_load_sse_15 3 + __simd_load %1,%2,%3,1,0,SSE +%endm +%macro simd_load_sse_16_1 3 + __simd_load %1,%2,%3,0,1,SSE +%endm +%macro simd_load_sse_16 3 + __simd_load %1,%2,%3,1,1,SSE +%endm + +%macro simd_load_avx_15_1 3 + __simd_load %1,%2,%3,0,0,AVX +%endm +%macro simd_load_avx_15 3 + __simd_load %1,%2,%3,1,0,AVX +%endm +%macro simd_load_avx_16_1 3 + __simd_load %1,%2,%3,0,1,AVX +%endm +%macro simd_load_avx_16 3 + __simd_load %1,%2,%3,1,1,AVX +%endm + +%macro __simd_load 6 +%define %%DST %1 ; [out] destination XMM register +%define %%SRC %2 ; [in] pointer to src data +%define %%SIZE %3 ; [in] length in bytes (0-16 bytes) +%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0 +%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16 +%define %%SIMDTYPE %6 ; "SSE" or "AVX" + +%ifidn %%SIMDTYPE, SSE + %define %%MOVDQU movdqu + %define %%PINSRB pinsrb + %define %%PINSRQ pinsrq + %define %%PXOR pxor +%else + %define %%MOVDQU vmovdqu + %define %%PINSRB vpinsrb + %define %%PINSRQ vpinsrq + %define %%PXOR vpxor +%endif + +%if (%%ACCEPT_16 != 0) + test %%SIZE, 16 + jz %%_skip_16 + %%MOVDQU %%DST, [%%SRC] + jmp %%end_load + +%%_skip_16: +%endif + %%PXOR %%DST, %%DST ; clear XMM register +%if (%%ACCEPT_0 != 0) + or %%SIZE, %%SIZE + je %%end_load +%endif + cmp %%SIZE, 2 + jb %%_size_1 + je %%_size_2 + cmp %%SIZE, 4 + jb %%_size_3 + je %%_size_4 + cmp %%SIZE, 6 + jb %%_size_5 + je %%_size_6 + cmp %%SIZE, 8 + jb %%_size_7 + je %%_size_8 + cmp %%SIZE, 10 + jb %%_size_9 + je %%_size_10 + cmp %%SIZE, 12 + jb %%_size_11 + je %%_size_12 + cmp %%SIZE, 14 + jb %%_size_13 + je %%_size_14 + +%%_size_15: + %%PINSRB %%DST, [%%SRC + 14], 14 +%%_size_14: + %%PINSRB %%DST, [%%SRC + 13], 13 +%%_size_13: + %%PINSRB %%DST, [%%SRC + 12], 12 +%%_size_12: + %%PINSRB %%DST, [%%SRC + 11], 11 +%%_size_11: + %%PINSRB %%DST, [%%SRC + 10], 10 +%%_size_10: + %%PINSRB %%DST, [%%SRC + 9], 9 +%%_size_9: + %%PINSRB %%DST, 
[%%SRC + 8], 8 +%%_size_8: + %%PINSRQ %%DST, [%%SRC], 0 + jmp %%end_load +%%_size_7: + %%PINSRB %%DST, [%%SRC + 6], 6 +%%_size_6: + %%PINSRB %%DST, [%%SRC + 5], 5 +%%_size_5: + %%PINSRB %%DST, [%%SRC + 4], 4 +%%_size_4: + %%PINSRB %%DST, [%%SRC + 3], 3 +%%_size_3: + %%PINSRB %%DST, [%%SRC + 2], 2 +%%_size_2: + %%PINSRB %%DST, [%%SRC + 1], 1 +%%_size_1: + %%PINSRB %%DST, [%%SRC + 0], 0 +%%end_load: +%endm + +%macro simd_load_avx2 5 +%define %%DST %1 ; [out] destination YMM register +%define %%SRC %2 ; [in] pointer to src data +%define %%SIZE %3 ; [in] length in bytes (0-32 bytes) +%define %%IDX %4 ; [clobbered] Temp GP register to store src idx +%define %%TMP %5 ; [clobbered] Temp GP register + + test %%SIZE, 32 + jz %%_skip_32 + vmovdqu %%DST, [%%SRC] + jmp %%end_load + +%%_skip_32: + vpxor %%DST, %%DST ; clear YMM register + or %%SIZE, %%SIZE + je %%end_load + + lea %%IDX, [%%SRC] + mov %%TMP, %%SIZE + cmp %%SIZE, 16 + jle %%_check_size + + add %%IDX, 16 + sub %%TMP, 16 + +%%_check_size: + cmp %%TMP, 2 + jb %%_size_1 + je %%_size_2 + cmp %%TMP, 4 + jb %%_size_3 + je %%_size_4 + cmp %%TMP, 6 + jb %%_size_5 + je %%_size_6 + cmp %%TMP, 8 + jb %%_size_7 + je %%_size_8 + cmp %%TMP, 10 + jb %%_size_9 + je %%_size_10 + cmp %%TMP, 12 + jb %%_size_11 + je %%_size_12 + cmp %%TMP, 14 + jb %%_size_13 + je %%_size_14 + cmp %%TMP, 15 + je %%_size_15 + +%%_size_16: + vmovdqu XWORD(%%DST), [%%IDX] + jmp %%end_load +%%_size_15: + vpinsrb XWORD(%%DST), [%%IDX + 14], 14 +%%_size_14: + vpinsrb XWORD(%%DST), [%%IDX + 13], 13 +%%_size_13: + vpinsrb XWORD(%%DST), [%%IDX + 12], 12 +%%_size_12: + vpinsrb XWORD(%%DST), [%%IDX + 11], 11 +%%_size_11: + vpinsrb XWORD(%%DST), [%%IDX + 10], 10 +%%_size_10: + vpinsrb XWORD(%%DST), [%%IDX + 9], 9 +%%_size_9: + vpinsrb XWORD(%%DST), [%%IDX + 8], 8 +%%_size_8: + vpinsrq XWORD(%%DST), [%%IDX], 0 + jmp %%_check_higher_16 +%%_size_7: + vpinsrb XWORD(%%DST), [%%IDX + 6], 6 +%%_size_6: + vpinsrb XWORD(%%DST), [%%IDX + 5], 5 +%%_size_5: + vpinsrb XWORD(%%DST), [%%IDX + 4], 4 +%%_size_4: + vpinsrb XWORD(%%DST), [%%IDX + 3], 3 +%%_size_3: + vpinsrb XWORD(%%DST), [%%IDX + 2], 2 +%%_size_2: + vpinsrb XWORD(%%DST), [%%IDX + 1], 1 +%%_size_1: + vpinsrb XWORD(%%DST), [%%IDX + 0], 0 +%%_check_higher_16: + test %%SIZE, 16 + jz %%end_load + + ; Move last bytes loaded to upper half and load 16 bytes in lower half + vinserti128 %%DST, XWORD(%%DST), 1 + vinserti128 %%DST, [%%SRC], 0 +%%end_load: +%endm + +%macro simd_store_avx2 5 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: src data (clobbered) +%define %%SIZE %3 ; register: length in bytes (not modified) +%define %%TMP %4 ; 64-bit temp GPR (clobbered) +%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered) + + xor %%IDX, %%IDX ; zero idx + + test %%SIZE, 32 + jz %%lt32 + vmovdqu [%%DST], %%SRC + jmp %%end +%%lt32: + + test %%SIZE, 16 + jz %%lt16 + vmovdqu [%%DST], XWORD(%%SRC) + ; Move upper half to lower half for further stores + vperm2i128 %%SRC, %%SRC, %%SRC, 0x81 + add %%IDX, 16 +%%lt16: + + test %%SIZE, 8 + jz %%lt8 + vmovq [%%DST + %%IDX], XWORD(%%SRC) + vpsrldq XWORD(%%SRC), 8 + add %%IDX, 8 +%%lt8: + + vmovq %%TMP, XWORD(%%SRC) ; use GPR from now on + + test %%SIZE, 4 + jz %%lt4 + mov [%%DST + %%IDX], DWORD(%%TMP) + shr %%TMP, 32 + add %%IDX, 4 +%%lt4: + + test %%SIZE, 2 + jz %%lt2 + mov [%%DST + %%IDX], WORD(%%TMP) + shr %%TMP, 16 + add %%IDX, 2 +%%lt2: + test %%SIZE, 1 + jz %%end + mov [%%DST + %%IDX], BYTE(%%TMP) +%%end: +%endm + +%endif ; ifndef __MEMCPY_INC__ diff --git 
a/contrib/libs/isa-l/include/multibinary.asm b/contrib/libs/isa-l/include/multibinary.asm index 2cad1c51be..1a861a0376 100644 --- a/contrib/libs/isa-l/include/multibinary.asm +++ b/contrib/libs/isa-l/include/multibinary.asm @@ -69,12 +69,14 @@ mbin_def_ptr %1_mbinit section .text - global %1:ISAL_SYM_TYPE_FUNCTION + global %1, function %1_mbinit: + endbranch ;;; only called the first time to setup hardware match call %1_dispatch_init ;;; falls thru to execute the hw optimized code %1: + endbranch jmp mbin_ptr_sz [%1_dispatched] %endmacro @@ -152,8 +154,10 @@ ; 1-> function name ; 2-> base function ; 3-> SSE4_1 and CLMUL optimized function +; 4-> AVX/02 opt func +; 5-> AVX512/10 opt func ;;;;; -%macro mbin_dispatch_init_clmul 3 +%macro mbin_dispatch_init_clmul 5 section .text %1_dispatch_init: push mbin_rsi @@ -161,18 +165,55 @@ push mbin_rbx push mbin_rcx push mbin_rdx + push mbin_rdi lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function mov eax, 1 cpuid - lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func - - ; Test for SSE4.2 + mov ebx, ecx ; save cpuid1.ecx test ecx, FLAG_CPUID1_ECX_SSE4_1 jz _%1_init_done test ecx, FLAG_CPUID1_ECX_CLMUL - cmovne mbin_rsi, mbin_rbx + jz _%1_init_done + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_init_done + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_init_done + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_init_done + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + +%if AS_FEATURE_LEVEL >= 10 + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_init_done ; No AVX512 possible + and ebx, FLAGS_CPUID7_EBX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1 + jne _%1_init_done + + and ecx, FLAGS_CPUID7_ECX_AVX512_G2 + cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2 + lea mbin_rbx, [%5 WRT_OPT] ; AVX512/10 opt + cmove mbin_rsi, mbin_rbx +%endif _%1_init_done: + pop mbin_rdi pop mbin_rdx pop mbin_rcx pop mbin_rbx @@ -390,10 +431,97 @@ pop mbin_rsi ret %endmacro + +;;;;; +; mbin_dispatch_init8 parameters +; 1-> function name +; 2-> base function +; 3-> SSE4_2 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +; 6-> AVX512/06 opt func +; 7-> AVX2 Update/07 opt func +; 8-> AVX512 Update/10 opt func +;;;;; +%macro mbin_dispatch_init8 8 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + mov ebx, ecx ; save cpuid1.ecx + test ecx, FLAG_CPUID1_ECX_SSE4_2 + je _%1_init_done ; Use base function if no SSE4_2 + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_init_done + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_init_done + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_init_done + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + 
cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_check_avx2_g2 ; No AVX512 possible + and ebx, FLAGS_CPUID7_EBX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1 + lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + and ecx, FLAGS_CPUID7_ECX_AVX512_G2 + cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2 + lea mbin_rbx, [%8 WRT_OPT] ; AVX512/10 opt + cmove mbin_rsi, mbin_rbx + jmp _%1_init_done + + _%1_check_avx2_g2: + ;; Test for AVX2 Gen 2 + and ecx, FLAGS_CPUID7_ECX_AVX2_G2 + cmp ecx, FLAGS_CPUID7_ECX_AVX2_G2 + lea mbin_rbx, [%7 WRT_OPT] ; AVX2/7 opt + cmove mbin_rsi, mbin_rbx + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro %else %macro mbin_dispatch_init7 7 mbin_dispatch_init6 %1, %2, %3, %4, %5, %6 %endmacro +%macro mbin_dispatch_init8 8 + mbin_dispatch_init6 %1, %2, %3, %4, %5, %6 +%endmacro %endif %endif ; ifndef _MULTIBINARY_ASM_ diff --git a/contrib/libs/isa-l/include/reg_sizes.asm b/contrib/libs/isa-l/include/reg_sizes.asm index fec6a8aafb..983f8b421d 100644 --- a/contrib/libs/isa-l/include/reg_sizes.asm +++ b/contrib/libs/isa-l/include/reg_sizes.asm @@ -30,14 +30,6 @@ %ifndef _REG_SIZES_ASM_ %define _REG_SIZES_ASM_ -%ifdef __NASM_VER__ -%ifidn __OUTPUT_FORMAT__, win64 -%error nasm not supported in windows -%else -%define endproc_frame -%endif -%endif - %ifndef AS_FEATURE_LEVEL %define AS_FEATURE_LEVEL 4 %endif @@ -75,6 +67,7 @@ %define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ) %define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ) +%define FLAGS_CPUID7_ECX_AVX2_G2 (FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ) %define FLAG_XGETBV_EAX_XMM (1<<1) %define FLAG_XGETBV_EAX_YMM (1<<2) @@ -203,14 +196,44 @@ %define XWORD(reg) reg %+ x +%ifdef INTEL_CET_ENABLED + %ifdef __NASM_VER__ + %if AS_FEATURE_LEVEL >= 10 + %ifidn __OUTPUT_FORMAT__,elf32 +section .note.gnu.property note alloc noexec align=4 +DD 0x00000004,0x0000000c,0x00000005,0x00554e47 +DD 0xc0000002,0x00000004,0x00000003 + %endif + %ifidn __OUTPUT_FORMAT__,elf64 +section .note.gnu.property note alloc noexec align=8 +DD 0x00000004,0x00000010,0x00000005,0x00554e47 +DD 0xc0000002,0x00000004,0x00000003,0x00000000 + %endif + %endif + %endif +%endif + %ifidn __OUTPUT_FORMAT__,elf32 section .note.GNU-stack noalloc noexec nowrite progbits section .text %endif %ifidn __OUTPUT_FORMAT__,elf64 + %define __x86_64__ section .note.GNU-stack noalloc noexec nowrite progbits section .text %endif +%ifidn __OUTPUT_FORMAT__,win64 + %define __x86_64__ +%endif +%ifidn __OUTPUT_FORMAT__,macho64 + %define __x86_64__ +%endif + +%ifdef __x86_64__ + %define endbranch db 0xf3, 0x0f, 0x1e, 0xfa +%else + %define endbranch db 0xf3, 0x0f, 0x1e, 0xfb +%endif %ifdef REL_TEXT %define WRT_OPT @@ -220,29 +243,56 @@ section .text %define WRT_OPT %endif +%macro mk_global 1-3 + %ifdef __NASM_VER__ + %ifidn __OUTPUT_FORMAT__, macho64 + global %1 + %elifidn __OUTPUT_FORMAT__, win64 + global %1 + %else + global %1:%2 %3 + %endif + %else + global %1:%2 %3 + %endif +%endmacro + + +; Fixes for nasm lack of MS proc helpers +%ifdef __NASM_VER__ + %ifidn __OUTPUT_FORMAT__, win64 + %macro alloc_stack 1 + sub rsp, %1 + %endmacro + + %macro proc_frame 1 + %1: + %endmacro 
+ + %macro save_xmm128 2 + movdqa [rsp + %2], %1 + %endmacro + + %macro save_reg 2 + mov [rsp + %2], %1 + %endmacro + + %macro rex_push_reg 1 + push %1 + %endmacro + + %macro push_reg 1 + push %1 + %endmacro + + %define end_prolog + %endif + + %define endproc_frame +%endif + %ifidn __OUTPUT_FORMAT__, macho64 %define elf64 macho64 mac_equ equ 1 - %ifdef __NASM_VER__ - %define ISAL_SYM_TYPE_FUNCTION - %define ISAL_SYM_TYPE_DATA_INTERNAL - %else - %define ISAL_SYM_TYPE_FUNCTION function - %define ISAL_SYM_TYPE_DATA_INTERNAL data internal - %endif -%else - %define ISAL_SYM_TYPE_FUNCTION function - %define ISAL_SYM_TYPE_DATA_INTERNAL data internal %endif - -%macro slversion 4 - section .text - global %1_slver_%2%3%4 - global %1_slver - %1_slver: - %1_slver_%2%3%4: - dw 0x%4 - db 0x%3, 0x%2 -%endmacro - %endif ; ifndef _REG_SIZES_ASM_ |
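For context, here is a minimal, hypothetical sketch of how the new mbin_dispatch_init8 macro added in the multibinary.asm hunk above is typically wired up by a multibinary stub. It assumes the existing mbin_interface macro from the same file; the my_func* symbol names are placeholders invented for illustration and are not part of this patch.

; Sketch only: my_func and its seven arch-specific implementations are
; placeholder symbols assumed to be provided elsewhere and resolved at link time.
; mbin_interface creates the public my_func entry point that jumps through the
; my_func_dispatched pointer; mbin_dispatch_init8 fills that pointer on the
; first call after probing CPUID/XGETBV for SSE4.2, AVX, AVX2, AVX512 and the
; GFNI/VAES "update" tiers, falling back to my_func_base otherwise.
default rel
[bits 64]

%include "reg_sizes.asm"
%include "multibinary.asm"

extern my_func_base
extern my_func_sse
extern my_func_avx
extern my_func_avx2
extern my_func_avx512
extern my_func_avx2_gfni
extern my_func_avx512_gfni

mbin_interface      my_func
mbin_dispatch_init8 my_func, my_func_base, my_func_sse, my_func_avx, my_func_avx2, my_func_avx512, my_func_avx2_gfni, my_func_avx512_gfni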