about | summary | refs | log | tree | commit | diff | stats
path: root/contrib/libs/isa-l
diff options
context:
space:
mode:
author Maxim Yurchuk <maxim-yurchuk@ydb.tech> 2024-10-18 20:31:38 +0300
committer GitHub <noreply@github.com> 2024-10-18 20:31:38 +0300
commit 2a74bac2d2d3bccb4e10120f1ead805640ec9dd0 (patch)
tree 047e4818ced5aaf73f58517629e5260b5291f9f0 /contrib/libs/isa-l
parent 2d9656823e9521d8c29ea4c9a1d0eab78391abfc (diff)
parent 3d834a1923bbf9403cd4a448e7f32b670aa4124f (diff)
download ydb-2a74bac2d2d3bccb4e10120f1ead805640ec9dd0.tar.gz
Merge pull request #10502 from ydb-platform/mergelibs-241016-1210
Library import 241016-1210
Diffstat (limited to 'contrib/libs/isa-l')
-rw-r--r--contrib/libs/isa-l/erasure_code/Makefile.am51
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt (renamed from contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt)0
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/Makefile.am60
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c124
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c264
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S37
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S402
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S168
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S411
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S152
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S361
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S189
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S391
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S175
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S425
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S208
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S464
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S194
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S484
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S237
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S544
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S218
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S258
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S618
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S237
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S281
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S307
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S303
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S132
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S324
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S126
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S240
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S123
-rw-r--r--contrib/libs/isa-l/erasure_code/aarch64/ya.make51
-rw-r--r--contrib/libs/isa-l/erasure_code/ec_base.c46
-rw-r--r--contrib/libs/isa-l/erasure_code/ec_base.h71
-rw-r--r--contrib/libs/isa-l/erasure_code/ec_base.patch44
-rw-r--r--contrib/libs/isa-l/erasure_code/ec_base_aliases.c8
-rw-r--r--contrib/libs/isa-l/erasure_code/ec_highlevel_func.c277
-rw-r--r--contrib/libs/isa-l/erasure_code/ec_multibinary.asm24
-rw-r--r--contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm25
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c33
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_base_test.c36
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_base_test.patch12
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_perf.c171
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_test.c99
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_test.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c167
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_update_test.c127
-rw-r--r--contrib/libs/isa-l/erasure_code/erasure_code_update_test.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c25
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm21
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm17
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2_gfni.asm362
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm41
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512_gfni.asm209
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm15
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.c480
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm13
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm15
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2_gfni.asm298
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm13
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512_gfni.asm189
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm26
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm16
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2_gfni.asm335
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm41
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512_gfni.asm225
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm14
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.c586
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm11
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.patch5
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2_gfni.asm276
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm15
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512_gfni.asm204
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm32
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm17
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm36
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512_gfni.asm253
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm14
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.c695
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2_gfni.asm239
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm53
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512_gfni.asm223
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm32
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm14
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512.asm334
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512_gfni.asm275
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.c805
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2_gfni.asm265
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512.asm287
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512_gfni.asm240
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm32
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm14
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512.asm353
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512_gfni.asm292
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.c911
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512.asm321
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512_gfni.asm259
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_inverse_test.c33
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_inverse_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c34
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.patch8
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm14
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm16
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2_gfni.asm318
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm21
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512_gfni.asm190
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c10
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c22
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm14
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c19
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_gfni.inc72
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm14
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2_gfni.asm255
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm13
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512_gfni.asm175
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm12
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c16
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mad_test.patch4
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm37
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.patch5
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c26
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.patch28
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c23
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm25
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.patch6
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c75
-rw-r--r--contrib/libs/isa-l/erasure_code/gf_vect_mul_test.patch40
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/Makefile.am15
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.c106
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.h338
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c83
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c65
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c104
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c84
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c124
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c103
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c145
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c122
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c166
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c142
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c85
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c48
-rw-r--r--contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c75
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/ya.make29
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt164
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/ya.make29
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt164
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/ya.make29
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt164
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/ya.make29
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt164
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/ya.make29
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make2
-rw-r--r--contrib/libs/isa-l/erasure_code/ut/ya.make22
-rw-r--r--contrib/libs/isa-l/erasure_code/ya.make115
-rw-r--r--contrib/libs/isa-l/include/aarch64_label.h18
-rw-r--r--contrib/libs/isa-l/include/aarch64_multibinary.h347
-rw-r--r--contrib/libs/isa-l/include/erasure_code.h13
-rw-r--r--contrib/libs/isa-l/include/gf_vect_mul.h5
-rw-r--r--contrib/libs/isa-l/include/memcpy.asm769
-rw-r--r--contrib/libs/isa-l/include/multibinary.asm140
-rw-r--r--contrib/libs/isa-l/include/reg_sizes.asm108
237 files changed, 19486 insertions, 5845 deletions
diff --git a/contrib/libs/isa-l/erasure_code/Makefile.am b/contrib/libs/isa-l/erasure_code/Makefile.am
index bad2aae2f3..8f334462ac 100644
--- a/contrib/libs/isa-l/erasure_code/Makefile.am
+++ b/contrib/libs/isa-l/erasure_code/Makefile.am
@@ -1,5 +1,5 @@
########################################################################
-# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+# Copyright(c) 2011-2019 Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -27,11 +27,13 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
+include erasure_code/aarch64/Makefile.am
+
+include erasure_code/ppc64le/Makefile.am
+
lsrc += erasure_code/ec_base.c
lsrc_base_aliases += erasure_code/ec_base_aliases.c
-lsrc_aarch64 += erasure_code/ec_base_aliases.c
-
lsrc_x86_64 += \
erasure_code/ec_highlevel_func.c \
erasure_code/gf_vect_mul_sse.asm \
@@ -76,14 +78,38 @@ lsrc_x86_64 += \
#if HAVE_AVX512
lsrc_x86_64 += \
+ erasure_code/gf_vect_mad_avx2_gfni.asm \
+ erasure_code/gf_2vect_mad_avx2_gfni.asm \
+ erasure_code/gf_3vect_mad_avx2_gfni.asm \
+ erasure_code/gf_4vect_mad_avx2_gfni.asm \
+ erasure_code/gf_5vect_mad_avx2_gfni.asm \
erasure_code/gf_vect_dot_prod_avx512.asm \
erasure_code/gf_2vect_dot_prod_avx512.asm \
erasure_code/gf_3vect_dot_prod_avx512.asm \
erasure_code/gf_4vect_dot_prod_avx512.asm \
+ erasure_code/gf_5vect_dot_prod_avx512.asm \
+ erasure_code/gf_6vect_dot_prod_avx512.asm \
+ erasure_code/gf_vect_dot_prod_avx512_gfni.asm \
+ erasure_code/gf_vect_dot_prod_avx2_gfni.asm \
+ erasure_code/gf_2vect_dot_prod_avx2_gfni.asm \
+ erasure_code/gf_3vect_dot_prod_avx2_gfni.asm \
+ erasure_code/gf_2vect_dot_prod_avx512_gfni.asm \
+ erasure_code/gf_3vect_dot_prod_avx512_gfni.asm \
+ erasure_code/gf_4vect_dot_prod_avx512_gfni.asm \
+ erasure_code/gf_5vect_dot_prod_avx512_gfni.asm \
+ erasure_code/gf_6vect_dot_prod_avx512_gfni.asm \
erasure_code/gf_vect_mad_avx512.asm \
erasure_code/gf_2vect_mad_avx512.asm \
erasure_code/gf_3vect_mad_avx512.asm \
- erasure_code/gf_4vect_mad_avx512.asm
+ erasure_code/gf_4vect_mad_avx512.asm \
+ erasure_code/gf_5vect_mad_avx512.asm \
+ erasure_code/gf_6vect_mad_avx512.asm \
+ erasure_code/gf_vect_mad_avx512_gfni.asm \
+ erasure_code/gf_2vect_mad_avx512_gfni.asm \
+ erasure_code/gf_3vect_mad_avx512_gfni.asm \
+ erasure_code/gf_4vect_mad_avx512_gfni.asm \
+ erasure_code/gf_5vect_mad_avx512_gfni.asm \
+ erasure_code/gf_6vect_mad_avx512_gfni.asm
lsrc_x86_32 += \
erasure_code/ec_highlevel_func.c \
@@ -143,19 +169,4 @@ perf_tests += erasure_code/gf_vect_mul_perf \
other_tests += erasure_code/gen_rs_matrix_limits
-other_tests_x86_64 += \
- erasure_code/gf_2vect_dot_prod_sse_test \
- erasure_code/gf_3vect_dot_prod_sse_test \
- erasure_code/gf_4vect_dot_prod_sse_test \
- erasure_code/gf_5vect_dot_prod_sse_test \
- erasure_code/gf_6vect_dot_prod_sse_test
-
-other_tests_x86_32 += \
- erasure_code/gf_2vect_dot_prod_sse_test \
- erasure_code/gf_3vect_dot_prod_sse_test \
- erasure_code/gf_4vect_dot_prod_sse_test \
- erasure_code/gf_5vect_dot_prod_sse_test \
- erasure_code/gf_6vect_dot_prod_sse_test
-
-other_src += include/test.h \
- include/types.h
+other_src += include/test.h
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt
index 8f218b47cb..8f218b47cb 100644
--- a/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
+++ b/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am b/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am
new file mode 100644
index 0000000000..47bbf12d2b
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am
@@ -0,0 +1,60 @@
+##################################################################
+# Copyright (c) 2019 Huawei Technologies Co., Ltd.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Huawei Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_aarch64 += \
+ erasure_code/aarch64/ec_aarch64_highlevel_func.c \
+ erasure_code/aarch64/ec_aarch64_dispatcher.c \
+ erasure_code/aarch64/gf_vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_2vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_3vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_4vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_5vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_vect_mad_neon.S \
+ erasure_code/aarch64/gf_2vect_mad_neon.S \
+ erasure_code/aarch64/gf_3vect_mad_neon.S \
+ erasure_code/aarch64/gf_4vect_mad_neon.S \
+ erasure_code/aarch64/gf_5vect_mad_neon.S \
+ erasure_code/aarch64/gf_6vect_mad_neon.S \
+ erasure_code/aarch64/gf_vect_mul_neon.S \
+ erasure_code/aarch64/gf_vect_mad_sve.S \
+ erasure_code/aarch64/gf_2vect_mad_sve.S \
+ erasure_code/aarch64/gf_3vect_mad_sve.S \
+ erasure_code/aarch64/gf_4vect_mad_sve.S \
+ erasure_code/aarch64/gf_5vect_mad_sve.S \
+ erasure_code/aarch64/gf_6vect_mad_sve.S \
+ erasure_code/aarch64/gf_vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_2vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_3vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_4vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_5vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_6vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_7vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_8vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_vect_mul_sve.S \
+ erasure_code/aarch64/ec_multibinary_arm.S
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c
new file mode 100644
index 0000000000..0a11604076
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c
@@ -0,0 +1,124 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod)  /* Picks the gf_vect_dot_prod provider: SVE, then NEON, else the basic one. */
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);  /* Linux: read the AT_HWCAP entry of the auxiliary vector. */
+
+ if (auxval & HWCAP_SVE)  /* SVE is tested before ASIMD, so it wins when both bits are set. */
+ return PROVIDER_INFO(gf_vect_dot_prod_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(gf_vect_dot_prod_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))  /* NOTE(review): helper and key are declared outside this hunk - presumably a sysctl feature query; confirm. */
+ return PROVIDER_INFO(gf_vect_dot_prod_sve);
+ return PROVIDER_INFO(gf_vect_dot_prod_neon);  /* Apple: NEON is returned unconditionally when SVE is not reported. */
+#endif
+ return PROVIDER_BASIC(gf_vect_dot_prod);  /* Reached on Linux when neither bit is set, and on any other platform. */
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(gf_vect_mad)  /* Picks the gf_vect_mad provider: SVE, then NEON, else the basic one. */
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);  /* Linux: read the AT_HWCAP entry of the auxiliary vector. */
+
+ if (auxval & HWCAP_SVE)  /* SVE is tested before ASIMD, so it wins when both bits are set. */
+ return PROVIDER_INFO(gf_vect_mad_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(gf_vect_mad_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))  /* NOTE(review): helper and key are declared outside this hunk - presumably a sysctl feature query; confirm. */
+ return PROVIDER_INFO(gf_vect_mad_sve);
+ return PROVIDER_INFO(gf_vect_mad_neon);  /* Apple: NEON is returned unconditionally when SVE is not reported. */
+#endif
+ return PROVIDER_BASIC(gf_vect_mad);  /* Reached on Linux when neither bit is set, and on any other platform. */
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(ec_encode_data)  /* Picks the ec_encode_data provider: SVE, then NEON, else the basic one. */
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);  /* Linux: read the AT_HWCAP entry of the auxiliary vector. */
+
+ if (auxval & HWCAP_SVE)  /* SVE is tested before ASIMD, so it wins when both bits are set. */
+ return PROVIDER_INFO(ec_encode_data_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(ec_encode_data_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))  /* NOTE(review): helper and key are declared outside this hunk - presumably a sysctl feature query; confirm. */
+ return PROVIDER_INFO(ec_encode_data_sve);
+ return PROVIDER_INFO(ec_encode_data_neon);  /* Apple: NEON is returned unconditionally when SVE is not reported. */
+#endif
+ return PROVIDER_BASIC(ec_encode_data);  /* Reached on Linux when neither bit is set, and on any other platform. */
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update)  /* Picks the ec_encode_data_update provider: SVE, then NEON, else the basic one. */
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);  /* Linux: read the AT_HWCAP entry of the auxiliary vector. */
+
+ if (auxval & HWCAP_SVE)  /* SVE is tested before ASIMD, so it wins when both bits are set. */
+ return PROVIDER_INFO(ec_encode_data_update_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(ec_encode_data_update_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))  /* NOTE(review): helper and key are declared outside this hunk - presumably a sysctl feature query; confirm. */
+ return PROVIDER_INFO(ec_encode_data_update_sve);
+ return PROVIDER_INFO(ec_encode_data_update_neon);  /* Apple: NEON is returned unconditionally when SVE is not reported. */
+#endif
+ return PROVIDER_BASIC(ec_encode_data_update);  /* Reached on Linux when neither bit is set, and on any other platform. */
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(gf_vect_mul)  /* Picks the gf_vect_mul provider: SVE, then NEON, else the basic one. */
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);  /* Linux: read the AT_HWCAP entry of the auxiliary vector. */
+
+ if (auxval & HWCAP_SVE)  /* SVE is tested before ASIMD, so it wins when both bits are set. */
+ return PROVIDER_INFO(gf_vect_mul_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(gf_vect_mul_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))  /* NOTE(review): helper and key are declared outside this hunk - presumably a sysctl feature query; confirm. */
+ return PROVIDER_INFO(gf_vect_mul_sve);
+ return PROVIDER_INFO(gf_vect_mul_neon);  /* Apple: NEON is returned unconditionally when SVE is not reported. */
+#endif
+ return PROVIDER_BASIC(gf_vect_mul);  /* Reached on Linux when neither bit is set, and on any other platform. */
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(ec_init_tables)  /* No feature probing here: every platform gets the basic provider. */
+{
+ return PROVIDER_BASIC(ec_init_tables);  /* NOTE(review): PROVIDER_BASIC presumably comes from aarch64_multibinary.h, which is not shown here - confirm what it resolves to. */
+}
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c
new file mode 100644
index 0000000000..e001fd72a0
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c
@@ -0,0 +1,264 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "erasure_code.h"
+
+/*
+ * NEON kernels called by the wrappers below.
+ *
+ * gf_[N]vect_dot_prod_neon takes an array of source pointers and N destination
+ * buffers; gf_[N]vect_mad_neon takes a single source buffer plus its index
+ * (vec_i). The single-output variants take one destination buffer, the
+ * multi-output variants take an array of destination pointers.
+ *
+ * NOTE(review): gf_2vect_dot_prod_neon (gf_2vect_dot_prod_neon.S) sets w0 to
+ * 0 on success and 1 when len < 16, but it is declared void here, so that
+ * status is never seen by C callers. Presumably the other kernels behave the
+ * same way; confirm against their sources.
+ */
+extern void gf_vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+extern void gf_2vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_3vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_4vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_5vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_6vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+
+/*
+ * Encode `rows` output buffers from `k` source buffers using the NEON
+ * dot-product kernels.
+ *
+ * len    - passed through to the kernels as their length argument; values
+ *          below 16 are handed to ec_encode_data_base instead
+ * k      - number of source buffers in `data`
+ * rows   - number of output buffers in `coding`
+ * g_tbls - lookup tables, consumed at k * 32 bytes per output row
+ * data   - array of k source buffer pointers
+ * coding - array of `rows` output buffer pointers
+ */
+void ec_encode_data_neon(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
+			 unsigned char **coding)
+{
+	/* Buffers shorter than 16 bytes go to the base implementation. */
+	if (len < 16) {
+		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
+		return;
+	}
+
+	/* Produce five outputs per pass while more than five rows remain. */
+	for (; rows > 5; rows -= 5) {
+		gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding);
+		g_tbls += 5 * k * 32;
+		coding += 5;
+	}
+
+	/* Finish the remaining 1..5 rows with the kernel of matching width. */
+	if (rows == 5)
+		gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding);
+	else if (rows == 4)
+		gf_4vect_dot_prod_neon(len, k, g_tbls, data, coding);
+	else if (rows == 3)
+		gf_3vect_dot_prod_neon(len, k, g_tbls, data, coding);
+	else if (rows == 2)
+		gf_2vect_dot_prod_neon(len, k, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_dot_prod_neon(len, k, g_tbls, data, *coding);
+	/* rows <= 0: nothing to do */
+}
+
+/*
+ * Apply source buffer number `vec_i` (out of `k`) to `rows` output buffers
+ * using the NEON multiply-add kernels.
+ *
+ * len    - passed through to the kernels as their length argument; values
+ *          below 16 are handed to ec_encode_data_update_base instead
+ * k      - total number of source buffers (selects the table stride)
+ * rows   - number of output buffers in `coding`
+ * vec_i  - index of the source buffer supplied in `data`
+ * g_tbls - lookup tables, consumed at k * 32 bytes per output row
+ * data   - the single source buffer
+ * coding - array of `rows` output buffer pointers
+ */
+void ec_encode_data_update_neon(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+				unsigned char *data, unsigned char **coding)
+{
+	/* Buffers shorter than 16 bytes go to the base implementation. */
+	if (len < 16) {
+		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+		return;
+	}
+
+	/* Update six outputs per pass while more than six rows remain. */
+	for (; rows > 6; rows -= 6) {
+		gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+	}
+
+	/* Finish the remaining 1..6 rows with the kernel of matching width. */
+	if (rows == 6)
+		gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 5)
+		gf_5vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 4)
+		gf_4vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 3)
+		gf_3vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 2)
+		gf_2vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_mad_neon(len, k, vec_i, g_tbls, data, *coding);
+	/* rows <= 0: nothing to do */
+}
+
+/*
+ * SVE kernels called by the wrappers below. Argument conventions match the
+ * NEON prototypes: dot_prod variants take an array of source pointers, mad
+ * variants take one source buffer plus its index (vec_i).
+ *
+ * gf_8vect_dot_prod_sve is declared here but is not called by
+ * ec_encode_data_sve in this file.
+ *
+ * NOTE(review): gf_2vect_dot_prod_sve (gf_2vect_dot_prod_sve.S) sets w0 to
+ * 0 on success and 1 when len < 16, but it is declared void here, so that
+ * status is never seen by C callers.
+ */
+extern void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+extern void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+
+/*
+ * Invoke the one SVE dot-product kernel that produces exactly `nrows`
+ * outputs. Handles nrows in 1..7; any other value is a no-op.
+ */
+static void ec_sve_dot_prod_rows(int nrows, int len, int k, unsigned char *g_tbls,
+				 unsigned char **data, unsigned char **coding)
+{
+	if (nrows == 7)
+		gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
+	else if (nrows == 6)
+		gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+	else if (nrows == 5)
+		gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
+	else if (nrows == 4)
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+	else if (nrows == 3)
+		gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding);
+	else if (nrows == 2)
+		gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding);
+	else if (nrows == 1)
+		gf_vect_dot_prod_sve(len, k, g_tbls, data, *coding);
+}
+
+/*
+ * Encode `rows` output buffers from `k` source buffers using the SVE
+ * dot-product kernels.
+ *
+ * len    - passed through to the kernels as their length argument; values
+ *          below 16 are handed to ec_encode_data_base instead
+ * k      - number of source buffers in `data`
+ * rows   - number of output buffers in `coding`
+ * g_tbls - lookup tables, consumed at k * 32 bytes per output row
+ * data   - array of k source buffer pointers
+ * coding - array of `rows` output buffer pointers
+ *
+ * Row scheduling: groups of 6 while more than 11 rows remain; 8..11 remaining
+ * rows are split as (rows - 4) + 4; 1..7 remaining rows use a single kernel.
+ */
+void ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
+			unsigned char **coding)
+{
+	/* Buffers shorter than 16 bytes go to the base implementation. */
+	if (len < 16) {
+		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
+		return;
+	}
+
+	for (; rows > 11; rows -= 6) {
+		gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+	}
+
+	/* 8..11 rows left: run a 4..7-wide kernel first, leaving exactly 4. */
+	if (rows >= 8) {
+		const int head = rows - 4;
+
+		ec_sve_dot_prod_rows(head, len, k, g_tbls, data, coding);
+		g_tbls += head * k * 32;
+		coding += head;
+		rows = 4;
+	}
+
+	/* 1..7 rows left (or none): a single kernel call completes the job. */
+	ec_sve_dot_prod_rows(rows, len, k, g_tbls, data, coding);
+}
+
+/*
+ * Apply source buffer number `vec_i` (out of `k`) to `rows` output buffers
+ * using the SVE multiply-add kernels.
+ *
+ * len    - passed through to the kernels as their length argument; values
+ *          below 16 are handed to ec_encode_data_update_base instead
+ * k      - total number of source buffers (selects the table stride)
+ * rows   - number of output buffers in `coding`
+ * vec_i  - index of the source buffer supplied in `data`
+ * g_tbls - lookup tables, consumed at k * 32 bytes per output row
+ * data   - the single source buffer
+ * coding - array of `rows` output buffer pointers
+ */
+void ec_encode_data_update_sve(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding)
+{
+	/* Buffers shorter than 16 bytes go to the base implementation. */
+	if (len < 16) {
+		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+		return;
+	}
+
+	/* Update six outputs per pass while more than six rows remain. */
+	for (; rows > 6; rows -= 6) {
+		gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+	}
+
+	/* Finish the remaining 1..6 rows with the kernel of matching width. */
+	if (rows == 6)
+		gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 5)
+		gf_5vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 4)
+		gf_4vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 3)
+		gf_3vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 2)
+		gf_2vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_mad_sve(len, k, vec_i, g_tbls, data, *coding);
+	/* rows <= 0: nothing to do */
+}
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S b/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S
new file mode 100644
index 0000000000..c276e63780
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S
@@ -0,0 +1,37 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+/*
+ * One mbin_interface invocation per public erasure-code entry point.
+ * The macro comes from aarch64_multibinary.h (not shown here); presumably it
+ * emits the stub that routes each symbol through its <name>_dispatcher
+ * defined in ec_aarch64_dispatcher.c -- confirm against the header.
+ */
+mbin_interface ec_encode_data
+mbin_interface gf_vect_mul
+mbin_interface gf_vect_dot_prod
+mbin_interface gf_vect_mad
+mbin_interface ec_encode_data_update
+mbin_interface ec_init_tables
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S
new file mode 100644
index 0000000000..4ff7e7ce16
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S
@@ -0,0 +1,402 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+/*
+ * gf_2vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ *                        unsigned char **src, unsigned char **dest)
+ *
+ * For every byte position, looks up the low and high nibble of each of the
+ * vlen source bytes in per-source 16-byte tables and XORs the results into
+ * two outputs, dest[0] and dest[1]. Each source consumes 32 table bytes per
+ * output; the tables for the second output start vlen * 32 bytes after gftbls.
+ *
+ * Returns w0 = 0 on success, 1 when len < 16 (nothing is written).
+ *
+ * NOTE(review): len and vlen are declared int in C but are consumed here as
+ * full 64-bit x registers; confirm callers always pass them with clean upper
+ * 32 bits.
+ */
+.global cdecl(gf_2vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_2vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+/* x_vec_i: byte offset into the src pointer array; x_pos: byte offset into the buffers */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+x_tmp .req x8
+x_tbl1 .req x9
+x_tbl2 .req x10
+x_dest1 .req x11
+x_dest2 .req x12
+
+/* vectors */
+/* low-/high-nibble lookup tables of the current source, one pair per output */
+v_gft1_lo .req v0
+v_gft1_hi .req v1
+v_gft2_lo .req v2
+v_gft2_hi .req v3
+q_gft1_lo .req q0
+q_gft1_hi .req q1
+q_gft2_lo .req q2
+q_gft2_hi .req q3
+
+v_mask0f .req v4
+q_mask0f .req q4
+
+v_tmp1_lo .req v5
+v_tmp1_hi .req v6
+v_tmp1 .req v7
+
+/*
+ * Source data for the 128-byte loop. These are v8-v15, whose low halves
+ * (d8-d15) are saved and restored around that loop by the function body.
+ */
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+v_data_4 .req v12
+v_data_5 .req v13
+v_data_6 .req v14
+v_data_7 .req v15
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+q_data_4 .req q12
+q_data_5 .req q13
+q_data_6 .req q14
+q_data_7 .req q15
+
+/* accumulators for the 128-byte loop: 8 x 16 bytes for each of the two outputs */
+v_p1_0 .req v16
+v_p1_1 .req v17
+v_p1_2 .req v18
+v_p1_3 .req v19
+v_p1_4 .req v20
+v_p1_5 .req v21
+v_p1_6 .req v22
+v_p1_7 .req v23
+v_p2_0 .req v24
+v_p2_1 .req v25
+v_p2_2 .req v26
+v_p2_3 .req v27
+v_p2_4 .req v28
+v_p2_5 .req v29
+v_p2_6 .req v30
+v_p2_7 .req v31
+
+q_p1_0 .req q16
+q_p1_1 .req q17
+q_p1_2 .req q18
+q_p1_3 .req q19
+q_p1_4 .req q20
+q_p1_5 .req q21
+q_p1_6 .req q22
+q_p1_7 .req q23
+q_p2_0 .req q24
+q_p2_1 .req q25
+q_p2_2 .req q26
+q_p2_3 .req q27
+q_p2_4 .req q28
+q_p2_5 .req q29
+q_p2_6 .req q30
+q_p2_7 .req q31
+
+/*
+ * Names used by the 16-byte loop. They alias 128-byte-loop accumulators
+ * (v16-v19 and v24), which are free once that loop has finished.
+ */
+v_p1 .req v_p1_0
+q_p1 .req q_p1_0
+v_p2 .req v_p2_0
+q_p2 .req q_p2_0
+v_data .req v_p1_1
+q_data .req q_p1_1
+v_data_lo .req v_p1_2
+v_data_hi .req v_p1_3
+
+/*
+ * Function body. Structure:
+ *   1. .Lloop128: 128 bytes per iteration while at least 128 bytes remain.
+ *   2. .Lloop16:  16 bytes per iteration for the remainder.
+ *   3. .lessthan16_init: if fewer than 16 bytes are left, recompute the last
+ *      16 bytes of the buffer (overlapping bytes already written).
+ */
+cdecl(gf_2vect_dot_prod_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ mov x_pos, #0
+ /* x_vec = vlen * 8: byte size of the src pointer array, the bound for x_vec_i */
+ lsl x_vec, x_vec, #3
+ /* load the two output buffer pointers, dest[0] and dest[1] */
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop16_init */
+ cmp x_len, #128
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ /* bias x_len by -128 so that "x_pos <= x_len" means a full 128-byte block remains */
+ sub x_len, x_len, #128
+
+.Lloop128:
+ /* clear the 8 + 8 accumulators for this 128-byte block */
+ movi v_p1_0.16b, #0
+ movi v_p1_1.16b, #0
+ movi v_p1_2.16b, #0
+ movi v_p1_3.16b, #0
+ movi v_p1_4.16b, #0
+ movi v_p1_5.16b, #0
+ movi v_p1_6.16b, #0
+ movi v_p1_7.16b, #0
+
+ movi v_p2_0.16b, #0
+ movi v_p2_1.16b, #0
+ movi v_p2_2.16b, #0
+ movi v_p2_3.16b, #0
+ movi v_p2_4.16b, #0
+ movi v_p2_5.16b, #0
+ movi v_p2_6.16b, #0
+ movi v_p2_7.16b, #0
+
+ /* x_tbl2 = x_tbl + vlen * 32 (x_vec already holds vlen * 8) */
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop128_vects:
+ /* x_ptr = src[x_vec_i / 8] + x_pos */
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ add x_ptr, x_ptr, x_pos
+
+ /* load 128 source bytes and this source's two 32-byte table pairs */
+ ldp q_data_0, q_data_1, [x_ptr], #32
+ ldp q_data_2, q_data_3, [x_ptr], #32
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_data_4, q_data_5, [x_ptr], #32
+ ldp q_data_6, q_data_7, [x_ptr], #32
+ /* prefetch the bytes following this block and the next source's tables */
+ prfm pldl1strm, [x_ptr]
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+
+ /* data_0 */
+ /*
+  * Pattern repeated for data_0..data_7: low nibbles go to v_tmp1, high
+  * nibbles replace v_data_N; each indexes a 16-byte table with tbl and
+  * both lookups are XORed into the accumulator of each output.
+  */
+ and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
+ ushr v_data_0.16b, v_data_0.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+ eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
+ eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
+
+ /* data_1 */
+ and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
+ ushr v_data_1.16b, v_data_1.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+ eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+ eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
+ eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
+ eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
+
+ /* data_2 */
+ and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
+ ushr v_data_2.16b, v_data_2.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+ eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+ eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
+ eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
+ eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
+
+ /* data_3 */
+ and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
+ ushr v_data_3.16b, v_data_3.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+ eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+ eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
+ eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
+ eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
+
+ /* data_4 */
+ and v_tmp1.16b, v_data_4.16b, v_mask0f.16b
+ ushr v_data_4.16b, v_data_4.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b
+ eor v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b
+ eor v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b
+ eor v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b
+ eor v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b
+
+ /* data_5 */
+ and v_tmp1.16b, v_data_5.16b, v_mask0f.16b
+ ushr v_data_5.16b, v_data_5.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b
+ eor v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b
+ eor v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b
+ eor v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b
+ eor v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b
+
+ /* data_6 */
+ and v_tmp1.16b, v_data_6.16b, v_mask0f.16b
+ ushr v_data_6.16b, v_data_6.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b
+ eor v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b
+ eor v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b
+ eor v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b
+ eor v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b
+
+ /* data_7 */
+ and v_tmp1.16b, v_data_7.16b, v_mask0f.16b
+ ushr v_data_7.16b, v_data_7.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b
+ eor v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b
+ eor v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b
+ eor v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b
+ eor v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b
+
+ /* next source vector until all vlen sources are folded in */
+ cmp x_vec_i, x_vec
+ blt .Lloop128_vects
+
+.Lloop128_vects_end:
+ /* write 128 result bytes to dest[0] + x_pos and to dest[1] + x_pos */
+ add x_ptr, x_dest1, x_pos
+ stp q_p1_0, q_p1_1, [x_ptr], #32
+ stp q_p1_2, q_p1_3, [x_ptr], #32
+ stp q_p1_4, q_p1_5, [x_ptr], #32
+ stp q_p1_6, q_p1_7, [x_ptr]
+
+ add x_ptr, x_dest2, x_pos
+ stp q_p2_0, q_p2_1, [x_ptr], #32
+ stp q_p2_2, q_p2_3, [x_ptr], #32
+ stp q_p2_4, q_p2_5, [x_ptr], #32
+ stp q_p2_6, q_p2_7, [x_ptr]
+
+ /* x_len is still biased by -128: loop while another full block fits */
+ add x_pos, x_pos, #128
+ cmp x_pos, x_len
+ ble .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ /* undo the -128 bias; finished if the 128-byte loop consumed everything */
+ add x_len, x_len, #128
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ /* bias x_len by -16; if fewer than 16 bytes remain go straight to the tail */
+ sub x_len, x_len, #16
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p1.16b, #0
+ movi v_p2.16b, #0
+ /* x_tbl2 = x_tbl + vlen * 32 (x_vec holds vlen * 8) */
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ /* load 16 bytes of the current source at offset x_pos */
+ ldr x_ptr, [x_src, x_vec_i]
+ ldr q_data, [x_ptr, x_pos]
+ add x_vec_i, x_vec_i, #8
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* split each byte into low and high nibble */
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ eor v_p1.16b, v_tmp1_lo.16b, v_p1.16b
+ eor v_p1.16b, v_p1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ eor v_p2.16b, v_tmp1_lo.16b, v_p2.16b
+ eor v_p2.16b, v_p2.16b, v_tmp1_hi.16b
+
+ /* note: exits only on equality (bne), unlike the blt used in the 128-byte loop */
+ cmp x_vec_i, x_vec
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q_p1, [x_dest1, x_pos]
+ str q_p2, [x_dest2, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ /* x_len is len - 16 here, so x_pos - x_len == 16 means x_pos reached len */
+ sub x_tmp, x_pos, x_len
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ /* 1..15 bytes left: rewind to len - 16 and redo one overlapping 16-byte pass */
+ mov x_pos, x_len
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S
new file mode 100644
index 0000000000..99b5f15cfb
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S
@@ -0,0 +1,168 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_2vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_2vect_dot_prod_sve, %function
+#endif
+/* void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+ For every byte position, looks up the low and high nibble of each of the
+ vlen source bytes in per-source 16-byte tables and XORs the results into
+ two outputs, dest[0] and dest[1]. The function sets w0 to 0 on success
+ and to 1 when len < 16 (in which case nothing is written).
+
+ NOTE(review): len and vlen are declared int in C but are consumed here as
+ full 64-bit x registers; confirm callers always pass them with clean upper
+ 32 bits.
+ */
+
+/* arguments */
+x_len .req x0 /* length in bytes of each src/dest buffer */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the src pointer array */
+x_ptr .req x6 /* base address of the current source buffer */
+x_pos .req x7 /* byte offset into the src/dest buffers */
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_dest1 .req x10
+x_dest2 .req x_dest /* reuses x4: overwritten with dest[1] once both pointers are loaded */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src /* high nibbles overwrite the source register in place */
+
+z_dest1 .req z3
+
+/* the q names below are the low 128 bits of the same-numbered z registers */
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used (this file uses none of them) */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_dest2 .req z27
+
+/*
+ * gf_2vect_dot_prod_sve body.
+ *
+ * In:  x0 = len, x1 = vlen, x2 = gftbls, x3 = src (array of vlen pointers),
+ *      x4 = dest (array of 2 pointers)
+ * Out: w0 = 0 on success, 1 when len < 16 (nothing written)
+ *
+ * Outer loop: one SVE vector length of bytes per iteration, predicated by p0
+ * so the final partial vector needs no scalar tail.
+ * Inner loop: one source vector per iteration; its low/high nibbles index the
+ * 16-byte tables of both outputs and the lookups are XORed into the
+ * accumulators.
+ *
+ * The source pointer is loaded at the top of each inner iteration. Loading
+ * "the next" pointer at the bottom would read src[vlen], one element past
+ * the end of the caller's pointer array, on the last iteration.
+ */
+cdecl(gf_2vect_dot_prod_sve):
+	/* less than 16 bytes, return_fail */
+	cmp	x_len, #16
+	blt	.return_fail
+
+	mov	z_mask0f.b, #0x0f		/* z_mask0f = 0x0F0F...0F */
+	mov	x_pos, #0
+	lsl	x_vec, x_vec, #3		/* x_vec = vlen * 8: byte size of the src pointer array */
+	ldp	x_dest1, x_dest2, [x_dest, #8*0]	/* dest[0], dest[1]; x_dest2 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+	whilelo	p0.b, x_pos, x_len		/* p0 = lanes with x_pos + lane < x_len */
+	b.none	.return_pass			/* no active lane left: done */
+
+	mov	x_vec_i, #0			/* clear x_vec_i */
+
+	mov	z_dest1.b, #0			/* clear z_dest1 */
+	mov	z_dest2.b, #0			/* clear z_dest2 */
+
+	/* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+	mov	x_tbl1, x_tbl			/* reset x_tbl1 */
+	add	x_tbl2, x_tbl1, x_vec, LSL #2	/* reset x_tbl2: x_tbl + vlen * 32 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+	/* x_ptr: base address of the current source; x_vec_i is always in range here */
+	ldr	x_ptr, [x_src, x_vec_i]
+
+	/* load src data, governed by p0 */
+	ld1b	z_src.b, p0/z, [x_ptr, x_pos]	/* load from: src base + pos offset */
+	/* split 4-bit lo; 4-bit hi */
+	and	z_src_lo.d, z_src.d, z_mask0f.d
+	lsr	z_src_hi.b, z_src.b, #4
+
+	/* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf_table's */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-added by #32 for each src vect */
+	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+	/* prefetch the next source's tables */
+	prfb	pldl2keep, p0, [x_tbl1]
+	prfb	pldl2keep, p0, [x_tbl2]
+
+	add	x_vec_i, x_vec_i, #8		/* move x_vec_i to next */
+
+	/* dest 1 */
+	/* table indexing, ie. gf(2^8) multiplication */
+	tbl	z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+	tbl	z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, ie. gf(2^8) add */
+	eor	z_dest1.d, z_gft1_lo.d, z_dest1.d
+	eor	z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+	/* dest 2 */
+	tbl	z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+	tbl	z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+	eor	z_dest2.d, z_gft2_lo.d, z_dest2.d
+	eor	z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+	cmp	x_vec_i, x_vec
+	blt	.Lloopsve_vl_vects
+/* end of Loop 2 */
+
+	/* store dest data, governed by p0 */
+	st1b	z_dest1.b, p0, [x_dest1, x_pos]
+	st1b	z_dest2.b, p0, [x_dest2, x_pos]
+
+	/* increment one vector length */
+	incb	x_pos
+	b	.Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+	mov	w_ret, #0
+	ret
+
+.return_fail:
+	mov	w_ret, #1
+	ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S
new file mode 100644
index 0000000000..453524a221
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S
@@ -0,0 +1,411 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_2vect_mad_neon)
+#ifndef __APPLE__
+.type gf_2vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0 /* byte count to process; less than 16 is rejected */
+x_vec .req x1 /* scaled by 32 on entry: byte distance from table 1 to table 2 */
+x_vec_i .req x2 /* scaled by 32 on entry: byte offset of table 1 inside x_tbl */
+x_tbl .req x3 /* base address of the lookup tables */
+x_src .req x4 /* source bytes; advanced as the loops consume them */
+x_dest .req x5 /* array of two destination pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = done, 1 = len < 16 */
+
+/* local variables */
+x_src_end .req x6 /* x_src + x_len, temporarily biased by -128 / -16 to serve as a loop bound */
+x_dest1 .req x7 /* dest[0] cursor */
+x_dest2 .req x8 /* dest[1] cursor */
+x_tmp .req x9 /* tail: bytes of the final 16-byte window that were already processed */
+x_tbl1 .req x10 /* address of table 1 (16 bytes lo half, 16 bytes hi half) */
+x_tbl2 .req x11 /* address of table 2 */
+x_const .req x12 /* address inside const_tbl used to fetch the tail mask */
+
+/* vectors */
+v_mask0f .req v0 /* 0x0f in every byte */
+v_tmp_lo .req v1 /* lookup result for the low nibbles */
+v_tmp_hi .req v2 /* lookup result for the high nibbles */
+v_tmp .req v3 /* tail byte mask: 0x00 = leave dest byte alone, 0xff = update it */
+q_tmp .req q3
+
+v_gft1_lo .req v4 /* table 1, indexed by the low nibble */
+v_gft1_hi .req v5 /* table 1, indexed by the high nibble */
+v_gft2_lo .req v6 /* table 2, indexed by the low nibble */
+v_gft2_hi .req v7 /* table 2, indexed by the high nibble */
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+
+v_data_0 .req v8 /* v8..v15 are used by the 128-byte loop only; d8..d15 are spilled around it */
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+v_data_4 .req v12
+v_data_5 .req v13
+v_data_6 .req v14
+v_data_7 .req v15
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+q_data_4 .req q12
+q_data_5 .req q13
+q_data_6 .req q14
+q_data_7 .req q15
+
+v_data_0_lo .req v16 /* low nibbles of the eight source blocks */
+v_data_1_lo .req v17
+v_data_2_lo .req v18
+v_data_3_lo .req v19
+v_data_4_lo .req v20
+v_data_5_lo .req v21
+v_data_6_lo .req v22
+v_data_7_lo .req v23
+v_data_0_hi .req v_data_0 /* high nibbles overwrite the source block in place */
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+v_data_4_hi .req v_data_4
+v_data_5_hi .req v_data_5
+v_data_6_hi .req v_data_6
+v_data_7_hi .req v_data_7
+
+v_d0 .req v24 /* destination blocks: loaded, XOR-updated, stored back */
+v_d1 .req v25
+v_d2 .req v26
+v_d3 .req v27
+v_d4 .req v28
+v_d5 .req v29
+v_d6 .req v30
+v_d7 .req v31
+q_d0 .req q24
+q_d1 .req q25
+q_d2 .req q26
+q_d3 .req q27
+q_d4 .req q28
+q_d5 .req q29
+q_d6 .req q30
+q_d7 .req q31
+
+v_data .req v16 /* 16-byte loop and tail only: reuses v16..v18 (v_data_0_lo..v_data_2_lo) */
+q_data .req q16
+v_data_lo .req v17
+v_data_hi .req v18
+
+
+cdecl(gf_2vect_mad_neon):
+ /* for every source byte b: dest1 ^= gft1_lo[b & 0xf] ^ gft1_hi[b >> 4], dest2 likewise with gft2; less than 16 bytes, return_fail */
+ cmp x_len, #16 /* NOTE(review): uses all 64 bits of x0; if len is a C int its upper half is unspecified under AAPCS64 - confirm prototype */
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5 /* each table occupies 32 bytes */
+ lsl x_vec, x_vec, #5
+ add x_tbl1, x_tbl, x_vec_i /* table 1 = x_tbl + vec_i * 32 */
+ add x_tbl2, x_tbl1, x_vec /* table 2 = table 1 + vec * 32 */
+ add x_src_end, x_src, x_len
+
+ ldr x_dest1, [x_dest]
+ ldr x_dest2, [x_dest, #8]
+ ldr q_gft1_lo, [x_tbl1] /* both tables are loaded once and stay in v4..v7 */
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop16_init */
+ cmp x_len, #128
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack: v8..v15 hold the source data inside Lloop128 */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #128 /* bound = last address at which a full 128-byte block starts */
+
+.Lloop128:
+ ldr q_data_0, [x_src, #16*0] /* 128 source bytes */
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ ldr q_data_4, [x_src, #16*4]
+ ldr q_data_5, [x_src, #16*5]
+ ldr q_data_6, [x_src, #16*6]
+ ldr q_data_7, [x_src, #16*7]
+
+ ldr q_d0, [x_dest1, #16*0] /* current contents of dest1 */
+ ldr q_d1, [x_dest1, #16*1]
+ ldr q_d2, [x_dest1, #16*2]
+ ldr q_d3, [x_dest1, #16*3]
+ ldr q_d4, [x_dest1, #16*4]
+ ldr q_d5, [x_dest1, #16*5]
+ ldr q_d6, [x_dest1, #16*6]
+ ldr q_d7, [x_dest1, #16*7]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b /* low nibbles -> v16..v23 */
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
+ and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
+ and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
+ and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4 /* high nibbles, written over the source data (v8..v15) */
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+ ushr v_data_4_hi.16b, v_data_4.16b, #4
+ ushr v_data_5_hi.16b, v_data_5.16b, #4
+ ushr v_data_6_hi.16b, v_data_6.16b, #4
+ ushr v_data_7_hi.16b, v_data_7.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b /* dest1, block 0: table 1 lookups ... */
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d0.16b, v_tmp_lo.16b, v_d0.16b /* ... XORed into the loaded dest block */
+ eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
+ eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
+ eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
+ eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
+ eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
+ eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
+ eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
+ eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
+ eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
+ eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
+ eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
+ eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
+
+ str q_d0, [x_dest1, #16*0] /* dest1 is written back before v24..v31 are reused for dest2 */
+ str q_d1, [x_dest1, #16*1]
+ str q_d2, [x_dest1, #16*2]
+ str q_d3, [x_dest1, #16*3]
+ str q_d4, [x_dest1, #16*4]
+ str q_d5, [x_dest1, #16*5]
+ str q_d6, [x_dest1, #16*6]
+ str q_d7, [x_dest1, #16*7]
+
+ ldr q_d0, [x_dest2, #16*0] /* same source nibbles, now against dest2 and table 2 */
+ ldr q_d1, [x_dest2, #16*1]
+ ldr q_d2, [x_dest2, #16*2]
+ ldr q_d3, [x_dest2, #16*3]
+ ldr q_d4, [x_dest2, #16*4]
+ ldr q_d5, [x_dest2, #16*5]
+ ldr q_d6, [x_dest2, #16*6]
+ ldr q_d7, [x_dest2, #16*7]
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
+ eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
+ eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
+ eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
+ eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b
+ eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
+ eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b
+ eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
+ eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b
+ eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
+ eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b
+ eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
+ eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
+
+ str q_d0, [x_dest2, #16*0]
+ str q_d1, [x_dest2, #16*1]
+ str q_d2, [x_dest2, #16*2]
+ str q_d3, [x_dest2, #16*3]
+ str q_d4, [x_dest2, #16*4]
+ str q_d5, [x_dest2, #16*5]
+ str q_d6, [x_dest2, #16*6]
+ str q_d7, [x_dest2, #16*7]
+
+ add x_src, x_src, #128
+ add x_dest1, x_dest1, #128
+ add x_dest2, x_dest2, #128
+ cmp x_src, x_src_end
+ bls .Lloop128 /* unsigned <=: another full 128-byte block still fits */
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #128 /* undo the bias: x_src_end = src + len again */
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16 /* bound = last address at which a full 16-byte block starts */
+ cmp x_src, x_src_end
+ bhi .lessthan16_init /* fewer than 16 bytes remain */
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d0, [x_dest1]
+ ldr q_d1, [x_dest2]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
+ eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
+ eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
+
+ str q_d0, [x_dest1]
+ str q_d1, [x_dest2]
+
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_src, x_src, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end /* x_src - (end - 16): 16 = nothing left, 1..15 = partial tail */
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16:
+ mov x_src, x_src_end /* re-read the final 16 bytes of the buffer */
+ sub x_dest1, x_dest1, x_tmp /* step both dest cursors back by the same overlap */
+ sub x_dest2, x_dest2, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16] /* mask = 16 bytes at const_tbl + 16 - x_tmp: x_tmp bytes of 0x00, then 0xff */
+
+ ldr q_data, [x_src]
+ ldr q_d0, [x_dest1]
+ ldr q_d1, [x_dest2]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b /* zero the product for bytes that were already updated */
+ eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
+
+ str q_d0, [x_dest1]
+ str q_d1, [x_dest2]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl: /* tail mask source: the .lessthan16 path loads 16 bytes starting at const_tbl + 16 - x_tmp */
+ .dword 0x0000000000000000, 0x0000000000000000 /* 16 bytes of 0x00: dest bytes to leave untouched */
+ .dword 0xffffffffffffffff, 0xffffffffffffffff /* 16 bytes of 0xff: dest bytes to update */
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S
new file mode 100644
index 0000000000..f0ddf01187
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S
@@ -0,0 +1,152 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_2vect_mad_sve)
+#ifndef __APPLE__
+.type gf_2vect_mad_sve, %function
+#endif
+
+/* gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0 /* byte count to process; less than 16 is rejected */
+x_vec .req x1 /* table 2 lies vec * 32 bytes after table 1 */
+x_vec_i .req x2 /* table 1 lies vec_i * 32 bytes after x_tbl */
+x_tbl .req x3 /* table base; advanced to table 1, then to table 2 */
+x_src .req x4 /* source bytes */
+x_dest .req x5 /* array of two destination pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = done, 1 = len < 16 */
+
+/* local variables */
+x_pos .req x6 /* byte offset of the current pass; grows by one SVE vector length per pass */
+x_dest2 .req x7
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0 /* 0x0f in every byte */
+
+z_src .req z1
+z_src_lo .req z2 /* low nibbles of the source bytes */
+z_src_hi .req z_src /* high nibbles overwrite the source data in place */
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4 /* lookup result for the low nibbles */
+z_tmp_hi .req z5 /* lookup result for the high nibbles */
+
+z_gft1_lo .req z6 /* q6/q7 and z6/z7 name the same registers: tables are loaded via q, used via z */
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17 /* z17/z18 chosen so that nothing needs to be saved */
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_dest2 .req z27
+
+cdecl(gf_2vect_mad_sve):
+ sxtw x_len, w0 /* len is a C int: bits 63:32 of x0 are unspecified (AAPCS64), so widen before any 64-bit use */
+ /* dest1/dest2 ^= table lookups of src over len bytes; less than 16 bytes (or a negative len), return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* table 1: x_tbl += (int)vec_i * 32; the NEON ldp fills the low 128 bits of z_gft1_lo/hi */
+ sbfiz x_vec_i, x_vec_i, #5, #32 /* sign-extend w2 and scale by 2^5 in one step */
+ add x_tbl, x_tbl, x_vec_i
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* table 2: x_tbl += (int)vec * 32 */
+ sbfiz x_vec, x_vec, #5, #32 /* sign-extend w1 and scale by 2^5 in one step */
+ add x_tbl, x_tbl, x_vec
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+
+ mov x_pos, #0
+ /* vector length agnostic: each pass handles up to one SVE vector of bytes, p0 masks the last partial pass */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0 = lanes whose byte offset is still below len */
+ b.none .return_pass /* no active lane: all len bytes are done */
+
+ /* prefetch dest data */
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4 /* z_src_hi aliases z_src: the raw bytes are gone after this */
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S
new file mode 100644
index 0000000000..cff34fc3dd
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S
@@ -0,0 +1,361 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_3vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_3vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0 /* byte count per buffer; less than 16 is rejected; biased by -64 / -16 as a loop bound */
+x_vec .req x1 /* number of source vectors; scaled by 8 on entry (byte size of the x_src array) */
+x_tbl .req x2 /* base of the lookup tables: 32 bytes per (dest, source) pair */
+x_src .req x3 /* array of source pointers */
+x_dest .req x4 /* array of three destination pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = done, 1 = len < 16 */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the x_src array */
+x_ptr .req x6 /* current source pointer, or a store cursor after the inner loop */
+x_pos .req x7 /* byte offset of the block being computed */
+x_tmp .req x8
+x_dest1 .req x9
+x_tbl1 .req x10 /* table cursors, post-incremented by 32 per source vector */
+x_dest2 .req x11
+x_tbl2 .req x12
+x_dest3 .req x13
+x_tbl3 .req x14
+
+/* vectors */
+v_gft1_lo .req v0 /* per-source tables; the 16-byte loop overwrites them with the lookup results */
+v_gft1_hi .req v1
+v_gft2_lo .req v2
+v_gft2_hi .req v3
+v_gft3_lo .req v4
+v_gft3_hi .req v5
+q_gft1_lo .req q0
+q_gft1_hi .req q1
+q_gft2_lo .req q2
+q_gft2_hi .req q3
+q_gft3_lo .req q4
+q_gft3_hi .req q5
+
+v_mask0f .req v6 /* 0x0f in every byte */
+q_mask0f .req q6
+v_tmp1 .req v7 /* low nibbles of the block being processed (64-byte loop) */
+
+v_data_0 .req v8 /* v8..v13 are used by the 64-byte loop only; d8..d15 are spilled around it */
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_tmp1_lo .req v12 /* lookup results (64-byte loop) */
+v_tmp1_hi .req v13
+
+v_p1_0 .req v20 /* v_pK_N: accumulator for dest K, 16-byte block N */
+v_p1_1 .req v21
+v_p1_2 .req v22
+v_p1_3 .req v23
+v_p2_0 .req v24
+v_p2_1 .req v25
+v_p2_2 .req v26
+v_p2_3 .req v27
+v_p3_0 .req v28
+v_p3_1 .req v29
+v_p3_2 .req v30
+v_p3_3 .req v31
+
+q_p1_0 .req q20
+q_p1_1 .req q21
+q_p1_2 .req q22
+q_p1_3 .req q23
+q_p2_0 .req q24
+q_p2_1 .req q25
+q_p2_2 .req q26
+q_p2_3 .req q27
+q_p3_0 .req q28
+q_p3_1 .req q29
+q_p3_2 .req q30
+q_p3_3 .req q31
+
+v_data .req v_p1_1 /* 16-byte loop only: it accumulates in v_pK_0, so v21..v23 are free for data */
+q_data .req q_p1_1
+v_data_lo .req v_p1_2
+v_data_hi .req v_p1_3
+
+
+cdecl(gf_3vect_dot_prod_neon):
+ /* destK[i] = XOR over all source vectors of (lo/hi nibble table lookups), K = 1..3; less than 16 bytes, return_fail */
+ cmp x_len, #16 /* NOTE(review): uses all 64 bits of x0; if len is a C int its upper half is unspecified under AAPCS64 - confirm prototype */
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* x_vec = vec * 8: byte size of the source pointer array */
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack: v8..v13 are used inside Lloop64 */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #64 /* bias: x_pos <= x_len now means a full 64-byte block fits */
+
+.Lloop64:
+ movi v_p1_0.16b, #0 /* outputs are built from zero: previous dest contents are not read */
+ movi v_p1_1.16b, #0
+ movi v_p1_2.16b, #0
+ movi v_p1_3.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p2_1.16b, #0
+ movi v_p2_2.16b, #0
+ movi v_p2_3.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p3_1.16b, #0
+ movi v_p3_2.16b, #0
+ movi v_p3_3.16b, #0
+
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl1, x_vec, lsl #2 /* x_vec << 2 = vec * 32: size of one dest's table set */
+ add x_tbl3, x_tbl2, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop64_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr = next source pointer */
+ add x_vec_i, x_vec_i, #8
+ add x_ptr, x_ptr, x_pos
+
+ ldr q_data_0, [x_ptr], #16
+ ldr q_data_1, [x_ptr], #16
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* this source's table for each dest; cursors move to the next source */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+
+ ldr q_data_2, [x_ptr], #16
+ ldr q_data_3, [x_ptr], #16
+ prfm pldl1strm, [x_ptr]
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+ prfm pldl1keep, [x_tbl3]
+
+ /* data_0 */
+ and v_tmp1.16b, v_data_0.16b, v_mask0f.16b /* low nibbles */
+ ushr v_data_0.16b, v_data_0.16b, #4 /* high nibbles, in place */
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+ eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
+ eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
+ eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
+
+ /* data_1 */
+ and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
+ ushr v_data_1.16b, v_data_1.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+ eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+ eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
+ eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
+ eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
+ eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
+ eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
+
+ /* data_2 */
+ and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
+ ushr v_data_2.16b, v_data_2.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+ eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+ eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
+ eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
+ eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
+ eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
+ eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
+
+ /* data_3 */
+ and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
+ ushr v_data_3.16b, v_data_3.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+ eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+ eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
+ eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
+ eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
+ eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
+ eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop64_vects /* more source vectors to fold in */
+
+.Lloop64_vects_end:
+ add x_ptr, x_dest1, x_pos /* x_ptr doubles as the store cursor */
+ stp q_p1_0, q_p1_1, [x_ptr], #32
+ stp q_p1_2, q_p1_3, [x_ptr]
+
+ add x_ptr, x_dest2, x_pos
+ stp q_p2_0, q_p2_1, [x_ptr], #32
+ stp q_p2_2, q_p2_3, [x_ptr]
+
+ add x_ptr, x_dest3, x_pos
+ stp q_p3_0, q_p3_1, [x_ptr], #32
+ stp q_p3_2, q_p3_3, [x_ptr]
+
+ add x_pos, x_pos, #64
+ cmp x_pos, x_len
+ ble .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #64 /* undo the bias */
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ sub x_len, x_len, #16 /* bias: x_pos <= x_len now means a full 16-byte block fits */
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p1_0.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p3_0.16b, #0
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl1, x_vec, lsl #2
+ add x_tbl3, x_tbl2, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ ldr q_data, [x_ptr, x_pos]
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b /* lookup results replace the tables; they are reloaded per source */
+ tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+
+ eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
+ eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
+ eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
+
+ cmp x_vec_i, x_vec
+ bne .Lloop16_vects /* NOTE(review): Lloop64_vects uses blt here; the two only agree when vec >= 1 */
+
+.Lloop16_vects_end:
+ str q_p1_0, [x_dest1, x_pos]
+ str q_p2_0, [x_dest2, x_pos]
+ str q_p3_0, [x_dest3, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len /* x_len is still biased by -16: a difference of 16 means x_pos reached len */
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ mov x_pos, x_len /* x_pos = len - 16: redo the final 16 bytes; overlapping bytes are recomputed from scratch */
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S
new file mode 100644
index 0000000000..8f6414ee52
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S
@@ -0,0 +1,189 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_3vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_3vect_dot_prod_sve, %function
+#endif
+/* void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* byte count per src/dest buffer (not the SVE vector length) */
+x_vec .req x1 /* number of source vectors (ie. data blocks); scaled by 8 on entry */
+x_tbl .req x2 /* base of the lookup tables: 32 bytes per (dest, source) pair */
+x_src .req x3 /* array of source pointers */
+x_dest .req x4 /* array of three destination pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = done, 1 = len < 16 */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the x_src array */
+x_ptr .req x6 /* current source pointer */
+x_pos .req x7 /* byte offset of the current pass; grows by one SVE vector length per pass */
+
+x_tbl1 .req x8 /* table cursors, post-incremented by 32 per source vector */
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_dest1 .req x11
+x_dest2 .req x12
+x_dest3 .req x_dest /* reused: x_dest is dead once the third pointer has been loaded through it */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f .req z0 /* 0x0f in every byte */
+
+z_src .req z1
+z_src_lo .req z2 /* low nibbles of the source bytes */
+z_src_hi .req z_src /* high nibbles overwrite the source data in place */
+
+z_dest1 .req z3 /* accumulator for dest 1 */
+
+z_gft1_lo .req z4 /* qN and zN name the same registers: tables are loaded via q, used via z */
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17 /* z17..z20 chosen so that nothing needs to be saved */
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_dest2 .req z27 /* accumulator for dest 2 */
+z_dest3 .req z28 /* accumulator for dest 3 */
+
+cdecl(gf_3vect_dot_prod_sve):
+ sxtw x_len, w0 /* len is a C int: bits 63:32 of x0 are unspecified (AAPCS64), so widen before any 64-bit use */
+ /* destK = XOR over all source vectors of the nibble table lookups, K = 1..3; less than 16 bytes (or negative), return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ sbfiz x_vec, x_vec, #3, #32 /* x_vec = (int)vlen * 8, byte size of src[]; also sign-extends w1 */
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldr x_dest3, [x_dest, #8*2] /* x_dest3 aliases x_dest, so this must be the last load through it */
+
+/* Loop 1: x_len, one SVE vector of bytes per pass */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0 = lanes whose byte offset is still below len */
+ b.none .return_pass /* no active lane: all len bytes are done */
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* fetch the pointer only for the vector about to be used, so src[vlen] (one past the array) is never read */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* advance to the next source vector */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ prfb pldl2keep, p0, [x_tbl3]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S
new file mode 100644
index 0000000000..fcfeec1e23
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S
@@ -0,0 +1,391 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_3vect_mad_neon)
+#ifndef __APPLE__
+.type gf_3vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0 /* byte count; the routine fails when it is below 16 */
+x_vec .req x1 /* shifted left by 5 on entry, then used as the byte stride between the three tables */
+x_vec_i .req x2 /* shifted left by 5 on entry, then used as the byte offset of table 1 from x_tbl */
+x_tbl .req x3 /* base address of the lookup tables */
+x_src .req x4 /* source pointer, advanced as data is consumed */
+x_dest .req x5 /* address of an array of three destination pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = pass, 1 = fail; shares register 0 with x_len */
+
+/* local variables */
+x_src_end .req x6 /* x_src + x_len, temporarily biased by -64 / -16 while the loops run */
+x_dest1 .req x7
+x_dest2 .req x8
+x_dest3 .req x_dest /* reuses x5, so it is loaded after dest1/dest2 have been read through x_dest */
+x_tmp .req x10 /* tail only: number of already-processed bytes inside the final 16-byte window */
+x_tbl1 .req x11
+x_tbl2 .req x12
+x_tbl3 .req x13
+x_const .req x14 /* tail only: address used to fetch the byte mask from const_tbl */
+
+/* vectors */
+v_mask0f .req v0 /* 0x0f in every byte: isolates the low nibble */
+v_tmp_lo .req v1 /* result of the low-nibble table lookup */
+v_tmp_hi .req v2 /* result of the high-nibble table lookup */
+v_tmp .req v3 /* tail only: byte mask loaded from const_tbl */
+q_tmp .req q3
+
+v_gft1_lo .req v4 /* gftN_lo is indexed by low nibbles, gftN_hi by high nibbles */
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+v_gft3_lo .req v16
+v_gft3_hi .req v17
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+q_gft3_lo .req q16
+q_gft3_hi .req q17
+
+v_data_0 .req v8 /* v8..v15 are used only in the 64-byte loop, which saves/restores d8..d15 */
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0 /* high nibbles overwrite the raw data registers in place */
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_d1_0 .req v20 /* v_dK_J: 16-byte chunk J of destination K */
+v_d1_1 .req v21
+v_d1_2 .req v22
+v_d1_3 .req v23
+v_d2_0 .req v24
+v_d2_1 .req v25
+v_d2_2 .req v26
+v_d2_3 .req v27
+v_d3_0 .req v28
+v_d3_1 .req v29
+v_d3_2 .req v30
+v_d3_3 .req v31
+q_d1_0 .req q20
+q_d1_1 .req q21
+q_d1_2 .req q22
+q_d1_3 .req q23
+q_d2_0 .req q24
+q_d2_1 .req q25
+q_d2_2 .req q26
+q_d2_3 .req q27
+q_d3_0 .req q28
+q_d3_1 .req q29
+q_d3_2 .req q30
+q_d3_3 .req q31
+
+v_data .req v21 /* 16-byte paths only: same register as v_d1_1, which those paths do not use */
+q_data .req q21
+v_data_lo .req v22 /* 16-byte paths only: same register as v_d1_2 */
+v_data_hi .req v23 /* 16-byte paths only: same register as v_d1_3 */
+
+cdecl(gf_3vect_mad_neon): /* for k = 1..3: destK[i] ^= gftK_lo[src[i] & 0x0f] ^ gftK_hi[src[i] >> 4], i in [0, len); returns 0, or 1 if len < 16 */
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5 /* x_vec_i *= 32: one lo/hi table pair occupies 32 bytes */
+ lsl x_vec, x_vec, #5 /* x_vec *= 32 */
+ add x_tbl1, x_tbl, x_vec_i /* x_tbl1 = x_tbl + vec_i * 32 */
+ add x_tbl2, x_tbl1, x_vec /* x_tbl2 = x_tbl1 + vec * 32 */
+ add x_tbl3, x_tbl2, x_vec /* x_tbl3 = x_tbl2 + vec * 32 */
+ add x_src_end, x_src, x_len
+ ldr x_dest1, [x_dest]
+ ldr x_dest2, [x_dest, #8]
+ ldr x_dest3, [x_dest, #16] /* x_dest3 aliases x_dest, so this load has to be the last one through x_dest */
+ ldr q_gft1_lo, [x_tbl1] /* the six tables are loaded once and stay resident for all loops */
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #64 /* bias the end so the loop test means "a full 64-byte chunk remains" */
+
+.Lloop64:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ add x_src, x_src, #64
+
+ ldr q_d1_0, [x_dest1, #16*0] /* current dest contents are read because results are XOR-accumulated */
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+
+ ldr q_d2_0, [x_dest2, #16*0]
+ ldr q_d2_1, [x_dest2, #16*1]
+ ldr q_d2_2, [x_dest2, #16*2]
+ ldr q_d2_3, [x_dest2, #16*3]
+
+ ldr q_d3_0, [x_dest3, #16*0]
+ ldr q_d3_1, [x_dest3, #16*1]
+ ldr q_d3_2, [x_dest3, #16*2]
+ ldr q_d3_3, [x_dest3, #16*3]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b /* low nibbles of each source byte */
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4 /* high nibbles; overwrites the raw data register (alias) */
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b /* dest1, chunks 0..3, table pair 1 */
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b /* dest2, chunks 0..3, table pair 2 */
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
+ eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
+ eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
+ eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b /* dest3, chunks 0..3, table pair 3 */
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
+ eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
+ eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
+ eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
+ eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
+ eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
+ eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ add x_dest1, x_dest1, #64
+
+ str q_d2_0, [x_dest2, #16*0]
+ str q_d2_1, [x_dest2, #16*1]
+ str q_d2_2, [x_dest2, #16*2]
+ str q_d2_3, [x_dest2, #16*3]
+ add x_dest2, x_dest2, #64
+
+ str q_d3_0, [x_dest3, #16*0]
+ str q_d3_1, [x_dest3, #16*1]
+ str q_d3_2, [x_dest3, #16*2]
+ str q_d3_3, [x_dest3, #16*3]
+ add x_dest3, x_dest3, #64
+
+ cmp x_src, x_src_end
+ bls .Lloop64 /* unsigned: continue while x_src <= end - 64 */
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #64 /* undo the -64 bias: x_src_end is the true end again */
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16 /* x_src_end = end - 16, i.e. start of the last full 16-byte window */
+ cmp x_src, x_src_end
+ bhi .lessthan16_init /* fewer than 16 bytes left: skip straight to the tail check */
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ add x_src, x_src, #16
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_dest3, x_dest3, #16
+ cmp x_src, x_src_end
+ bls .Lloop16 /* unsigned: continue while x_src <= end - 16 */
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end /* x_tmp = 16 - (bytes still unprocessed) */
+ cmp x_tmp, #16
+ beq .return_pass /* x_src reached the true end: nothing left to do */
+
+.lessthan16:
+ mov x_src, x_src_end /* step back to the last 16 bytes of src (overlaps processed data) */
+ sub x_dest1, x_dest1, x_tmp /* step every dest back by the same overlap */
+ sub x_dest2, x_dest2, x_tmp
+ sub x_dest3, x_dest3, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16] /* mask = x_tmp bytes of 0x00 followed by (16 - x_tmp) bytes of 0xff */
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b /* zero the product for bytes that were already accumulated */
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl: /* tail mask source: a 16-byte load at const_tbl + 16 - n yields n bytes of 0x00 then (16 - n) bytes of 0xff */
+ .dword 0x0000000000000000, 0x0000000000000000 /* 16 bytes of 0x00 */
+ .dword 0xffffffffffffffff, 0xffffffffffffffff /* 16 bytes of 0xff */
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S
new file mode 100644
index 0000000000..9e0ca5c4b3
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S
@@ -0,0 +1,175 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_3vect_mad_sve)
+#ifndef __APPLE__
+.type gf_3vect_mad_sve, %function
+#endif
+
+/* gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0 /* byte count; the routine fails when it is below 16 */
+x_vec .req x1 /* multiplied by 32 (LSL #5) to step from one table pair to the next */
+x_vec_i .req x2 /* multiplied by 32 (LSL #5) to locate table pair 1 from x_tbl */
+x_tbl .req x3 /* table base; advanced in place while the three table pairs are loaded */
+x_src .req x4 /* source base, indexed by x_pos */
+x_dest .req x5 /* address of an array of three destination pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = pass, 1 = fail; shares register 0 with x_len */
+
+/* local variables */
+x_pos .req x6 /* byte offset of the current vector-length chunk */
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0 /* 0x0f in every byte: isolates the low nibble */
+
+z_src .req z1
+z_src_lo .req z2 /* low nibbles of the source bytes */
+z_src_hi .req z_src /* high nibbles overwrite z_src in place */
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4 /* result of the low-nibble table lookup */
+z_tmp_hi .req z5 /* result of the high-nibble table lookup */
+
+z_gft1_lo .req z6 /* q_* names are the 128-bit views used by ldp to load the 16-byte tables */
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17 /* z8..z15 are not used by this file, so nothing is saved */
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_dest2 .req z27
+z_dest3 .req z28
+
+cdecl(gf_3vect_mad_sve): /* for k = 1..3: destK[i] ^= gftK_lo[src[i] & 0x0f] ^ gftK_hi[src[i] >> 4], i in [0, len); returns 0, or 1 if len < 16 */
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load table 1 with NEON instruction ldp */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl] /* 16 bytes each: lo table, then hi table */
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0 = byte lanes with x_pos + lane < x_len */
+ b.none .return_pass /* no active lane: every byte has been processed */
+
+ /* dest data prefetch */
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4 /* z_src_hi aliases z_src: raw bytes are replaced by high nibbles */
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b /* indices are 0..15, i.e. within the 16 bytes loaded by ldp */
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos] /* dest3 load is issued between the dest2 lookups and their XORs */
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ /* increment one vector length */
+ incb x_pos /* x_pos += number of bytes in one SVE vector */
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S
new file mode 100644
index 0000000000..6204102f68
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S
@@ -0,0 +1,425 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_4vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_4vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0 /* byte count per buffer; the routine fails when it is below 16 */
+x_vec .req x1 /* number of sources; multiplied by 8 on entry (byte size of the x_src pointer array) */
+x_tbl .req x2 /* base address of the lookup tables */
+x_src .req x3 /* address of the array of source pointers */
+x_dest .req x4 /* address of an array of four destination pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = pass, 1 = fail; shares register 0 with x_len */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the x_src pointer array (source index * 8) */
+x_ptr .req x6 /* current source pointer / store cursor */
+x_pos .req x7 /* byte offset processed so far */
+x_tmp .req x8 /* used once, to test whether the 16-byte loop ended exactly at len */
+x_dest1 .req x9
+x_tbl1 .req x10 /* x_tblK: running pointer into the table pairs of output K */
+x_dest2 .req x11
+x_tbl2 .req x12
+x_dest3 .req x13
+x_tbl3 .req x14
+x_dest4 .req x_dest /* reuses x4, so it is loaded after dest1..dest3 have been read through x_dest */
+x_tbl4 .req x15
+
+/* vectors */
+v_mask0f .req v0 /* 0x0f in every byte: isolates the low nibble */
+q_mask0f .req q0
+v_tmp1_lo .req v1 /* 64-byte loop: low-nibble lookup result */
+v_tmp1_hi .req v2 /* 64-byte loop: high-nibble lookup result */
+v_tmp1 .req v3 /* 64-byte loop: low nibbles of the chunk being processed */
+q_tmp1 .req q3
+
+v_p1_0 .req v4 /* v_pK_J: accumulator for 16-byte chunk J of output K */
+v_p2_0 .req v5
+v_p3_0 .req v6
+v_p4_0 .req v7
+
+q_p1_0 .req q4
+q_p2_0 .req q5
+q_p3_0 .req q6
+q_p4_0 .req q7
+
+v_data_0 .req v8 /* v8..v15 are used only in the 64-byte loop, which saves/restores d8..d15 */
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_p1_3 .req v12
+v_p2_3 .req v13
+v_p3_3 .req v14
+v_p4_3 .req v15
+q_p1_3 .req q12
+q_p2_3 .req q13
+q_p3_3 .req q14
+q_p4_3 .req q15
+
+v_gft1_lo .req v16 /* gftK_lo is indexed by low nibbles, gftK_hi by high nibbles */
+v_gft1_hi .req v17
+v_gft2_lo .req v18
+v_gft2_hi .req v19
+v_gft3_lo .req v20
+v_gft3_hi .req v21
+v_gft4_lo .req v22
+v_gft4_hi .req v23
+q_gft1_lo .req q16
+q_gft1_hi .req q17
+q_gft2_lo .req q18
+q_gft2_hi .req q19
+q_gft3_lo .req q20
+q_gft3_hi .req q21
+q_gft4_lo .req q22
+q_gft4_hi .req q23
+
+v_p1_1 .req v24
+v_p1_2 .req v25
+v_p2_1 .req v26
+v_p2_2 .req v27
+v_p3_1 .req v28
+v_p3_2 .req v29
+v_p4_1 .req v30
+v_p4_2 .req v31
+
+q_p1_1 .req q24
+q_p1_2 .req q25
+q_p2_1 .req q26
+q_p2_2 .req q27
+q_p3_1 .req q28
+q_p3_2 .req q29
+q_p4_1 .req q30
+q_p4_2 .req q31
+
+v_data .req v_tmp1 /* 16-byte loop: raw source bytes (reuses v3) */
+q_data .req q_tmp1
+v_data_lo .req v_tmp1_lo /* 16-byte loop: low nibbles (reuses v1) */
+v_data_hi .req v_tmp1_hi /* 16-byte loop: high nibbles (reuses v2) */
+
+cdecl(gf_4vect_dot_prod_neon): /* for k = 1..4: destK[i] = XOR over sources j of gft(k,j)_lo[src_j[i] & 0x0f] ^ gft(k,j)_hi[src_j[i] >> 4]; returns 0, or 1 if len < 16 */
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* x_vec = vec * 8: byte size of the source pointer array */
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3] /* x_dest4 aliases x_dest, so this load has to be the last one through x_dest */
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #64 /* bias: x_pos <= x_len now means "a full 64-byte chunk remains" */
+
+.Lloop64:
+ movi v_p1_0.16b, #0 /* outputs are built from zero: dest is overwritten, not accumulated */
+ movi v_p1_1.16b, #0
+ movi v_p1_2.16b, #0
+ movi v_p1_3.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p2_1.16b, #0
+ movi v_p2_2.16b, #0
+ movi v_p2_3.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p3_1.16b, #0
+ movi v_p3_2.16b, #0
+ movi v_p3_3.16b, #0
+ movi v_p4_0.16b, #0
+ movi v_p4_1.16b, #0
+ movi v_p4_2.16b, #0
+ movi v_p4_3.16b, #0
+
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl1, x_vec, lsl #2 /* x_vec already holds vec * 8, so this adds vec * 32 */
+ add x_tbl3, x_tbl2, x_vec, lsl #2
+ add x_tbl4, x_tbl3, x_vec, lsl #2
+ mov x_vec_i, #0
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+ prfm pldl1keep, [x_tbl3]
+ prfm pldl1keep, [x_tbl4]
+
+.Lloop64_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr = next source buffer pointer */
+ add x_vec_i, x_vec_i, #8
+ add x_ptr, x_ptr, x_pos
+
+ ldr q_data_0, [x_ptr], #16
+ ldr q_data_1, [x_ptr], #16
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* each table pointer advances 32 bytes per source */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ ldr q_data_2, [x_ptr], #16
+ ldr q_data_3, [x_ptr], #16
+
+ prfm pldl1strm, [x_ptr]
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+ prfm pldl1keep, [x_tbl3]
+ prfm pldl1keep, [x_tbl4]
+
+ /* data_0 */
+ and v_tmp1.16b, v_data_0.16b, v_mask0f.16b /* v_tmp1 = low nibbles */
+ ushr v_data_0.16b, v_data_0.16b, #4 /* v_data_0 now holds the high nibbles */
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+ eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
+ eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
+ eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
+ eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b
+ eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b
+
+ /* data_1 */
+ and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
+ ushr v_data_1.16b, v_data_1.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+ eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+ eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
+ eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
+ eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
+ eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
+ eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
+ eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b
+ eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b
+
+ /* data_2 */
+ and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
+ ushr v_data_2.16b, v_data_2.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+ eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+ eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
+ eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
+ eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
+ eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
+ eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
+ eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b
+ eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b
+
+ /* data_3 */
+ and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
+ ushr v_data_3.16b, v_data_3.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+ eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+ eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
+ eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
+ eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
+ eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
+ eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
+ eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b
+ eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop64_vects /* more source buffers to fold into this 64-byte chunk */
+
+.Lloop64_vects_end:
+ add x_ptr, x_dest1, x_pos
+ stp q_p1_0, q_p1_1, [x_ptr], #32
+ stp q_p1_2, q_p1_3, [x_ptr]
+
+ add x_ptr, x_dest2, x_pos
+ stp q_p2_0, q_p2_1, [x_ptr], #32
+ stp q_p2_2, q_p2_3, [x_ptr]
+
+ add x_ptr, x_dest3, x_pos
+ stp q_p3_0, q_p3_1, [x_ptr], #32
+ stp q_p3_2, q_p3_3, [x_ptr]
+
+ add x_ptr, x_dest4, x_pos
+ stp q_p4_0, q_p4_1, [x_ptr], #32
+ stp q_p4_2, q_p4_3, [x_ptr]
+
+ add x_pos, x_pos, #64
+ cmp x_pos, x_len
+ ble .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #64 /* undo the -64 bias: x_len is the true length again */
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ sub x_len, x_len, #16 /* bias: x_len = len - 16, the offset of the last full 16-byte window */
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p1_0.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p4_0.16b, #0
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl1, x_vec, lsl #2
+ add x_tbl3, x_tbl2, x_vec, lsl #2
+ add x_tbl4, x_tbl3, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ ldr q_data, [x_ptr, x_pos]
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+ prfm pldl1keep, [x_tbl3]
+ prfm pldl1keep, [x_tbl4]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b /* lookups overwrite the table registers; they are reloaded per source */
+ tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+
+ eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
+ eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
+ eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
+ eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
+ eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
+
+ cmp x_vec_i, x_vec
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q_p1_0, [x_dest1, x_pos]
+ str q_p2_0, [x_dest2, x_pos]
+ str q_p3_0, [x_dest3, x_pos]
+ str q_p4_0, [x_dest4, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len /* x_len is len - 16 here, so 16 means x_pos == len */
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ mov x_pos, x_len /* redo the last 16 bytes; overlapping output is simply recomputed */
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S
new file mode 100644
index 0000000000..eb354279f8
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S
@@ -0,0 +1,208 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_4vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_4vect_dot_prod_sve, %function
+#endif
+/* void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2 /* base address of the lookup tables */
+x_src .req x3 /* address of the array of source pointers */
+x_dest .req x4 /* address of an array of four destination pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = pass, 1 = fail; shares register 0 with x_len */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the x_src pointer array (source index * 8) */
+x_ptr .req x6 /* current source buffer pointer */
+x_pos .req x7 /* byte offset of the current vector-length chunk */
+
+x_tbl1 .req x8 /* x_tblK: running pointer into the table pairs of output K */
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_dest1 .req x12
+x_dest2 .req x13
+x_dest3 .req x14
+x_dest4 .req x_dest /* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f .req z0 /* 0x0f in every byte: isolates the low nibble */
+
+z_src .req z1
+z_src_lo .req z2 /* low nibbles of the source bytes */
+z_src_hi .req z_src /* high nibbles overwrite z_src in place */
+
+z_dest1 .req z3
+
+z_gft1_lo .req z4 /* q_* names are the 128-bit views used by ldp to load the 16-byte tables */
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17 /* z8..z15 are not used by this file, so nothing is saved */
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+
+cdecl(gf_4vect_dot_prod_sve): /* for k = 1..4: destK[i] = XOR over sources j of gft(k,j)_lo[src_j[i] & 0x0f] ^ gft(k,j)_hi[src_j[i] >> 4]; returns 0, or 1 if len < 16 */
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* x_vec = vec * 8: byte size of the source pointer array */
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2] /* x_dest4 aliases x_dest, so this is the last access through x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0 = byte lanes with x_pos + lane < x_len */
+ b.none .return_pass /* no active lane: every byte has been processed */
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ /* x_ptr is loaded at the top of Loop 2, so src[] is never read past index vec - 1 */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. of the current source vector */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load src data, governed by p0, from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* advance to the next source vector */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+ /* the next src pointer is fetched at the loop top, only if another iteration runs */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+
+ /* increment one vector length */
+ incb x_pos /* x_pos += number of bytes in one SVE vector */
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S
new file mode 100644
index 0000000000..ebf82e7ffe
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S
@@ -0,0 +1,464 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_4vect_mad_neon)
+#ifndef __APPLE__
+.type gf_4vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0 /* number of source bytes to process */
+x_vec .req x1 /* scaled by 32 to form the stride between the four tables */
+x_vec_i .req x2 /* scaled by 32 to form the offset of the first table */
+x_tbl .req x3 /* base address of the lookup tables */
+x_src .req x4 /* source buffer; advanced as data is consumed */
+x_dest .req x5 /* array of four destination buffer pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = pass, 1 = fail (len < 16) */
+
+/* local variables */
+x_src_end .req x6 /* loop bound derived from x_src + x_len */
+x_dest1 .req x7
+x_dest2 .req x8
+x_dest3 .req x9
+x_dest4 .req x_dest /* reuses x5: the pointer array is not read again after dest[3] is loaded */
+x_tmp .req x10 /* tail: overlap between the final 16-byte window and processed data */
+x_tbl1 .req x11
+x_tbl2 .req x12
+x_tbl3 .req x13
+x_tbl4 .req x14
+x_const .req x15 /* address used to fetch the tail byte mask from const_tbl */
+
+/* vectors */
+v_mask0f .req v0 /* 0x0f in every byte lane */
+v_tmp_lo .req v1 /* low-nibble lookup result */
+v_tmp_hi .req v2 /* high-nibble lookup result */
+v_tmp .req v3 /* tail byte mask (same register as q_tmp) */
+q_tmp .req q3
+
+v_gft1_lo .req v4 /* table K: _lo is indexed by the low nibble, _hi by the high nibble */
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+v_gft3_lo .req v16
+v_gft3_hi .req v17
+v_gft4_lo .req v18
+v_gft4_hi .req v19
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+q_gft3_lo .req q16
+q_gft3_hi .req q17
+q_gft4_lo .req q18
+q_gft4_hi .req q19
+
+v_data_0 .req v8 /* v8-v15 are touched only by the 64-byte loop, which saves d8-d15 first */
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0 /* high nibbles overwrite the raw data in place */
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_d1_0 .req v20
+v_d1_1 .req v21
+v_d1_2 .req v22
+v_d1_3 .req v23
+v_d2_0 .req v24
+v_d2_1 .req v25
+v_d2_2 .req v26
+v_d2_3 .req v27
+v_d3_0 .req v28
+v_d3_1 .req v29
+v_d3_2 .req v30
+v_d3_3 .req v31
+q_d1_0 .req q20
+q_d1_1 .req q21
+q_d1_2 .req q22
+q_d1_3 .req q23
+q_d2_0 .req q24
+q_d2_1 .req q25
+q_d2_2 .req q26
+q_d2_3 .req q27
+q_d3_0 .req q28
+q_d3_1 .req q29
+q_d3_2 .req q30
+q_d3_3 .req q31
+
+v_d4_0 .req v_d1_0 /* dest4 shares dest1's registers: dest1 is stored before dest4 is loaded */
+v_d4_1 .req v_d1_1
+v_d4_2 .req v_d1_2
+v_d4_3 .req v_d1_3
+q_d4_0 .req q_d1_0
+q_d4_1 .req q_d1_1
+q_d4_2 .req q_d1_2
+q_d4_3 .req q_d1_3
+
+v_data .req v21 /* 16-byte paths only: v21-v23 are free there (only the _0 dest registers are used) */
+q_data .req q21
+v_data_lo .req v22
+v_data_hi .req v23
+
+/*
+ * gf_4vect_mad_neon
+ *
+ * For each of four destination buffers, XOR into it the result of a
+ * two-table nibble lookup of every source byte:
+ *     dest_k[i] ^= gftK_lo[src[i] & 0x0f] ^ gftK_hi[src[i] >> 4]
+ * (the tbl/eor pair that the sibling SVE sources label as gf(2^8)
+ * multiplication and addition).
+ *
+ * In:    x0 = len, x1 = vec, x2 = vec_i, x3 = table base, x4 = src,
+ *        x5 = pointer to four destination pointers
+ * Out:   w0 = 0 on success, 1 when len < 16 (nothing is written)
+ * Stack: 64 bytes for d8-d15, taken only when len >= 64
+ *
+ * Data is consumed in 64-byte blocks, then 16-byte blocks, then one
+ * masked 16-byte block that overlaps the end of the already-processed data.
+ */
+cdecl(gf_4vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f /* low-nibble mask in every byte lane */
+ lsl x_vec_i, x_vec_i, #5 /* vec_i * 32: byte offset of the first table */
+ lsl x_vec, x_vec, #5 /* vec * 32: byte stride from one destination's table to the next */
+ add x_tbl1, x_tbl, x_vec_i
+ add x_tbl2, x_tbl1, x_vec
+ add x_tbl3, x_tbl2, x_vec
+ add x_tbl4, x_tbl3, x_vec
+ add x_src_end, x_src, x_len /* one past the last source byte */
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3] /* x_dest4 aliases x_dest, so this load must come last */
+ ldr q_gft1_lo, [x_tbl1] /* each table: 16 bytes for low nibbles, then 16 bytes for high nibbles */
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+ ldr q_gft4_lo, [x_tbl4]
+ ldr q_gft4_hi, [x_tbl4, #16]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #64 /* bound becomes the last address where a full 64-byte block starts */
+
+.Lloop64:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ add x_src, x_src, #64
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+
+ ldr q_d2_0, [x_dest2, #16*0]
+ ldr q_d2_1, [x_dest2, #16*1]
+ ldr q_d2_2, [x_dest2, #16*2]
+ ldr q_d2_3, [x_dest2, #16*3]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b /* low nibbles must be taken before the in-place shift below */
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4 /* high nibbles; overwrites the raw data (aliased register) */
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ /* dest1 */
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
+
+ /* dest2 */
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
+ eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
+ eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
+ eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0] /* dest1 must be written back before its registers are reloaded as dest4 */
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ add x_dest1, x_dest1, #64
+
+ str q_d2_0, [x_dest2, #16*0]
+ str q_d2_1, [x_dest2, #16*1]
+ str q_d2_2, [x_dest2, #16*2]
+ str q_d2_3, [x_dest2, #16*3]
+ add x_dest2, x_dest2, #64
+
+ ldr q_d3_0, [x_dest3, #16*0]
+ ldr q_d3_1, [x_dest3, #16*1]
+ ldr q_d3_2, [x_dest3, #16*2]
+ ldr q_d3_3, [x_dest3, #16*3]
+
+ ldr q_d4_0, [x_dest4, #16*0] /* q_d4_N are the same registers as q_d1_N */
+ ldr q_d4_1, [x_dest4, #16*1]
+ ldr q_d4_2, [x_dest4, #16*2]
+ ldr q_d4_3, [x_dest4, #16*3]
+
+ /* dest3 */
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
+ eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
+ eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
+ eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
+ eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
+ eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
+ eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
+
+ /* dest4 */
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
+ eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
+ eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
+ eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
+ eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
+ eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
+ eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
+
+ str q_d3_0, [x_dest3, #16*0]
+ str q_d3_1, [x_dest3, #16*1]
+ str q_d3_2, [x_dest3, #16*2]
+ str q_d3_3, [x_dest3, #16*3]
+ add x_dest3, x_dest3, #64
+
+ str q_d4_0, [x_dest4, #16*0]
+ str q_d4_1, [x_dest4, #16*1]
+ str q_d4_2, [x_dest4, #16*2]
+ str q_d4_3, [x_dest4, #16*3]
+ add x_dest4, x_dest4, #64
+
+ cmp x_src, x_src_end
+ bls .Lloop64 /* unsigned pointer compare: repeat while a full 64-byte block remains */
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #64 /* undo the 64-byte bias: x_src_end is the true end again */
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16 /* bound becomes the last address where a full 16-byte block starts */
+ cmp x_src, x_src_end
+ bhi .lessthan16_init /* fewer than 16 bytes remain: skip the 16-byte loop */
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1] /* store dest1 before q_d4_0 (same register) is loaded */
+ str q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_d4_0, [x_dest4]
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ str q_d3_0, [x_dest3]
+ str q_d4_0, [x_dest4]
+
+ add x_src, x_src, #16
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_dest3, x_dest3, #16
+ add x_dest4, x_dest4, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end /* x_src_end is (end - 16) here, so x_tmp = 16 - bytes remaining */
+ cmp x_tmp, #16
+ beq .return_pass /* x_src reached the end: nothing remains */
+
+.lessthan16:
+ mov x_src, x_src_end /* rewind to the final 16 bytes of the source */
+ sub x_dest1, x_dest1, x_tmp /* rewind each destination by the same overlap */
+ sub x_dest2, x_dest2, x_tmp
+ sub x_dest3, x_dest3, x_tmp
+ sub x_dest4, x_dest4, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16] /* mask: x_tmp bytes of 0x00 (already processed), then 0xff for the new bytes */
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b /* zero the lanes already accumulated so the XOR leaves them unchanged */
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_d4_0, [x_dest4]
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ str q_d3_0, [x_dest3]
+ str q_d4_0, [x_dest4]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+/*
+ * Tail mask source for gf_4vect_mad_neon: 16 bytes of 0x00 followed by
+ * 16 bytes of 0xff. A 16-byte load from (const_tbl + 16 - k) yields k zero
+ * bytes followed by (16 - k) 0xff bytes.
+ * NOTE(review): ASM_DEF_RODATA comes from aarch64_label.h; presumably it
+ * selects the read-only data section - confirm against that header.
+ */
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S
new file mode 100644
index 0000000000..89ec89f5c6
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S
@@ -0,0 +1,194 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_4vect_mad_sve)
+#ifndef __APPLE__
+.type gf_4vect_mad_sve, %function
+#endif
+
+/* gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0 /* number of source bytes; upper bound for x_pos */
+x_vec .req x1 /* scaled by 32 to step from one destination's table to the next */
+x_vec_i .req x2 /* scaled by 32 to locate the first table */
+x_tbl .req x3 /* table base; advanced in place while the four tables are loaded */
+x_src .req x4 /* source buffer */
+x_dest .req x5 /* array of four destination buffer pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = pass, 1 = fail (len < 16) */
+
+/* local variables */
+x_pos .req x6 /* byte offset shared by the source and all four destinations */
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest4 .req x9
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0 /* 0x0f in every byte */
+
+z_src .req z1
+z_src_lo .req z2 /* low nibble of each source byte */
+z_src_hi .req z_src /* high nibble; overwrites the raw source in place */
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4 /* low-nibble lookup result */
+z_tmp_hi .req z5 /* high-nibble lookup result */
+
+z_gft1_lo .req z6 /* table 1; the q_ names below are the low 128 bits of the same registers */
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used, so z8..z15 are skipped */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+
+/*
+ * Entry point; prototype is given in the comment above the register aliases.
+ * For each of the four destinations, XORs into it the low-nibble and
+ * high-nibble table lookups of every source byte. Processes one SVE vector
+ * of bytes per iteration, with the final partial vector handled by the
+ * predicate p0. Returns 0 on success, 1 when len < 16. Uses no stack.
+ */
+cdecl(gf_4vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load table 1 with NEON instruction ldp (16 bytes lo, 16 bytes hi) */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+ /* load table 4 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+ ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0 = byte lanes for which x_pos + lane < x_len */
+ b.none .return_pass /* no active lane: all bytes have been processed */
+
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4 /* in place: z_src_hi is the same register as z_src */
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+ prfb pldl2strm, p0, [x_dest4, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication; indices are 0..15 */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos]
+ ld1b z_dest4.b, p0/z, [x_dest4, x_pos]
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+
+ /* dest4 */
+ tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_tmp_lo.d, z_dest4.d
+ eor z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S
new file mode 100644
index 0000000000..13166665d6
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S
@@ -0,0 +1,484 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_5vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_5vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0 /* number of bytes per buffer; biased by -64 / -16 inside the loops */
+x_vec .req x1 /* number of source buffers; scaled by 8 on entry */
+x_tbl .req x2 /* base address of the lookup tables */
+x_src .req x3 /* array of source buffer pointers */
+x_dest .req x4 /* array of five destination buffer pointers */
+
+/* returns */
+w_ret .req w0 /* 0 = pass, 1 = fail (len < 16) */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset of the current source in the x_src pointer array */
+x_ptr .req x6 /* scratch data pointer */
+x_pos .req x7 /* byte offset into every source and destination buffer */
+x_tmp .req x8 /* scratch: current table address */
+x_dest1 .req x9
+x_dest2 .req x10
+x_dest3 .req x11
+x_dest4 .req x12
+x_dest5 .req x13
+
+/* vectors */
+v_tmp1 .req v0
+q_tmp1 .req q0
+v_tmp2 .req v1
+q_tmp2 .req q1
+
+v_mask0f .req v_tmp1 /* shares v0 with v_tmp_lo, so the mask is rebuilt for every source */
+q_mask0f .req q_tmp1
+v_tmp_lo .req v_tmp1
+v_tmp_hi .req v_tmp2
+
+v_gft_lo .req v2 /* 64-byte loop: the one table pair currently in use */
+v_gft_hi .req v3
+q_gft_lo .req q2
+q_gft_hi .req q3
+
+v_p1_0 .req v4 /* v_pK_N: accumulator for destination K, 16-byte lane group N */
+v_p2_0 .req v5
+v_p3_0 .req v6
+v_p4_0 .req v7
+
+q_p1_0 .req q4
+q_p2_0 .req q5
+q_p3_0 .req q6
+q_p4_0 .req q7
+
+v_data_0 .req v8 /* v8-v15 are touched only by the 64-byte loop, which saves d8-d15 first */
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0 /* high nibbles overwrite the raw data in place */
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_p5_0 .req v16
+v_p1_1 .req v17
+v_p2_1 .req v18
+v_p3_1 .req v19
+v_p4_1 .req v20
+v_p5_1 .req v21
+v_p1_2 .req v22
+v_p2_2 .req v23
+v_p3_2 .req v24
+v_p4_2 .req v25
+v_p5_2 .req v26
+v_p1_3 .req v27
+v_p2_3 .req v28
+v_p3_3 .req v29
+v_p4_3 .req v30
+v_p5_3 .req v31
+
+q_p5_0 .req q16
+q_p1_1 .req q17
+q_p2_1 .req q18
+q_p3_1 .req q19
+q_p4_1 .req q20
+q_p5_1 .req q21
+q_p1_2 .req q22
+q_p2_2 .req q23
+q_p3_2 .req q24
+q_p4_2 .req q25
+q_p5_2 .req q26
+q_p1_3 .req q27
+q_p2_3 .req q28
+q_p3_3 .req q29
+q_p4_3 .req q30
+q_p5_3 .req q31
+
+v_data .req v_p1_1 /* 16-byte loop only: reuses accumulators that just the 64-byte loop needs */
+q_data .req q_p1_1
+v_data_lo .req v_p2_1
+v_data_hi .req v_p3_1
+
+v_gft1_lo .req v_p4_1 /* 16-byte loop only: all five table pairs are held at once */
+v_gft1_hi .req v_p5_1
+v_gft2_lo .req v_p1_2
+v_gft2_hi .req v_p2_2
+v_gft3_lo .req v_p3_2
+v_gft3_hi .req v_p4_2
+v_gft4_lo .req v_p5_2
+v_gft4_hi .req v_p1_3
+v_gft5_lo .req v_p2_3
+v_gft5_hi .req v_p3_3
+q_gft1_lo .req q_p4_1
+q_gft1_hi .req q_p5_1
+q_gft2_lo .req q_p1_2
+q_gft2_hi .req q_p2_2
+q_gft3_lo .req q_p3_2
+q_gft3_hi .req q_p4_2
+q_gft4_lo .req q_p5_2
+q_gft4_hi .req q_p1_3
+q_gft5_lo .req q_p2_3
+q_gft5_hi .req q_p3_3
+
+
+/*
+ * gf_5vect_dot_prod_neon
+ *
+ * Computes five output buffers from `vec` source buffers. For destination k
+ * and byte position i the result is the XOR, over every source j, of
+ *     tbl_lo[src_j[i] & 0x0f] ^ tbl_hi[src_j[i] >> 4]
+ * where the 32-byte table (16 lo + 16 hi) for the pair (k, j) is read from
+ *     x_tbl + (k * vec + j) * 32.
+ * Destinations are overwritten, not accumulated into.
+ *
+ * In:    x0 = len, x1 = vec, x2 = table base, x3 = array of source pointers,
+ *        x4 = array of five destination pointers
+ * Out:   w0 = 0 on success, 1 when len < 16 (nothing is written)
+ * Stack: 64 bytes for d8-d15, taken only when len >= 64
+ *
+ * Data is processed in 64-byte blocks, then 16-byte blocks; a trailing
+ * remainder is handled by recomputing the final 16 bytes of the buffers.
+ */
+cdecl(gf_5vect_dot_prod_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* vec * 8: size in bytes of the source pointer array */
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3]
+ ldr x_dest5, [x_dest, #8*4]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #64 /* bias: x_len is now the last offset where a full 64-byte block starts */
+
+.Lloop64:
+ movi v_p1_0.16b, #0 /* clear all 20 accumulators (5 destinations x 64 bytes) */
+ movi v_p1_1.16b, #0
+ movi v_p1_2.16b, #0
+ movi v_p1_3.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p2_1.16b, #0
+ movi v_p2_2.16b, #0
+ movi v_p2_3.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p3_1.16b, #0
+ movi v_p3_2.16b, #0
+ movi v_p3_3.16b, #0
+ movi v_p4_0.16b, #0
+ movi v_p4_1.16b, #0
+ movi v_p4_2.16b, #0
+ movi v_p4_3.16b, #0
+ movi v_p5_0.16b, #0
+ movi v_p5_1.16b, #0
+ movi v_p5_2.16b, #0
+ movi v_p5_3.16b, #0
+ mov x_vec_i, #0
+
+.Lloop64_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr = base address of the current source buffer */
+ add x_ptr, x_ptr, x_pos
+
+ ldr q_data_0, [x_ptr], #16
+ ldr q_data_1, [x_ptr], #16
+ ldr q_data_2, [x_ptr], #16
+ ldr q_data_3, [x_ptr], #16
+ prfm pldl2keep, [x_ptr] /* prefetch the bytes following this 64-byte block */
+
+ movi v_mask0f.16b, #0x0f /* rebuilt every pass: v_tmp_lo (same register) clobbers it below */
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ /* v_p1_x */
+ add x_tmp, x_tbl, x_vec_i, lsl #2 /* x_vec_i is j * 8, so this is x_tbl + j * 32 */
+ add x_vec_i, x_vec_i, #8 /* advance to the next source pointer */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+ add x_tmp, x_tmp, x_vec, lsl #2 /* + vec * 32: same source, next destination's table */
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
+ eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
+ eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
+ eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b
+
+ /* v_p2_x */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+ add x_tmp, x_tmp, x_vec, lsl #2
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
+ eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
+ eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
+ eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b
+
+ /* v_p3_x */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+ add x_tmp, x_tmp, x_vec, lsl #2
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
+ eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
+ eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
+ eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b
+
+ /* v_p4_x */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+ add x_tmp, x_tmp, x_vec, lsl #2
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
+ eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
+ eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
+ eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
+ eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b
+
+ /* v_p5_x */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
+ eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
+ eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
+ eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
+ eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop64_vects /* repeat until every source has been folded in */
+
+.Lloop64_vects_end:
+ add x_ptr, x_dest1, x_pos
+ stp q_p1_0, q_p1_1, [x_ptr], #32
+ stp q_p1_2, q_p1_3, [x_ptr]
+
+ add x_ptr, x_dest2, x_pos
+ stp q_p2_0, q_p2_1, [x_ptr], #32
+ stp q_p2_2, q_p2_3, [x_ptr]
+
+ add x_ptr, x_dest3, x_pos
+ stp q_p3_0, q_p3_1, [x_ptr], #32
+ stp q_p3_2, q_p3_3, [x_ptr]
+
+ add x_ptr, x_dest4, x_pos
+ stp q_p4_0, q_p4_1, [x_ptr], #32
+ stp q_p4_2, q_p4_3, [x_ptr]
+
+ add x_ptr, x_dest5, x_pos
+ stp q_p5_0, q_p5_1, [x_ptr], #32
+ stp q_p5_2, q_p5_3, [x_ptr]
+
+ add x_pos, x_pos, #64
+ cmp x_pos, x_len
+ ble .Lloop64 /* x_len is biased by -64: loop while a full block remains */
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #64 /* undo the bias: x_len is the true length again */
+ cmp x_pos, x_len
+ beq .return_pass /* len was a multiple of 64: done */
+
+.Lloop16_init:
+ sub x_len, x_len, #16 /* bias: x_len is now the last offset where a full 16-byte block starts */
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p1_0.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p4_0.16b, #0
+ movi v_p5_0.16b, #0
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ ldr q_data, [x_ptr, x_pos]
+
+ movi v_mask0f.16b, #0x0f
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ add x_tmp, x_tbl, x_vec_i, lsl #2 /* x_tbl + j * 32, as in the 64-byte loop */
+ add x_vec_i, x_vec_i, #8
+ ldp q_gft1_lo, q_gft1_hi, [x_tmp]
+ add x_tmp, x_tmp, x_vec, lsl #2
+ ldp q_gft2_lo, q_gft2_hi, [x_tmp]
+ add x_tmp, x_tmp, x_vec, lsl #2
+ ldp q_gft3_lo, q_gft3_hi, [x_tmp]
+ add x_tmp, x_tmp, x_vec, lsl #2
+ ldp q_gft4_lo, q_gft4_hi, [x_tmp]
+ add x_tmp, x_tmp, x_vec, lsl #2
+ ldp q_gft5_lo, q_gft5_hi, [x_tmp]
+
+ tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b /* lookup results replace the tables in place */
+ tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ tbl v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+
+ eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
+ eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
+ eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
+ eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
+ eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
+ eor v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b
+ eor v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b
+
+ cmp x_vec_i, x_vec
+ bne .Lloop16_vects /* NOTE(review): exits on equality only (the 64-byte loop uses blt) */
+
+.Lloop16_vects_end:
+ str q_p1_0, [x_dest1, x_pos]
+ str q_p2_0, [x_dest2, x_pos]
+ str q_p3_0, [x_dest3, x_pos]
+ str q_p4_0, [x_dest4, x_pos]
+ str q_p5_0, [x_dest5, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len /* x_len is (len - 16) here, so 16 means x_pos reached len */
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ mov x_pos, x_len /* redo the final 16 bytes; overlapping bytes are recomputed from scratch */
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S
new file mode 100644
index 0000000000..bb7cd0184e
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S
@@ -0,0 +1,237 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_5vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_5vect_dot_prod_sve, %function
+#endif
+/* void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2 /* gftbls: base of the 32-byte lookup table pairs */
+x_src .req x3 /* src: array of source vector pointers */
+x_dest .req x4 /* dest: array of five destination pointers */
+
+/* returns */
+w_ret .req w0 /* shares register 0 with x_len */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the src pointer array */
+x_ptr .req x6 /* base address of the current source vector */
+x_pos .req x7 /* byte position within the vectors */
+
+/* per-destination table cursors, post-incremented by 32 per source vector */
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_tbl5 .req x12
+/* destination base addresses loaded from the dest array */
+x_dest1 .req x13
+x_dest2 .req x14
+x_dest4 .req x15
+x_dest5 .req x_dest /* reused: x4 is overwritten once dest[4] has been loaded */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+x_dest3 .req x19 /* callee-saved: the function saves and restores x19 */
+
+/* vectors */
+z_mask0f .req z0 /* 0x0f in every byte, selects the low nibble */
+
+z_src .req z1
+z_src_lo .req z2 /* low nibbles of the source bytes */
+z_src_hi .req z_src /* high nibbles, computed in place over z_src */
+
+z_dest1 .req z3 /* accumulator for destination 1 */
+
+/* q_* names are the low 128 bits of the matching z_* register */
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+/* (none of z8..z15 is used below, so nothing has to be saved) */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+/* accumulators for destinations 2..5 */
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+
+/*
+ * gf_5vect_dot_prod_sve - five GF(2^8) dot products in a single pass.
+ *
+ * For every byte position pos in [0, len) and each output j = 1..5:
+ *   dest_j[pos] = XOR over the source vectors i of
+ *                 lo_tbl(j,i)[src_i[pos] & 0x0f] ^ hi_tbl(j,i)[src_i[pos] >> 4]
+ * The 32-byte (lo, hi) table pair for output j and source i is read from
+ * gftbls + ((j - 1) * vlen + i) * 32.
+ *
+ * In:    x0 = len, x1 = vlen, x2 = gftbls, x3 = src, x4 = dest
+ * Out:   w0 = 0 on success, 1 if len < 16 (nothing is written)
+ * Stack: 16 bytes, holding the caller's x19 while it serves as x_dest3
+ */
+cdecl(gf_5vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ /* save x19, the only callee-saved register used here */
+ sub sp, sp, #16 /* keeps sp 16-byte aligned */
+ str x19, [sp]
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* x_vec = vlen * 8, byte size of the src pointer array */
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+ ldr x_dest5, [x_dest, #8*4] /* must be last: x_dest5 aliases x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ /* p0 = lanes with x_pos + lane < x_len; no active lane means all done */
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+ mov z_dest5.b, #0 /* clear z_dest5 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * vlen * 32); x_vec already holds vlen * 8 */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /*
+ * x_ptr: src base addr. of the current vector. It is loaded here, at the
+ * point of use, instead of being pre-loaded for the next iteration, so
+ * that the last iteration never reads src[vlen] (one element past the
+ * end of the caller's pointer array).
+ */
+ ldr x_ptr, [x_src, x_vec_i]
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+
+ /* gf_tbl addr: (x_tbl + dest_idx * vlen * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch the tables of the next source vector */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* advance to the next source vector */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+ prfb pldl2keep, p0, [x_tbl5]
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+ /* dest 5 */
+ tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_gft5_lo.d, z_dest5.d
+ eor z_dest5.d, z_dest5.d, z_gft5_hi.d
+
+ /* x_vec_i and x_vec are both byte offsets into the src pointer array */
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ /* restore x19 and release the stack slot */
+ ldr x19, [sp]
+ add sp, sp, #16
+
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ /* taken before the stack is touched, so there is nothing to restore */
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S
new file mode 100644
index 0000000000..473e4c5774
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S
@@ -0,0 +1,544 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_5vect_mad_neon)
+#ifndef __APPLE__
+.type gf_5vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0 /* number of bytes to process */
+x_vec .req x1 /* scaled by 32 on entry and used as the stride between per-destination tables */
+x_vec_i .req x2 /* scaled by 32 on entry and used as the offset of the first table */
+x_tbl .req x3 /* base of the 32-byte lookup table pairs */
+x_src .req x4 /* source cursor */
+x_dest .req x5 /* array of five destination pointers */
+
+/* returns */
+w_ret .req w0 /* shares register 0 with x_len */
+
+/* local variables */
+x_src_end .req x6 /* loop bound derived from x_src + x_len */
+x_dest1 .req x7
+x_dest2 .req x8
+x_dest3 .req x9
+x_dest4 .req x10
+x_dest5 .req x_dest /* x5 is overwritten once dest[4] has been loaded */
+x_tmp .req x11
+x_tbl1 .req x12
+x_tbl2 .req x13
+x_tbl3 .req x14
+x_tbl4 .req x15
+x_tbl5 .req x16
+x_const .req x17 /* address into const_tbl for the tail mask */
+
+/* vectors */
+v_mask0f .req v0 /* 0x0f in every byte, selects the low nibble */
+v_tmp_lo .req v1
+v_tmp_hi .req v2
+v_tmp .req v3 /* tail byte mask */
+q_tmp .req q3
+
+v_gft1_lo .req v4
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+v_gft3_lo .req v16
+v_gft3_hi .req v17
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+q_gft3_lo .req q16
+q_gft3_hi .req q17
+
+v_gft4_lo .req v18
+v_gft4_hi .req v19
+q_gft4_lo .req q18
+q_gft4_hi .req q19
+/* table 5 shares v6/v7 with table 2, so both are reloaded before each use */
+v_gft5_lo .req v_gft2_lo
+v_gft5_hi .req v_gft2_hi
+q_gft5_lo .req q_gft2_lo
+q_gft5_hi .req q_gft2_hi
+
+/* v8..v15 are used by the 64-byte loop only; it saves and restores d8..d15 */
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+/* high nibbles are computed in place over the raw data registers */
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_d1_0 .req v20
+v_d1_1 .req v21
+v_d1_2 .req v22
+v_d1_3 .req v23
+v_d2_0 .req v24
+v_d2_1 .req v25
+v_d2_2 .req v26
+v_d2_3 .req v27
+v_d3_0 .req v28
+v_d3_1 .req v29
+v_d3_2 .req v30
+v_d3_3 .req v31
+q_d1_0 .req q20
+q_d1_1 .req q21
+q_d1_2 .req q22
+q_d1_3 .req q23
+q_d2_0 .req q24
+q_d2_1 .req q25
+q_d2_2 .req q26
+q_d2_3 .req q27
+q_d3_0 .req q28
+q_d3_1 .req q29
+q_d3_2 .req q30
+q_d3_3 .req q31
+
+/* dest4/dest5 reuse the dest1/dest2 registers; they are loaded only after dest1/dest2 are stored */
+v_d4_0 .req v_d1_0
+v_d4_1 .req v_d1_1
+v_d4_2 .req v_d1_2
+v_d4_3 .req v_d1_3
+q_d4_0 .req q_d1_0
+q_d4_1 .req q_d1_1
+q_d4_2 .req q_d1_2
+q_d4_3 .req q_d1_3
+v_d5_0 .req v_d2_0
+v_d5_1 .req v_d2_1
+v_d5_2 .req v_d2_2
+v_d5_3 .req v_d2_3
+q_d5_0 .req q_d2_0
+q_d5_1 .req q_d2_1
+q_d5_2 .req q_d2_2
+q_d5_3 .req q_d2_3
+
+/* 16-byte paths: overlap v_d1_1..v_d1_3, which those paths do not use */
+v_data .req v21
+q_data .req q21
+v_data_lo .req v22
+v_data_hi .req v23
+
+/*
+ * gf_5vect_mad_neon - multiply-and-add one source buffer into five
+ * destination buffers: for each destination j = 1..5 and each byte,
+ *   dest_j[pos] ^= lo_tbl_j[src[pos] & 0x0f] ^ hi_tbl_j[src[pos] >> 4]
+ * where the 32-byte (lo, hi) table pair of destination j is read from
+ * x_tbl + x_vec_i * 32 + (j - 1) * x_vec * 32.
+ *
+ * In:  x0 = length in bytes, x1 = x_vec, x2 = x_vec_i, x3 = table base,
+ *      x4 = src, x5 = array of five destination pointers
+ * Out: w0 = 0 on success, 1 if the length is below 16 (nothing is written)
+ *
+ * Data is handled in 64-byte blocks, then 16-byte blocks, then one final
+ * masked 16-byte block that overlaps bytes already processed.
+ */
+cdecl(gf_5vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5 /* x_vec_i *= 32: offset of the first table pair */
+ lsl x_vec, x_vec, #5 /* x_vec *= 32: stride between per-destination tables */
+ add x_tbl1, x_tbl, x_vec_i
+ add x_tbl2, x_tbl1, x_vec
+ add x_tbl3, x_tbl2, x_vec
+ add x_tbl4, x_tbl3, x_vec
+ add x_tbl5, x_tbl4, x_vec
+ add x_src_end, x_src, x_len
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3]
+ ldr x_dest5, [x_dest, #8*4] /* must be last: x_dest5 aliases x_dest */
+ /* tables 1, 3 and 4 stay resident; tables 2 and 5 share registers and are loaded where used */
+ ldr q_gft1_lo, [x_tbl1]
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+ ldr q_gft4_lo, [x_tbl4]
+ ldr q_gft4_hi, [x_tbl4, #16]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ /* x_src_end = last address at which a full 64-byte block still fits */
+ sub x_src_end, x_src_end, #64
+
+.Lloop64:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ add x_src, x_src, #64
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+
+ ldr q_d2_0, [x_dest2, #16*0]
+ ldr q_d2_1, [x_dest2, #16*1]
+ ldr q_d2_2, [x_dest2, #16*2]
+ ldr q_d2_3, [x_dest2, #16*3]
+
+ ldr q_d3_0, [x_dest3, #16*0]
+ ldr q_d3_1, [x_dest3, #16*1]
+ ldr q_d3_2, [x_dest3, #16*2]
+ ldr q_d3_3, [x_dest3, #16*3]
+
+ /* reload table 2: its registers held table 5 in the previous iteration */
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+
+ /* split each source byte into its low and high nibble */
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ /* dest1 */
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
+
+ /* dest2 */
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
+ eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
+ eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
+ eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
+
+ /* dest3 */
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
+ eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
+ eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
+ eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
+ eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
+ eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
+ eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ add x_dest1, x_dest1, #64
+
+ str q_d2_0, [x_dest2, #16*0]
+ str q_d2_1, [x_dest2, #16*1]
+ str q_d2_2, [x_dest2, #16*2]
+ str q_d2_3, [x_dest2, #16*3]
+ add x_dest2, x_dest2, #64
+
+ str q_d3_0, [x_dest3, #16*0]
+ str q_d3_1, [x_dest3, #16*1]
+ str q_d3_2, [x_dest3, #16*2]
+ str q_d3_3, [x_dest3, #16*3]
+ add x_dest3, x_dest3, #64
+
+ /* dest1/dest2 are stored, so their registers are free to hold dest4/dest5 */
+ ldr q_d4_0, [x_dest4, #16*0]
+ ldr q_d4_1, [x_dest4, #16*1]
+ ldr q_d4_2, [x_dest4, #16*2]
+ ldr q_d4_3, [x_dest4, #16*3]
+
+ ldr q_d5_0, [x_dest5, #16*0]
+ ldr q_d5_1, [x_dest5, #16*1]
+ ldr q_d5_2, [x_dest5, #16*2]
+ ldr q_d5_3, [x_dest5, #16*3]
+
+ /* table 5 overwrites table 2 (same registers) */
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+
+ /* dest4 */
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
+ eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
+ eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
+ eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
+ eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
+ eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
+ eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
+
+ /* dest5 */
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
+ eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
+ eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
+ eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
+ eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
+ eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
+ eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
+ eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4, #16*0]
+ str q_d4_1, [x_dest4, #16*1]
+ str q_d4_2, [x_dest4, #16*2]
+ str q_d4_3, [x_dest4, #16*3]
+ add x_dest4, x_dest4, #64
+
+ str q_d5_0, [x_dest5, #16*0]
+ str q_d5_1, [x_dest5, #16*1]
+ str q_d5_2, [x_dest5, #16*2]
+ str q_d5_3, [x_dest5, #16*3]
+ add x_dest5, x_dest5, #64
+
+ /* unsigned compare: continue while another full 64-byte block fits */
+ cmp x_src, x_src_end
+ bls .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #64 /* back to the true end of the source */
+
+.Lloop16_init:
+ /* x_src_end = last address at which a full 16-byte block still fits */
+ sub x_src_end, x_src_end, #16
+ cmp x_src, x_src_end
+ bhi .lessthan16_init
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ /* reload table 2: its registers are shared with table 5 */
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ /* dest4/dest5 reuse the registers of the dest1/dest2 values just stored */
+ ldr q_d4_0, [x_dest4]
+ ldr q_d5_0, [x_dest5]
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+ eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4]
+ str q_d5_0, [x_dest5]
+
+ add x_src, x_src, #16
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_dest3, x_dest3, #16
+ add x_dest4, x_dest4, #16
+ add x_dest5, x_dest5, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ /* x_src_end is (end - 16) here, so x_tmp = 16 - bytes remaining; 16 means nothing is left */
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16:
+ /* step back x_tmp bytes so the final 16-byte block ends exactly at the end of the data */
+ mov x_src, x_src_end
+ sub x_dest1, x_dest1, x_tmp
+ sub x_dest2, x_dest2, x_tmp
+ sub x_dest3, x_dest3, x_tmp
+ sub x_dest4, x_dest4, x_tmp
+ sub x_dest5, x_dest5, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ /* v_tmp = mask: first x_tmp bytes 0x00 (already processed), remaining bytes 0xff */
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16]
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ /* each product is masked before the XOR so overlapped bytes are not updated twice */
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ ldr q_d4_0, [x_dest4]
+ ldr q_d5_0, [x_dest5]
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4]
+ str q_d5_0, [x_dest5]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+/*
+ * Tail mask source: 16 bytes of 0x00 followed by 16 bytes of 0xff.
+ * A 16-byte load that starts k bytes before the 0xff half gives a mask
+ * whose first k bytes are 0x00 and whose remaining 16 - k bytes are 0xff;
+ * the .lessthan16 path loads it from const_tbl + 16 - x_tmp.
+ */
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S
new file mode 100644
index 0000000000..ab374d365a
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S
@@ -0,0 +1,218 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_5vect_mad_sve)
+#ifndef __APPLE__
+.type gf_5vect_mad_sve, %function
+#endif
+
+/* gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0 /* len: number of bytes to process */
+x_vec .req x1 /* vec: used (times 32) as the stride between per-destination tables */
+x_vec_i .req x2 /* vec_i: used (times 32) as the offset of the first table */
+x_tbl .req x3 /* gftbls: advanced in place while the five table pairs are loaded */
+x_src .req x4 /* src: source buffer */
+x_dest .req x5 /* dest: array of five destination pointers */
+
+/* returns */
+w_ret .req w0 /* shares register 0 with x_len */
+
+/* local variables */
+x_pos .req x6 /* byte position within the buffers */
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest4 .req x9
+x_dest5 .req x10
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0 /* 0x0f in every byte, selects the low nibble */
+
+z_src .req z1
+z_src_lo .req z2 /* low nibbles of the source bytes */
+z_src_hi .req z_src /* high nibbles, computed in place over z_src */
+
+z_dest1 .req z3
+
+/* table lookup results, reused for every destination */
+z_tmp_lo .req z4
+z_tmp_hi .req z5
+
+/* q_* names are the low 128 bits of the matching z_* register */
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+/* (none of z8..z15 is used below, so nothing has to be saved) */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+
+/*
+ * gf_5vect_mad_sve - multiply-and-add one source buffer into five
+ * destination buffers: for each destination j = 1..5 and each byte,
+ *   dest_j[pos] ^= lo_tbl_j[src[pos] & 0x0f] ^ hi_tbl_j[src[pos] >> 4]
+ * where the 32-byte (lo, hi) table pair of destination j is read from
+ * gftbls + vec_i * 32 + (j - 1) * vec * 32.
+ *
+ * In:  x0 = len, x1 = vec, x2 = vec_i, x3 = gftbls, x4 = src, x5 = dest
+ * Out: w0 = 0 on success, 1 if len < 16 (nothing is written)
+ *
+ * All five table pairs are loaded once up front; no stack is used.
+ */
+cdecl(gf_5vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load with NEON instruction ldp */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+ /* load table 4 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl]
+ /* load table 5 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+ ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */
+ ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ /* p0 = lanes with x_pos + lane < x_len; no active lane means all done */
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ /* prefetch the destination data that is loaded further down */
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+ prfb pldl2strm, p0, [x_dest4, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos]
+ ld1b z_dest4.b, p0/z, [x_dest4, x_pos]
+ prfb pldl2strm, p0, [x_dest5, x_pos]
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ ld1b z_dest5.b, p0/z, [x_dest5, x_pos]
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* dest4 */
+ tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_tmp_lo.d, z_dest4.d
+ eor z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+
+ /* dest5 */
+ tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_tmp_lo.d, z_dest5.d
+ eor z_dest5.d, z_tmp_hi.d, z_dest5.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S
new file mode 100644
index 0000000000..acc98953b3
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S
@@ -0,0 +1,258 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_6vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_6vect_dot_prod_sve, %function
+#endif
+/* void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_tbl5 .req x12
+x_tbl6 .req x13
+x_dest1 .req x14
+x_dest2 .req x15
+x_dest6 .req x_dest /* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+x_dest3 .req x19
+x_dest4 .req x20
+x_dest5 .req x21
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_gft6_lo .req z25
+z_gft6_hi .req z26
+q_gft6_lo .req q25
+q_gft6_hi .req q26
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+z_dest6 .req z31
+
+/*
+ * gf_6vect_dot_prod_sve(len, vlen, gftbls, src, dest)
+ *
+ * For each byte position, folds every source vector into six outputs:
+ * dest_j ^= tbl_j_lo[src & 0x0f] ^ tbl_j_hi[src >> 4], starting from zero.
+ * Each (dest, source) pair consumes 32 table bytes; the tables of one dest
+ * are contiguous and the six groups follow each other in gftbls.
+ *
+ * In:  x0 = len, x1 = number of source vectors, x2 = gftbls,
+ *      x3 = src pointer array, x4 = dest pointer array (six entries read)
+ * Out: w0 = 0 on success, 1 if len < 16 (nothing is written)
+ * x19..x21 are spilled to the stack around the main loop.
+ */
+cdecl(gf_6vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ /* save callee-saved x19..x21 (they back x_dest3..x_dest5) */
+ sub sp, sp, #32 /* 24 bytes used; 32 keeps sp 16-byte aligned */
+ stp x19, x20, [sp]
+ str x21, [sp, #16]
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* x_vec = size in bytes of the src pointer array */
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+ ldp x_dest5, x_dest6, [x_dest, #8*4] /* x_dest6 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+ mov z_dest5.b, #0 /* clear z_dest5 */
+ mov z_dest6.b, #0 /* clear z_dest6 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ /* x_vec is already scaled by 8, so LSL #2 yields vlen * 32 */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
+ add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* x_ptr: base address of the current source vector. Loading it here,
+    rather than one iteration ahead, keeps the last iteration from
+    reading the slot one past the end of the src pointer array. */
+ ldr x_ptr, [x_src, x_vec_i]
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi (z_src_hi aliases z_src: shifted in place) */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* advance to the next source vector */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+ ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+ prfb pldl2keep, p0, [x_tbl5]
+ prfb pldl2keep, p0, [x_tbl6]
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+ /* dest 5 */
+ tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_gft5_lo.d, z_dest5.d
+ eor z_dest5.d, z_dest5.d, z_gft5_hi.d
+
+ /* dest 6 */
+ tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+ tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+ eor z_dest6.d, z_gft6_lo.d, z_dest6.d
+ eor z_dest6.d, z_dest6.d, z_gft6_hi.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ st1b z_dest6.b, p0, [x_dest6, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ /* restore x19..x21 */
+ ldr x21, [sp, #16]
+ ldp x19, x20, [sp]
+ add sp, sp, #32
+
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ /* taken before anything is pushed, so there is nothing to restore */
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S
new file mode 100644
index 0000000000..3b1b1b4b21
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S
@@ -0,0 +1,618 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+.global cdecl(gf_6vect_mad_neon)
+#ifndef __APPLE__
+.type gf_6vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_src_end .req x6
+x_dest1 .req x7
+x_dest2 .req x8
+x_dest3 .req x9
+x_dest4 .req x10
+x_dest5 .req x11
+x_dest6 .req x_dest /* reused: x_dest is dead once the sixth pointer is loaded */
+x_tmp .req x12
+x_tbl1 .req x13
+x_tbl2 .req x14
+x_tbl3 .req x15
+/* x16/x17 are IP0/IP1; this routine makes no calls while they are live */
+x_tbl4 .req x16
+x_tbl5 .req x17
+x_tbl6 .req x_tbl /* reused: x_tbl is dead once x_tbl6 is derived */
+/* x18 is the platform register (reserved on Apple and Windows targets), so
+   the mask-table pointer borrows x_vec instead: x_vec is only read while
+   the table addresses are derived at entry, before x_const is ever set. */
+x_const .req x_vec
+
+/* vectors */
+v_mask0f .req v0
+v_tmp_lo .req v1
+v_tmp_hi .req v2
+v_tmp .req v3
+q_tmp .req q3
+
+v_gft1_lo .req v4
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+v_gft3_lo .req v16
+v_gft3_hi .req v17
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+q_gft3_lo .req q16
+q_gft3_hi .req q17
+
+v_gft4_lo .req v18
+v_gft4_hi .req v19
+q_gft4_lo .req q18
+q_gft4_hi .req q19
+v_gft5_lo .req v_gft2_lo
+v_gft5_hi .req v_gft2_hi
+q_gft5_lo .req q_gft2_lo
+q_gft5_hi .req q_gft2_hi
+v_gft6_lo .req v_gft3_lo
+v_gft6_hi .req v_gft3_hi
+q_gft6_lo .req q_gft3_lo
+q_gft6_hi .req q_gft3_hi
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_d1_0 .req v20
+v_d1_1 .req v21
+v_d1_2 .req v22
+v_d1_3 .req v23
+v_d2_0 .req v24
+v_d2_1 .req v25
+v_d2_2 .req v26
+v_d2_3 .req v27
+v_d3_0 .req v28
+v_d3_1 .req v29
+v_d3_2 .req v30
+v_d3_3 .req v31
+q_d1_0 .req q20
+q_d1_1 .req q21
+q_d1_2 .req q22
+q_d1_3 .req q23
+q_d2_0 .req q24
+q_d2_1 .req q25
+q_d2_2 .req q26
+q_d2_3 .req q27
+q_d3_0 .req q28
+q_d3_1 .req q29
+q_d3_2 .req q30
+q_d3_3 .req q31
+
+v_d4_0 .req v_d1_0
+v_d4_1 .req v_d1_1
+v_d4_2 .req v_d1_2
+v_d4_3 .req v_d1_3
+q_d4_0 .req q_d1_0
+q_d4_1 .req q_d1_1
+q_d4_2 .req q_d1_2
+q_d4_3 .req q_d1_3
+v_d5_0 .req v_d2_0
+v_d5_1 .req v_d2_1
+v_d5_2 .req v_d2_2
+v_d5_3 .req v_d2_3
+q_d5_0 .req q_d2_0
+q_d5_1 .req q_d2_1
+q_d5_2 .req q_d2_2
+q_d5_3 .req q_d2_3
+v_d6_0 .req v_d3_0
+v_d6_1 .req v_d3_1
+v_d6_2 .req v_d3_2
+v_d6_3 .req v_d3_3
+q_d6_0 .req q_d3_0
+q_d6_1 .req q_d3_1
+q_d6_2 .req q_d3_2
+q_d6_3 .req q_d3_3
+
+v_data .req v21
+q_data .req q21
+v_data_lo .req v22
+v_data_hi .req v23
+
+/*
+ * gf_6vect_mad_neon: multiply-accumulate one source buffer into six
+ * destination buffers:
+ *   dest_j[i] ^= tbl_j_lo[src[i] & 0x0f] ^ tbl_j_hi[src[i] >> 4]
+ *
+ * In:  x0 = len, x1 = vec, x2 = vec_i, x3 = gftbls, x4 = src,
+ *      x5 = array of six destination pointers
+ * Out: w0 = 0 on success, 1 when len < 16 (nothing is written)
+ */
+cdecl(gf_6vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ /* each table entry is 32 bytes (16-byte lo half + 16-byte hi half) */
+ lsl x_vec_i, x_vec_i, #5 /* byte offset of this source's entry */
+ lsl x_vec, x_vec, #5 /* byte distance between two dests' entries */
+ add x_tbl1, x_tbl, x_vec_i
+ add x_tbl2, x_tbl1, x_vec
+ add x_tbl3, x_tbl2, x_vec
+ add x_tbl4, x_tbl3, x_vec
+ add x_tbl5, x_tbl4, x_vec
+ add x_tbl6, x_tbl5, x_vec /* x_tbl6 aliases x_tbl: base is not needed again */
+ add x_src_end, x_src, x_len /* one past the last source byte */
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3]
+ ldr x_dest5, [x_dest, #8*4]
+ ldr x_dest6, [x_dest, #8*5] /* x_dest6 aliases x_dest: must be the last load */
+ /* tables 1 and 4 have dedicated registers and stay loaded; tables 2/5 and
+    3/6 share registers and are reloaded inside the loops */
+ ldr q_gft1_lo, [x_tbl1]
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft4_lo, [x_tbl4]
+ ldr q_gft4_hi, [x_tbl4, #16]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ /* (v8..v15 carry the source data and its low nibbles in the 64-byte loop) */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ /* bias the end pointer: last address a full 64-byte block can start at */
+ sub x_src_end, x_src_end, #64
+
+
+/*
+ * Main loop: 64 source bytes per iteration, processed in two passes.
+ * The accumulators of dest4..6 alias those of dest1..3, and the table
+ * registers of dest5/dest6 alias those of dest2/dest3, so pass 1 must be
+ * stored before pass 2 reloads those registers.
+ */
+.Lloop64:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ add x_src, x_src, #64
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+
+ ldr q_d2_0, [x_dest2, #16*0]
+ ldr q_d2_1, [x_dest2, #16*1]
+ ldr q_d2_2, [x_dest2, #16*2]
+ ldr q_d2_3, [x_dest2, #16*3]
+
+ ldr q_d3_0, [x_dest3, #16*0]
+ ldr q_d3_1, [x_dest3, #16*1]
+ ldr q_d3_2, [x_dest3, #16*2]
+ ldr q_d3_3, [x_dest3, #16*3]
+
+ /* reload tables 2 and 3: pass 2 of the previous iteration overwrote them */
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+
+ /* low nibbles go to separate registers ... */
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+
+ /* ... high nibbles replace the data in place (v_data_N_hi aliases v_data_N) */
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ /* dest1 */
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
+
+ /* dest2 */
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
+ eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
+ eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
+ eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
+
+ /* dest3 */
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
+ eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
+ eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
+ eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
+ eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
+ eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
+ eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
+
+ /* write back dest1..3 before their registers are reused for dest4..6 */
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ add x_dest1, x_dest1, #64
+
+ str q_d2_0, [x_dest2, #16*0]
+ str q_d2_1, [x_dest2, #16*1]
+ str q_d2_2, [x_dest2, #16*2]
+ str q_d2_3, [x_dest2, #16*3]
+ add x_dest2, x_dest2, #64
+
+ str q_d3_0, [x_dest3, #16*0]
+ str q_d3_1, [x_dest3, #16*1]
+ str q_d3_2, [x_dest3, #16*2]
+ str q_d3_3, [x_dest3, #16*3]
+ add x_dest3, x_dest3, #64
+
+ /* pass 2: same nibble registers, accumulators now hold dest4..6 */
+ ldr q_d4_0, [x_dest4, #16*0]
+ ldr q_d4_1, [x_dest4, #16*1]
+ ldr q_d4_2, [x_dest4, #16*2]
+ ldr q_d4_3, [x_dest4, #16*3]
+
+ ldr q_d5_0, [x_dest5, #16*0]
+ ldr q_d5_1, [x_dest5, #16*1]
+ ldr q_d5_2, [x_dest5, #16*2]
+ ldr q_d5_3, [x_dest5, #16*3]
+
+ ldr q_d6_0, [x_dest6, #16*0]
+ ldr q_d6_1, [x_dest6, #16*1]
+ ldr q_d6_2, [x_dest6, #16*2]
+ ldr q_d6_3, [x_dest6, #16*3]
+
+ /* tables 5 and 6 overwrite the registers that held tables 2 and 3 */
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+ ldr q_gft6_lo, [x_tbl6]
+ ldr q_gft6_hi, [x_tbl6, #16]
+
+ /* dest4 */
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
+ eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
+ eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
+ eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
+ eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
+ eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
+ eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
+
+ /* dest5 */
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
+ eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
+ eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
+ eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
+ eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
+ eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
+ eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
+ eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b
+
+ /* dest6 */
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_0_hi.16b
+ eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
+ eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_1_hi.16b
+ eor v_d6_1.16b, v_tmp_lo.16b, v_d6_1.16b
+ eor v_d6_1.16b, v_d6_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_2_hi.16b
+ eor v_d6_2.16b, v_tmp_lo.16b, v_d6_2.16b
+ eor v_d6_2.16b, v_d6_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_3_hi.16b
+ eor v_d6_3.16b, v_tmp_lo.16b, v_d6_3.16b
+ eor v_d6_3.16b, v_d6_3.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4, #16*0]
+ str q_d4_1, [x_dest4, #16*1]
+ str q_d4_2, [x_dest4, #16*2]
+ str q_d4_3, [x_dest4, #16*3]
+ add x_dest4, x_dest4, #64
+
+ str q_d5_0, [x_dest5, #16*0]
+ str q_d5_1, [x_dest5, #16*1]
+ str q_d5_2, [x_dest5, #16*2]
+ str q_d5_3, [x_dest5, #16*3]
+ add x_dest5, x_dest5, #64
+
+ str q_d6_0, [x_dest6, #16*0]
+ str q_d6_1, [x_dest6, #16*1]
+ str q_d6_2, [x_dest6, #16*2]
+ str q_d6_3, [x_dest6, #16*3]
+ add x_dest6, x_dest6, #64
+
+ /* x_src_end is biased by -64 here: loop while a full block still fits */
+ cmp x_src, x_src_end
+ bls .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ /* undo the 64-byte bias: x_src_end is one past the last source byte again */
+ add x_src_end, x_src_end, #64
+
+.Lloop16_init:
+ /* bias by 16: last address a full 16-byte block can start at */
+ sub x_src_end, x_src_end, #16
+ cmp x_src, x_src_end
+ bhi .lessthan16_init /* fewer than 16 bytes left */
+
+/*
+ * 16 bytes per iteration. Uses only v0..v7 and v16..v31 (v_data/v_data_lo/
+ * v_data_hi are v21..v23), so d8..d15 need not be saved on this path.
+ * Same two-pass scheme as the 64-byte loop: dest1..3 are stored before the
+ * shared registers are reloaded for dest4..6.
+ */
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+
+ /* split every source byte into its low and high nibble */
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ /* second pass: accumulators and the gft2/gft3 registers are reused */
+ ldr q_d4_0, [x_dest4]
+ ldr q_d5_0, [x_dest5]
+ ldr q_d6_0, [x_dest6]
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+ ldr q_gft6_lo, [x_tbl6]
+ ldr q_gft6_hi, [x_tbl6, #16]
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+ eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
+ eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
+ eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4]
+ str q_d5_0, [x_dest5]
+ str q_d6_0, [x_dest6]
+
+ add x_src, x_src, #16
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_dest3, x_dest3, #16
+ add x_dest4, x_dest4, #16
+ add x_dest5, x_dest5, #16
+ add x_dest6, x_dest6, #16
+ /* x_src_end is biased by -16 here: loop while a full block still fits */
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+/*
+ * Tail: 1..15 bytes remain. x_src_end still carries the -16 bias, so
+ * x_tmp = x_src - x_src_end = 16 - (bytes remaining); 16 means none remain.
+ */
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #16
+ beq .return_pass
+
+/*
+ * Re-run one 16-byte block that ends exactly at the end of the buffers.
+ * Its first x_tmp bytes were already updated by the loops above, so the
+ * products are ANDed with a mask that is 0x00 for those bytes and 0xff for
+ * the rest; XORing a zero leaves the already-updated bytes unchanged.
+ */
+.lessthan16:
+ mov x_src, x_src_end
+ sub x_dest1, x_dest1, x_tmp
+ sub x_dest2, x_dest2, x_tmp
+ sub x_dest3, x_dest3, x_tmp
+ sub x_dest4, x_dest4, x_tmp
+ sub x_dest5, x_dest5, x_tmp
+ sub x_dest6, x_dest6, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ /* window at const_tbl + 16 - x_tmp: x_tmp zero bytes, then 0xff bytes */
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16]
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ /* per dest: product = lo lookup ^ hi lookup, masked, then accumulated */
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ /* second pass: accumulators and the gft2/gft3 registers are reused */
+ ldr q_d4_0, [x_dest4]
+ ldr q_d5_0, [x_dest5]
+ ldr q_d6_0, [x_dest6]
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+ ldr q_gft6_lo, [x_tbl6]
+ ldr q_gft6_hi, [x_tbl6, #16]
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4]
+ str q_d5_0, [x_dest5]
+ str q_d6_0, [x_dest6]
+
+/* success: w0 = 0 */
+.return_pass:
+ mov w_ret, #0
+ ret
+
+/* len < 16: w0 = 1, no buffer has been touched */
+.return_fail:
+ mov w_ret, #1
+ ret
+
+/* ASM_DEF_RODATA presumably selects the read-only data section (macro is
+   defined outside this file - confirm in aarch64_label.h) */
+ASM_DEF_RODATA
+.balign 8
+/* 16 zero bytes followed by 16 0xff bytes. The tail code reads a 16-byte
+   window starting k bytes before the 0xff half to obtain a mask whose
+   first k bytes are 0x00 and whose remaining bytes are 0xff. */
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S
new file mode 100644
index 0000000000..c4f372cd73
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S
@@ -0,0 +1,237 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_6vect_mad_sve)
+#ifndef __APPLE__
+.type gf_6vect_mad_sve, %function
+#endif
+
+/* gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest4 .req x9
+x_dest5 .req x10
+x_dest6 .req x11
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4
+z_tmp_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_gft6_lo .req z25
+z_gft6_hi .req z26
+q_gft6_lo .req q25
+q_gft6_hi .req q26
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+z_dest6 .req z31
+
+/*
+ * gf_6vect_mad_sve: multiply-accumulate one source buffer into six
+ * destination buffers, one SVE vector of bytes per loop iteration:
+ *   dest_j[i] ^= tbl_j_lo[src[i] & 0x0f] ^ tbl_j_hi[src[i] >> 4]
+ *
+ * In:  x0 = len, x1 = vec, x2 = vec_i, x3 = gftbls, x4 = src,
+ *      x5 = array of six destination pointers
+ * Out: w0 = 0 on success, 1 when len < 16 (nothing is written)
+ * All six table pairs are loaded once before the loop and stay resident.
+ */
+cdecl(gf_6vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load with NEON instruction ldp */
+ /* (each table is a 16-byte lo half followed by a 16-byte hi half) */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+ /* load table 4 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl]
+ /* load table 5 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl]
+ /* load table 6 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft6_lo, q_gft6_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+ ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */
+ ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */
+ ldr x_dest6, [x_dest, #8*5] /* pointer to dest6 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+ /* p0 enables the lanes with x_pos + lane < x_len; exit once none remain */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ /* (z_src_hi aliases z_src, so the shift below replaces the raw data) */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+ prfb pldl2strm, p0, [x_dest4, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos]
+ ld1b z_dest4.b, p0/z, [x_dest4, x_pos]
+
+ prfb pldl2strm, p0, [x_dest5, x_pos]
+ prfb pldl2strm, p0, [x_dest6, x_pos]
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ ld1b z_dest5.b, p0/z, [x_dest5, x_pos]
+ ld1b z_dest6.b, p0/z, [x_dest6, x_pos]
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+
+ /* dest4 */
+ tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_tmp_lo.d, z_dest4.d
+ eor z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+ /* dest5 */
+ tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_tmp_lo.d, z_dest5.d
+ eor z_dest5.d, z_tmp_hi.d, z_dest5.d
+
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+
+ /* dest6 */
+ tbl z_tmp_lo.b, {z_gft6_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft6_hi.b}, z_src_hi.b
+ eor z_dest6.d, z_tmp_lo.d, z_dest6.d
+ eor z_dest6.d, z_tmp_hi.d, z_dest6.d
+
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ st1b z_dest6.b, p0, [x_dest6, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+/* success: w0 = 0 */
+.return_pass:
+ mov w_ret, #0
+ ret
+
+/* len < 16: w0 = 1, no buffer has been touched */
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S
new file mode 100644
index 0000000000..0f74873de0
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S
@@ -0,0 +1,281 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6 /* 2^6 = 64-byte alignment for the routine */
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_7vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_7vect_dot_prod_sve, %function
+#endif
+/* void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ The code also leaves a status in w0: 0 = processed, 1 = len < 16 rejected. */
+
+/* arguments */
+x_len .req x0 /* vector length: bytes in every src/dest buffer */
+x_vec .req x1 /* number of source vectors (ie. data blocks); scaled by 8 on entry */
+x_tbl .req x2 /* GF tables: 32 bytes (lo + hi nibble table) per dest/src pair */
+x_src .req x3 /* array of source buffer pointers */
+x_dest .req x4 /* array of 7 destination buffer pointers */
+
+/* returns */
+w_ret .req w0 /* aliases the low half of x_len */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the x_src pointer array */
+x_ptr .req x6 /* base address of the current source buffer */
+x_pos .req x7 /* byte position inside the buffers */
+
+x_tbl1 .req x8 /* x_tbl1..x_tbl7: running table pointer of each destination */
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_tbl5 .req x12
+x_tbl6 .req x13
+x_tbl7 .req x14
+
+x_dest1 .req x15
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+x_dest2 .req x19
+x_dest3 .req x20
+x_dest4 .req x21
+x_dest5 .req x22
+x_dest6 .req x23
+x_dest7 .req x_dest /* reused: x_dest is dead once all 7 pointers are loaded */
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src /* high nibbles overwrite the raw source bytes in place */
+
+z_dest1 .req z3
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4 /* q views are used to load the 16-byte tables with ldp */
+q_gft1_hi .req q5
+
+z_gft7_lo .req z6
+z_gft7_hi .req z7
+q_gft7_lo .req q6
+q_gft7_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used (not used by this routine) */
+z_dest7 .req z16
+
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_gft6_lo .req z25
+z_gft6_hi .req z26
+q_gft6_lo .req q25
+q_gft6_hi .req q26
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+z_dest6 .req z31
+
+cdecl(gf_7vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail (taken before the frame is built, nothing to unwind) */
+ cmp x_len, #16
+ blt .return_fail
+
+ /* save callee-saved x19..x23, the only preserved registers used here */
+ sub sp, sp, #48 /* 5 regs = 40 bytes, padded to 48 so sp stays 16-byte aligned */
+ stp x19, x20, [sp]
+ stp x21, x22, [sp, #16]
+ str x23, [sp, #32]
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* x_vec = vlen * 8: byte size of the src pointer array */
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+ ldp x_dest5, x_dest6, [x_dest, #8*4]
+ ldr x_dest7, [x_dest, #8*6] /* x_dest7 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0: lanes whose byte offset is still below x_len */
+ b.none .return_pass /* no active lane left: every byte is done */
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ /* x_ptr is loaded at the top of Loop 2, so src[vlen] is never read */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+ mov z_dest5.b, #0 /* clear z_dest5 */
+ mov z_dest6.b, #0 /* clear z_dest6 */
+ mov z_dest7.b, #0 /* clear z_dest7 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
+ add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */
+ add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr.; x_vec_i < x_vec for vlen >= 1 */
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* step to the next source pointer slot; it is dereferenced only if the loop repeats */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_gft1_hi.d, z_dest1.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_gft2_hi.d, z_dest2.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_gft3_hi.d, z_dest3.d
+
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+ ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+ prfb pldl2keep, p0, [x_tbl5]
+ prfb pldl2keep, p0, [x_tbl6]
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_gft4_hi.d, z_dest4.d
+
+ /* dest 5 */
+ tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_gft5_lo.d, z_dest5.d
+ eor z_dest5.d, z_gft5_hi.d, z_dest5.d
+
+ ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32
+ prfb pldl2keep, p0, [x_tbl7]
+
+ /* dest 6 */
+ tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+ tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+ eor z_dest6.d, z_gft6_lo.d, z_dest6.d
+ eor z_dest6.d, z_gft6_hi.d, z_dest6.d
+
+ /* dest 7 */
+ tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b
+ tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b
+ eor z_dest7.d, z_gft7_lo.d, z_dest7.d
+ eor z_dest7.d, z_gft7_hi.d, z_dest7.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ st1b z_dest6.b, p0, [x_dest6, x_pos]
+ st1b z_dest7.b, p0, [x_dest7, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ /* restore x19..x23 from the 48-byte frame built in the prologue */
+ ldr x23, [sp, #32]
+ ldp x21, x22, [sp, #16]
+ ldp x19, x20, [sp]
+ add sp, sp, #48
+
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1 /* reached only before the frame is built, so sp is untouched */
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S
new file mode 100644
index 0000000000..20768f4889
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S
@@ -0,0 +1,307 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6 /* 2^6 = 64-byte alignment for the routine */
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_8vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_8vect_dot_prod_sve, %function
+#endif
+/* void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ The code also leaves a status in w0: 0 = processed, 1 = len < 16 rejected. */
+
+/* arguments */
+x_len .req x0 /* vector length: bytes in every src/dest buffer */
+x_vec .req x1 /* number of source vectors (ie. data blocks); scaled by 8 on entry */
+x_tbl .req x2 /* GF tables: 32 bytes (lo + hi nibble table) per dest/src pair */
+x_src .req x3 /* array of source buffer pointers */
+x_dest .req x4 /* array of 8 destination buffer pointers */
+
+/* returns */
+w_ret .req w0 /* aliases the low half of x_len */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the x_src pointer array */
+x_ptr .req x6 /* base address of the current source buffer */
+x_pos .req x7 /* byte position inside the buffers */
+
+x_tbl1 .req x8 /* x_tbl1..x_tbl8: running table pointer of each destination */
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_tbl5 .req x12
+x_tbl6 .req x13
+x_tbl7 .req x14
+
+x_dest1 .req x15
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+x_dest2 .req x19
+x_dest3 .req x20
+x_dest4 .req x21
+x_dest5 .req x22
+x_dest6 .req x23
+x_dest7 .req x24
+x_dest8 .req x_dest /* reused: x_dest is dead once all 8 pointers are loaded */
+x_tbl8 .req x25
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src /* high nibbles overwrite the raw source bytes in place */
+
+z_dest1 .req z3
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4 /* q views are used to load the 16-byte tables with ldp */
+q_gft1_hi .req q5
+
+z_gft7_lo .req z6
+z_gft7_hi .req z7
+q_gft7_lo .req q6
+q_gft7_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used: z8..z10 below need d8..d10 saved */
+z_dest7 .req z8
+
+z_gft8_lo .req z9
+z_gft8_hi .req z10
+q_gft8_lo .req q9
+q_gft8_hi .req q10
+
+z_dest8 .req z16
+
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_gft6_lo .req z25
+z_gft6_hi .req z26
+q_gft6_lo .req q25
+q_gft6_hi .req q26
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+z_dest6 .req z31
+
+cdecl(gf_8vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail (taken before the frame is built, nothing to unwind) */
+ cmp x_len, #16
+ blt .return_fail
+
+ /* save callee-saved x19..x25 and the low 64 bits of v8..v10 (z8..z10 are used below) */
+ sub sp, sp, #80 /* ten 8-byte slots; sp stays 16-byte aligned */
+ stp x19, x20, [sp]
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48] /* d8 -> [sp, #48], d9 -> [sp, #56] */
+ str d10, [sp, #72] /* own slot: storing at #56 would overwrite the saved d9 */
+ str x25, [sp, #64]
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* x_vec = vlen * 8: byte size of the src pointer array */
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+ ldp x_dest5, x_dest6, [x_dest, #8*4]
+ ldp x_dest7, x_dest8, [x_dest, #8*6] /* x_dest8 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0: lanes whose byte offset is still below x_len */
+ b.none .return_pass /* no active lane left: every byte is done */
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ /* x_ptr is loaded at the top of Loop 2, so src[vlen] is never read */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+ mov z_dest5.b, #0 /* clear z_dest5 */
+ mov z_dest6.b, #0 /* clear z_dest6 */
+ mov z_dest7.b, #0 /* clear z_dest7 */
+ mov z_dest8.b, #0 /* clear z_dest8 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
+ add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */
+ add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */
+ add x_tbl8, x_tbl7, x_vec, LSL #2 /* reset x_tbl8 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr.; x_vec_i < x_vec for vlen >= 1 */
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* step to the next source pointer slot; it is dereferenced only if the loop repeats */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_gft1_hi.d, z_dest1.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_gft2_hi.d, z_dest2.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_gft3_hi.d, z_dest3.d
+
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+ ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+ prfb pldl2keep, p0, [x_tbl5]
+ prfb pldl2keep, p0, [x_tbl6]
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_gft4_hi.d, z_dest4.d
+
+ /* dest 5 */
+ tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_gft5_lo.d, z_dest5.d
+ eor z_dest5.d, z_gft5_hi.d, z_dest5.d
+
+ ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32
+ ldp q_gft8_lo, q_gft8_hi, [x_tbl8], #32
+ prfb pldl2keep, p0, [x_tbl7]
+ prfb pldl2keep, p0, [x_tbl8]
+
+ /* dest 6 */
+ tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+ tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+ eor z_dest6.d, z_gft6_lo.d, z_dest6.d
+ eor z_dest6.d, z_gft6_hi.d, z_dest6.d
+
+ /* dest 7 */
+ tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b
+ tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b
+ eor z_dest7.d, z_gft7_lo.d, z_dest7.d
+ eor z_dest7.d, z_gft7_hi.d, z_dest7.d
+
+ /* dest 8 */
+ tbl z_gft8_lo.b, {z_gft8_lo.b}, z_src_lo.b
+ tbl z_gft8_hi.b, {z_gft8_hi.b}, z_src_hi.b
+ eor z_dest8.d, z_gft8_lo.d, z_dest8.d
+ eor z_dest8.d, z_gft8_hi.d, z_dest8.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ st1b z_dest6.b, p0, [x_dest6, x_pos]
+ st1b z_dest7.b, p0, [x_dest7, x_pos]
+ st1b z_dest8.b, p0, [x_dest8, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ /* restore x19..x25 and d8..d10 from the same slots the prologue used */
+ ldr x25, [sp, #64]
+ ldr d10, [sp, #72]
+ ldp d8, d9, [sp, #48]
+ ldp x23, x24, [sp, #32]
+ ldp x21, x22, [sp, #16]
+ ldp x19, x20, [sp]
+ add sp, sp, #80
+
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1 /* reached only before the frame is built, so sp is untouched */
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S
new file mode 100644
index 0000000000..4d17362894
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S
@@ -0,0 +1,303 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0 /* bytes in every src buffer and in dest */
+x_vec .req x1 /* number of source vectors; scaled by 8 on entry */
+x_tbl .req x2 /* GF tables: 32 bytes (lo + hi nibble table) per source */
+x_src .req x3 /* array of source buffer pointers */
+x_dest1 .req x4 /* the single destination buffer */
+
+/* returns */
+w_ret .req w0 /* 0 = processed, 1 = len < 16 rejected */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the x_src pointer array */
+x_ptr .req x6 /* current read / write pointer */
+x_pos .req x7 /* byte position inside the buffers */
+x_tmp .req x8
+x_tbl1 .req x9 /* running table pointer */
+
+/* vectors */
+v_gft1_lo .req v0
+v_gft1_hi .req v1
+q_gft1_lo .req q0
+q_gft1_hi .req q1
+v_mask0f .req v2
+q_mask0f .req q2
+
+v_data_0 .req v8 /* v8..v15: d8..d15 are saved before the 128-byte loop */
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+v_data_4 .req v12
+v_data_5 .req v13
+v_data_6 .req v14
+v_data_7 .req v15
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+q_data_4 .req q12
+q_data_5 .req q13
+q_data_6 .req q14
+q_data_7 .req q15
+
+v_data_0_lo .req v16
+v_data_1_lo .req v17
+v_data_2_lo .req v18
+v_data_3_lo .req v19
+v_data_4_lo .req v20
+v_data_5_lo .req v21
+v_data_6_lo .req v22
+v_data_7_lo .req v23
+v_data_0_hi .req v_data_0 /* high nibbles overwrite the raw data registers */
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+v_data_4_hi .req v_data_4
+v_data_5_hi .req v_data_5
+v_data_6_hi .req v_data_6
+v_data_7_hi .req v_data_7
+
+v_p0 .req v24 /* v_p0..v_p7: accumulators of the 128-byte loop */
+v_p1 .req v25
+v_p2 .req v26
+v_p3 .req v27
+v_p4 .req v28
+v_p5 .req v29
+v_p6 .req v30
+v_p7 .req v31
+q_p0 .req q24
+q_p1 .req q25
+q_p2 .req q26
+q_p3 .req q27
+q_p4 .req q28
+q_p5 .req q29
+q_p6 .req q30
+q_p7 .req q31
+
+v_p .req v_p0 /* 16-byte loop reuses v24..v27 under these names */
+q_p .req q_p0
+v_data .req v_p1
+q_data .req q_p1
+v_data_lo .req v_p2
+v_data_hi .req v_p3
+
+
+cdecl(gf_vect_dot_prod_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f /* low-nibble mask in every byte */
+ mov x_pos, #0 /* x_pos: bytes of output produced so far */
+
+ lsl x_vec, x_vec, #3 /* x_vec = vlen * 8: byte size of the src pointer array */
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop16_init */
+ cmp x_len, #128
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #128 /* x_len = last x_pos at which a whole 128-byte block fits */
+
+.Lloop128:
+ movi v_p0.16b, #0 /* clear the eight 16-byte accumulators */
+ movi v_p1.16b, #0
+ movi v_p2.16b, #0
+ movi v_p3.16b, #0
+ movi v_p4.16b, #0
+ movi v_p5.16b, #0
+ movi v_p6.16b, #0
+ movi v_p7.16b, #0
+
+ mov x_tbl1, x_tbl /* restart at the first source's table */
+ mov x_vec_i, #0
+
+.Lloop128_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* base of the current source buffer */
+ add x_vec_i, x_vec_i, #8
+ add x_ptr, x_ptr, x_pos
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* lo/hi nibble tables, then advance to next source */
+
+ ldp q_data_0, q_data_1, [x_ptr], #32 /* 128 bytes of source data */
+ ldp q_data_2, q_data_3, [x_ptr], #32
+ ldp q_data_4, q_data_5, [x_ptr], #32
+ ldp q_data_6, q_data_7, [x_ptr]
+
+ prfm pldl1keep, [x_tbl1] /* hint: next source's tables */
+ prfm pldl1strm, [x_ptr]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b /* low nibble of each byte */
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
+ and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
+ and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
+ and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4 /* high nibble, written over the raw data */
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+ ushr v_data_4_hi.16b, v_data_4.16b, #4
+ ushr v_data_5_hi.16b, v_data_5.16b, #4
+ ushr v_data_6_hi.16b, v_data_6.16b, #4
+ ushr v_data_7_hi.16b, v_data_7.16b, #4
+
+ tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b /* table lookup on low nibbles */
+ tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
+ tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
+ tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
+ tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
+
+ tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b /* table lookup on high nibbles */
+ tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
+ tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
+ tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
+ tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
+
+ eor v_p0.16b, v_data_0_lo.16b, v_p0.16b /* accumulate both lookups into the running sums */
+ eor v_p0.16b, v_p0.16b, v_data_0_hi.16b
+ eor v_p1.16b, v_data_1_lo.16b, v_p1.16b
+ eor v_p1.16b, v_p1.16b, v_data_1_hi.16b
+ eor v_p2.16b, v_data_2_lo.16b, v_p2.16b
+ eor v_p2.16b, v_p2.16b, v_data_2_hi.16b
+ eor v_p3.16b, v_data_3_lo.16b, v_p3.16b
+ eor v_p3.16b, v_p3.16b, v_data_3_hi.16b
+ eor v_p4.16b, v_data_4_lo.16b, v_p4.16b
+ eor v_p4.16b, v_p4.16b, v_data_4_hi.16b
+ eor v_p5.16b, v_data_5_lo.16b, v_p5.16b
+ eor v_p5.16b, v_p5.16b, v_data_5_hi.16b
+ eor v_p6.16b, v_data_6_lo.16b, v_p6.16b
+ eor v_p6.16b, v_p6.16b, v_data_6_hi.16b
+ eor v_p7.16b, v_data_7_lo.16b, v_p7.16b
+ eor v_p7.16b, v_p7.16b, v_data_7_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop128_vects /* more source vectors for this block */
+
+.Lloop128_vects_end:
+ add x_ptr, x_dest1, x_pos /* write the finished 128 bytes to dest + x_pos */
+ stp q_p0, q_p1, [x_ptr], #32
+ stp q_p2, q_p3, [x_ptr], #32
+ stp q_p4, q_p5, [x_ptr], #32
+ stp q_p6, q_p7, [x_ptr]
+
+ add x_pos, x_pos, #128
+ cmp x_pos, x_len
+ ble .Lloop128 /* another whole 128-byte block still fits */
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #128 /* undo the bias applied before the 128-byte loop */
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ sub x_len, x_len, #16 /* x_len = last x_pos at which a whole 16-byte block fits */
+ cmp x_pos, x_len
+ bgt .lessthan16_init /* fewer than 16 bytes left: go redo the final 16 bytes */
+
+.Lloop16:
+ movi v_p.16b, #0 /* clear the single accumulator */
+ mov x_tbl1, x_tbl
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* base of the current source buffer */
+ ldr q_data, [x_ptr, x_pos]
+ add x_vec_i, x_vec_i, #8
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_p.16b, v_data_lo.16b, v_p.16b
+ eor v_p.16b, v_p.16b, v_data_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q_p, [x_dest1, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len /* x_len is still biased by -16 here */
+ cmp x_tmp, #16
+ beq .return_pass /* x_pos reached the original length exactly */
+
+.lessthan16_init:
+ mov x_pos, x_len /* back up to (length - 16): the tail block overlaps bytes already written */
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S
new file mode 100644
index 0000000000..48ce151fde
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S
@@ -0,0 +1,132 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6 /* 2^6 = 64-byte alignment for the routine */
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_vect_dot_prod_sve, %function
+#endif
+/* void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+ The code also leaves a status in w0: 0 = processed, 1 = len < 16 rejected. */
+
+/* arguments */
+x_len .req x0 /* vector length: bytes in every src buffer and in dest */
+x_vec .req x1 /* number of source vectors (ie. data blocks); scaled by 8 on entry */
+x_tbl .req x2 /* GF tables: 32 bytes (lo + hi nibble table) per source */
+x_src .req x3 /* array of source buffer pointers */
+x_dest1 .req x4 /* the single destination buffer */
+
+/* returns */
+w_ret .req w0 /* aliases the low half of x_len */
+
+/* local variables */
+x_vec_i .req x5 /* byte offset into the x_src pointer array */
+x_ptr .req x6 /* base address of the current source buffer */
+x_pos .req x7 /* byte position inside the buffers */
+x_tbl1 .req x8 /* running table pointer */
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src /* high nibbles overwrite the raw source bytes in place */
+
+z_dest .req z3
+
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4 /* q views are used to load the 16-byte tables with ldp */
+q_gft1_hi .req q5
+
+cdecl(gf_vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3 /* x_vec = vlen * 8: byte size of the src pointer array */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0: lanes whose byte offset is still below x_len */
+ b.none .return_pass /* no active lane left: every byte is done */
+
+ mov z_dest.b, #0 /* clear z_dest */
+ mov x_vec_i, #0 /* clear x_vec_i */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+
+ /* load gf_table */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is added by #32
+ for each src vect */
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4 /* z_src_hi aliases z_src: raw bytes are gone after this */
+
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest.d, z_gft1_lo.d, z_dest.d
+ eor z_dest.d, z_gft1_hi.d, z_dest.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects /* more source vectors for this position */
+
+ /* end of Loop 2 */
+ /* store dest data, governed by p0 */
+ st1b z_dest.b, p0, [x_dest1, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S
new file mode 100644
index 0000000000..bc2b957820
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S
@@ -0,0 +1,324 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_vect_mad_neon)
+#ifndef __APPLE__
+.type gf_vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_src_end .req x6
+x_dest1 .req x_dest
+x_tmp .req x7
+x_const .req x8
+
+/* vectors */
+v_mask0f .req v0
+v_tmp .req v1
+q_tmp .req q1
+
+v_tmp1_lo .req v2
+v_tmp1_hi .req v3
+v_tmp2_lo .req v4
+v_tmp2_hi .req v5
+
+v_gft1_lo .req v6
+v_gft1_hi .req v7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+v_data_4 .req v12
+v_data_5 .req v13
+v_data_6 .req v14
+v_data_7 .req v15
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+q_data_4 .req q12
+q_data_5 .req q13
+q_data_6 .req q14
+q_data_7 .req q15
+
+v_data_0_lo .req v16
+v_data_1_lo .req v17
+v_data_2_lo .req v18
+v_data_3_lo .req v19
+v_data_4_lo .req v20
+v_data_5_lo .req v21
+v_data_6_lo .req v22
+v_data_7_lo .req v23
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+v_data_4_hi .req v_data_4
+v_data_5_hi .req v_data_5
+v_data_6_hi .req v_data_6
+v_data_7_hi .req v_data_7
+
+v_d1_0 .req v24
+v_d1_1 .req v25
+v_d1_2 .req v26
+v_d1_3 .req v27
+v_d1_4 .req v28
+v_d1_5 .req v29
+v_d1_6 .req v30
+v_d1_7 .req v31
+q_d1_0 .req q24
+q_d1_1 .req q25
+q_d1_2 .req q26
+q_d1_3 .req q27
+q_d1_4 .req q28
+q_d1_5 .req q29
+q_d1_6 .req q30
+q_d1_7 .req q31
+
+v_data .req v_d1_1
+q_data .req q_d1_1
+v_data_lo .req v_d1_2
+v_data_hi .req v_d1_3
+
+
+cdecl(gf_vect_mad_neon):
+ /* GF(2^8) multiply-accumulate: dest[i] ^= table_mul(src[i]) for
+ x_len bytes, using the 32-byte table at x_tbl + x_vec_i * 32
+ (16 bytes indexed by the low nibble, 16 by the high nibble).
+ Bulk work is done 128 bytes per pass, then 16 bytes per pass; a
+ final partial chunk re-reads the last 16 bytes and masks off the
+ part that was already accumulated. x_vec is not read here.
+ Sets w_ret to 0 on success and to 1 on failure.
+ NOTE(review): presumably called with the same C prototype as
+ gf_vect_mad_sve (int len, int vec, int vec_i, ...) - confirm.
+ Guard: less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5 /* x_vec_i *= 32: byte offset of this source's table */
+ add x_tbl, x_tbl, x_vec_i
+ add x_src_end, x_src, x_len /* one past the last source byte */
+
+ ldr q_gft1_lo, [x_tbl] /* table indexed by the low nibble */
+ ldr q_gft1_hi, [x_tbl, #16] /* table indexed by the high nibble */
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop16_init */
+ cmp x_len, #128
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack (v8-v15 hold v_data_0..7 in the 128-byte loop) */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #128 /* bias: x_src <= x_src_end now means 128 bytes remain */
+
+.Lloop128:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ ldr q_data_4, [x_src, #16*4]
+ ldr q_data_5, [x_src, #16*5]
+ ldr q_data_6, [x_src, #16*6]
+ ldr q_data_7, [x_src, #16*7]
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+ ldr q_d1_4, [x_dest1, #16*4]
+ ldr q_d1_5, [x_dest1, #16*5]
+ ldr q_d1_6, [x_dest1, #16*6]
+ ldr q_d1_7, [x_dest1, #16*7]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b /* low nibbles of each source byte */
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
+ and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
+ and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
+ and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4 /* high nibbles; v_data_N_hi aliases v_data_N (in place) */
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+ ushr v_data_4_hi.16b, v_data_4.16b, #4
+ ushr v_data_5_hi.16b, v_data_5.16b, #4
+ ushr v_data_6_hi.16b, v_data_6.16b, #4
+ ushr v_data_7_hi.16b, v_data_7.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b /* two 16-byte lanes per group share the four v_tmp registers */
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+
+ eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b /* dest ^= lo lookup ^ hi lookup */
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
+ eor v_d1_1.16b, v_tmp2_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+
+ eor v_d1_2.16b, v_tmp1_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b
+ eor v_d1_3.16b, v_tmp2_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
+ tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
+ tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
+
+ eor v_d1_4.16b, v_tmp1_lo.16b, v_d1_4.16b
+ eor v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b
+ eor v_d1_5.16b, v_tmp2_lo.16b, v_d1_5.16b
+ eor v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
+ tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
+ tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
+
+ eor v_d1_6.16b, v_tmp1_lo.16b, v_d1_6.16b
+ eor v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b
+ eor v_d1_7.16b, v_tmp2_lo.16b, v_d1_7.16b
+ eor v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ str q_d1_4, [x_dest1, #16*4]
+ str q_d1_5, [x_dest1, #16*5]
+ str q_d1_6, [x_dest1, #16*6]
+ str q_d1_7, [x_dest1, #16*7]
+
+ add x_src, x_src, #128
+ add x_dest1, x_dest1, #128
+ cmp x_src, x_src_end
+ bls .Lloop128 /* unsigned: another full 128 bytes available */
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #128 /* undo the 128-byte bias */
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16 /* bias: x_src <= x_src_end now means 16 bytes remain */
+ cmp x_src, x_src_end
+ bhi .lessthan16_init
+
+.Lloop16:
+ ldr q_data, [x_src] /* q_data/v_data_lo/v_data_hi reuse v25-v27, free outside the 128-byte loop */
+ ldr q_d1_0, [x_dest1]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
+
+ str q_d1_0, [x_dest1]
+
+ add x_dest1, x_dest1, #16
+ add x_src, x_src, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end /* x_tmp = 16 - bytes left (x_src_end is biased by -16) */
+ cmp x_tmp, #16
+ beq .return_pass /* nothing left over */
+
+.lessthan16:
+ mov x_src, x_src_end /* step back to the final 16 source bytes */
+ sub x_dest1, x_dest1, x_tmp /* same backward step for dest */
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16] /* mask from const_tbl + 16 - x_tmp: x_tmp zero bytes, then 0xff bytes */
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
+ and v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp.16b /* zero the lookups for bytes already accumulated */
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
+
+ str q_d1_0, [x_dest1]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S
new file mode 100644
index 0000000000..41d6da9d9a
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S
@@ -0,0 +1,126 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_vect_mad_sve)
+#ifndef __APPLE__
+.type gf_vect_mad_sve, %function
+#endif
+
+/* gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest);
+ */
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest .req z3
+
+z_tmp1_lo .req z4
+z_tmp1_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+cdecl(gf_vect_mad_sve):
+ /* GF(2^8) multiply-accumulate, vector-length agnostic:
+ dest[i] ^= table_mul(src[i]) for len bytes, using the 32-byte table
+ at gftbls + vec_i * 32 (16 bytes indexed by the low nibble, 16 by
+ the high nibble). x_vec is not read here.
+ Sets w_ret to 0 on success and to 1 when len is below 16. */
+
+ /* len and vec_i are C ints: AAPCS64 leaves bits 63:32 of x0/x2
+ unspecified, so sign-extend them before any 64-bit use */
+ sxtw x_len, w0
+ sxtw x_vec_i, w2
+
+ /* less than 16 bytes (or negative), return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load with NEON instruction ldp; the nibble indices used below are
+ 0..15, so only these 16 bytes of each z register are looked up */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+
+ mov x_pos, #0 /* x_pos: byte offset processed so far */
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0: lanes whose offset is still below x_len */
+ b.none .return_pass /* no active lane: every byte has been updated */
+
+ /* prefetch dest data */
+ prfb pldl2strm, p0, [x_dest, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi (z_src_hi aliases z_src, so the shift is in place) */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest.b, p0/z, [x_dest, x_pos]
+
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest.d, z_tmp1_lo.d, z_dest.d
+ eor z_dest.d, z_tmp1_hi.d, z_dest.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest.b, p0, [x_dest, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S
new file mode 100644
index 0000000000..096b91dd29
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S
@@ -0,0 +1,240 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_vect_mul_neon)
+#ifndef __APPLE__
+.type gf_vect_mul_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_tbl .req x1
+x_src .req x2
+x_dest .req x3
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_dest1 .req x_dest
+x_src_end .req x4
+x_tmp .req x5
+
+/* vectors */
+v_mask0f .req v0
+
+v_gft1_lo .req v2
+v_gft1_hi .req v3
+q_gft1_lo .req q2
+q_gft1_hi .req q3
+
+v_data_0 .req v16
+v_data_1 .req v17
+v_data_2 .req v18
+v_data_3 .req v19
+v_data_4 .req v20
+v_data_5 .req v21
+v_data_6 .req v22
+v_data_7 .req v23
+q_data_0 .req q16
+q_data_1 .req q17
+q_data_2 .req q18
+q_data_3 .req q19
+q_data_4 .req q20
+q_data_5 .req q21
+q_data_6 .req q22
+q_data_7 .req q23
+
+v_data_0_lo .req v24
+v_data_1_lo .req v25
+v_data_2_lo .req v26
+v_data_3_lo .req v27
+v_data_4_lo .req v28
+v_data_5_lo .req v29
+v_data_6_lo .req v30
+v_data_7_lo .req v31
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+v_data_4_hi .req v_data_4
+v_data_5_hi .req v_data_5
+v_data_6_hi .req v_data_6
+v_data_7_hi .req v_data_7
+
+
+cdecl(gf_vect_mul_neon):
+ /* GF(2^8) constant multiply: dest[i] = table_mul(src[i]) for x_len
+ bytes, using the 32-byte table at x_tbl (16 bytes indexed by the
+ low nibble, 16 by the high nibble). Work is done 128 bytes per
+ pass, then 32 bytes per pass.
+ Sets w_ret to 0 on success; to 1 when x_len is below 32 or is not
+ a multiple of 32 (complete 128/32-byte groups that precede the
+ remainder have already been written by then).
+ Only v0, v2, v3 and v16-v31 are used; AAPCS64 treats all of them as
+ caller-saved, so nothing is spilled to the stack. */
+
+ /* len is passed as a C int: AAPCS64 leaves bits 63:32 of x0
+ unspecified, so sign-extend it before any 64-bit use */
+ sxtw x_len, w0
+
+ /* less than 32 bytes (or negative), return_fail */
+ cmp x_len, #32
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ add x_src_end, x_src, x_len /* one past the last source byte */
+ ldr q_gft1_lo, [x_tbl] /* table indexed by the low nibble */
+ ldr q_gft1_hi, [x_tbl, #16] /* table indexed by the high nibble */
+
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop32_init */
+ cmp x_len, #128
+ blt .Lloop32_init
+
+ sub x_src_end, x_src_end, #128 /* bias: x_src <= x_src_end now means 128 bytes remain */
+
+.Lloop128:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ ldr q_data_4, [x_src, #16*4]
+ ldr q_data_5, [x_src, #16*5]
+ ldr q_data_6, [x_src, #16*6]
+ ldr q_data_7, [x_src, #16*7]
+
+ /* low nibbles of each source byte */
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
+ and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
+ and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
+ and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
+
+ /* high nibbles; v_data_N_hi aliases v_data_N, so the shift is in place */
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+ ushr v_data_4_hi.16b, v_data_4.16b, #4
+ ushr v_data_5_hi.16b, v_data_5.16b, #4
+ ushr v_data_6_hi.16b, v_data_6.16b, #4
+ ushr v_data_7_hi.16b, v_data_7.16b, #4
+
+ /* table lookups, ie. gf(2^8) multiplication, done in place */
+ tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
+ tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
+ tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
+ tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
+
+ tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
+ tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
+ tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
+ tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
+
+ /* exclusive or, ie. gf(2^8) add of the two partial products */
+ eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
+ eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
+ eor v_data_2.16b, v_data_2_hi.16b, v_data_2_lo.16b
+ eor v_data_3.16b, v_data_3_hi.16b, v_data_3_lo.16b
+ eor v_data_4.16b, v_data_4_hi.16b, v_data_4_lo.16b
+ eor v_data_5.16b, v_data_5_hi.16b, v_data_5_lo.16b
+ eor v_data_6.16b, v_data_6_hi.16b, v_data_6_lo.16b
+ eor v_data_7.16b, v_data_7_hi.16b, v_data_7_lo.16b
+
+ str q_data_0, [x_dest1, #16*0]
+ str q_data_1, [x_dest1, #16*1]
+ str q_data_2, [x_dest1, #16*2]
+ str q_data_3, [x_dest1, #16*3]
+ str q_data_4, [x_dest1, #16*4]
+ str q_data_5, [x_dest1, #16*5]
+ str q_data_6, [x_dest1, #16*6]
+ str q_data_7, [x_dest1, #16*7]
+
+ add x_src, x_src, #128
+ add x_dest1, x_dest1, #128
+ cmp x_src, x_src_end
+ bls .Lloop128 /* unsigned: another full 128 bytes available */
+
+.Lloop128_end:
+ add x_src_end, x_src_end, #128 /* undo the 128-byte bias */
+ cmp x_src, x_src_end
+ beq .return_pass /* len was a multiple of 128 */
+
+.Lloop32_init:
+ sub x_src_end, x_src_end, #32 /* bias: x_src <= x_src_end now means 32 bytes remain */
+ cmp x_src, x_src_end
+ bhi .return_fail /* fewer than 32 bytes left: len not a multiple of 32 */
+
+.Lloop32:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
+ eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
+ str q_data_0, [x_dest1, #16*0]
+ str q_data_1, [x_dest1, #16*1]
+
+ add x_dest1, x_dest1, #32
+ add x_src, x_src, #32
+ cmp x_src, x_src_end
+ bls .Lloop32
+
+.Lloop32_end:
+ sub x_tmp, x_src, x_src_end /* equals 32 exactly when no bytes are left over */
+ cmp x_tmp, #32
+ beq .return_pass
+ b .return_fail
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S
new file mode 100644
index 0000000000..d2219bf54c
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S
@@ -0,0 +1,123 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_vect_mul_sve)
+#ifndef __APPLE__
+.type gf_vect_mul_sve, %function
+#endif
+
+/* Refer to include/gf_vect_mul.h
+ *
+ * @param len Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src Pointer to src data array. Must be aligned to 32B.
+ * @param dest Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ *
+ * int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest);
+ */
+
+/* arguments */
+x_len .req x0
+x_tbl .req x1
+x_src .req x2
+x_dest .req x3
+x_tmp .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x5
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src /* reuse */
+
+z_dest .req z3
+z_tmp1_lo .req z4
+z_tmp1_hi .req z_dest /* reuse */
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+cdecl(gf_vect_mul_sve):
+ /* GF(2^8) constant multiply, vector-length agnostic:
+ dest[i] = table_mul(src[i]) for len bytes, using the 32-byte table
+ at x_tbl (16 bytes indexed by the low nibble, 16 by the high nibble).
+ Sets w_ret to 0 on success and to 1 when len is negative or not a
+ multiple of 32; nothing is written in the failure case. */
+
+ /* len is a C int: AAPCS64 leaves bits 63:32 of x0 unspecified,
+ so sign-extend it before any 64-bit use */
+ sxtw x_len, w0
+
+ /* negative len, return_fail: the unsigned whilelo below would
+ otherwise treat it as a huge byte count */
+ cmp x_len, #0
+ blt .return_fail
+
+ /* len not aligned to 32B, return_fail */
+ tst x_len, #0x1f
+ bne .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0 /* x_pos: byte offset processed so far */
+
+ /* Load with NEON instruction ldp; the nibble indices used below are
+ 0..15, so only these 16 bytes of each z register are looked up */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len /* p0: lanes whose offset is still below x_len */
+ b.none .return_pass /* no active lane: every byte has been written */
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi (z_src_hi aliases z_src, so the shift is in place) */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* table indexing, ie. gf(2^8) multiplication
+ (z_tmp1_hi aliases z_dest, so the hi lookup lands in z_dest) */
+ tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest.d, z_tmp1_hi.d, z_tmp1_lo.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest.b, p0, [x_dest, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ya.make b/contrib/libs/isa-l/erasure_code/aarch64/ya.make
new file mode 100644
index 0000000000..ba7f601cfa
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/ya.make
@@ -0,0 +1,51 @@
+LIBRARY()
+
+LICENSE(BSD-3-Clause)
+
+LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+
+VERSION(2.31)
+
+NO_UTIL()
+
+NO_COMPILER_WARNINGS()
+
+ADDINCL(
+ contrib/libs/isa-l/include
+)
+
+IF(ARCH_AARCH64)
+CFLAGS(-D__ASSEMBLY__)
+SRCS(
+ ec_multibinary_arm.S
+ gf_2vect_dot_prod_neon.S
+ gf_2vect_dot_prod_sve.S
+ gf_2vect_mad_neon.S
+ gf_2vect_mad_sve.S
+ gf_3vect_dot_prod_neon.S
+ gf_3vect_dot_prod_sve.S
+ gf_3vect_mad_neon.S
+ gf_3vect_mad_sve.S
+ gf_4vect_dot_prod_neon.S
+ gf_4vect_dot_prod_sve.S
+ gf_4vect_mad_neon.S
+ gf_4vect_mad_sve.S
+ gf_5vect_dot_prod_neon.S
+ gf_5vect_dot_prod_sve.S
+ gf_5vect_mad_neon.S
+ gf_5vect_mad_sve.S
+ gf_6vect_dot_prod_sve.S
+ gf_6vect_mad_neon.S
+ gf_6vect_mad_sve.S
+ gf_7vect_dot_prod_sve.S
+ gf_8vect_dot_prod_sve.S
+ gf_vect_dot_prod_neon.S
+ gf_vect_dot_prod_sve.S
+ gf_vect_mad_neon.S
+ gf_vect_mad_sve.S
+ gf_vect_mul_neon.S
+ gf_vect_mul_sve.S
+)
+ENDIF()
+
+END()
diff --git a/contrib/libs/isa-l/erasure_code/ec_base.c b/contrib/libs/isa-l/erasure_code/ec_base.c
index 9a8fbc759e..c076b517bf 100644
--- a/contrib/libs/isa-l/erasure_code/ec_base.c
+++ b/contrib/libs/isa-l/erasure_code/ec_base.c
@@ -29,10 +29,12 @@
#include <limits.h>
#include <string.h> // for memset
+#include <stdint.h>
+
#include "erasure_code.h"
#include "ec_base.h" // for GF tables
-void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
+void ec_init_tables_base(int k, int rows, unsigned char *a, unsigned char *g_tbls)
{
int i, j;
@@ -171,7 +173,7 @@ void gf_vect_mul_init(unsigned char c, unsigned char *tbl)
unsigned char c4 = (c2 << 1) ^ ((c2 & 0x80) ? 0x1d : 0); //Mult by GF{2}
unsigned char c8 = (c4 << 1) ^ ((c4 & 0x80) ? 0x1d : 0); //Mult by GF{2}
-#if __WORDSIZE == 64 || _WIN64 || __x86_64__
+#if (__WORDSIZE == 64 || _WIN64 || __x86_64__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
unsigned long long v1, v2, v4, v8, *t;
unsigned long long v10, v20, v40, v80;
unsigned char c17, c18, c20, c24;
@@ -331,41 +333,17 @@ void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned ch
}
}
-void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest)
+int gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest)
{
//2nd element of table array is ref value used to fill it in
unsigned char c = a[1];
+
+ // Len must be aligned to 32B
+ if ((len % 32) != 0) {
+ return -1;
+ }
+
while (len-- > 0)
*dest++ = gf_mul_erasure(c, *src++);
+ return 0;
}
-
-struct slver {
- unsigned short snum;
- unsigned char ver;
- unsigned char core;
-};
-
-// Version info
-struct slver gf_vect_mul_init_slver_00020035;
-struct slver gf_vect_mul_init_slver = { 0x0035, 0x02, 0x00 };
-
-struct slver ec_encode_data_base_slver_00010135;
-struct slver ec_encode_data_base_slver = { 0x0135, 0x01, 0x00 };
-
-struct slver gf_vect_mul_base_slver_00010136;
-struct slver gf_vect_mul_base_slver = { 0x0136, 0x01, 0x00 };
-
-struct slver gf_vect_dot_prod_base_slver_00010137;
-struct slver gf_vect_dot_prod_base_slver = { 0x0137, 0x01, 0x00 };
-
-struct slver gf_mul_slver_00000214;
-struct slver gf_mul_slver = { 0x0214, 0x00, 0x00 };
-
-struct slver gf_invert_matrix_slver_00000215;
-struct slver gf_invert_matrix_slver = { 0x0215, 0x00, 0x00 };
-
-struct slver gf_gen_rs_matrix_slver_00000216;
-struct slver gf_gen_rs_matrix_slver = { 0x0216, 0x00, 0x00 };
-
-struct slver gf_gen_cauchy1_matrix_slver_00000217;
-struct slver gf_gen_cauchy1_matrix_slver = { 0x0217, 0x00, 0x00 };
diff --git a/contrib/libs/isa-l/erasure_code/ec_base.h b/contrib/libs/isa-l/erasure_code/ec_base.h
index 070b276652..ace384968b 100644
--- a/contrib/libs/isa-l/erasure_code/ec_base.h
+++ b/contrib/libs/isa-l/erasure_code/ec_base.h
@@ -30,6 +30,77 @@
#ifndef _EC_BASE_H_
#define _EC_BASE_H_
+#include <stdint.h>
+
+#define MAX_NUM_OUTPUTS_CALL 6
+
+static const uint64_t gf_table_gfni[256] = {
+ 0x0000000000000000, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0,
+ 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0,
+ 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0,
+ 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0,
+ 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448,
+ 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468,
+ 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58,
+ 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78,
+ 0x8810a8d83871e2c4, 0x8912acd02851a244, 0x08112a5cb061c284, 0x09132e54a0418204,
+ 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224,
+ 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14,
+ 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34,
+ 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c,
+ 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac,
+ 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c,
+ 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc,
+ 0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122,
+ 0x840895aed8b061c2, 0x850a91a6c8902142, 0x0409172a50a04182, 0x050b132240800102,
+ 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932,
+ 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912,
+ 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa,
+ 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a,
+ 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba,
+ 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a,
+ 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 0xcd9bfa38bc79f3e6,
+ 0x0c183d76e0c18306, 0x0d1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6,
+ 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6,
+ 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6,
+ 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e,
+ 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e,
+ 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e,
+ 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e,
+ 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1,
+ 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891,
+ 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1,
+ 0x82048b95a850a041, 0x83068f9db870e0c1, 0x0205091120408001, 0x03070d193060c081,
+ 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39,
+ 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19,
+ 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429,
+ 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409,
+ 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75,
+ 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55,
+ 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265,
+ 0x0a14234d90214285, 0x0b16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245,
+ 0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd,
+ 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd,
+ 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed,
+ 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd,
+ 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953,
+ 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973,
+ 0x060c1e3b70e0c183, 0x070e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143,
+ 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163,
+ 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb,
+ 0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb,
+ 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb,
+ 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb,
+ 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97,
+ 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7,
+ 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0x0e1d3467c0810307, 0x0f1f306fd0a14387,
+ 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7,
+ 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f,
+ 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f,
+ 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f,
+ 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f
+};
+
// Global GF(256) tables
#ifndef GF_LARGE_TABLES
static const unsigned char gff_base[] = {
diff --git a/contrib/libs/isa-l/erasure_code/ec_base.patch b/contrib/libs/isa-l/erasure_code/ec_base.patch
deleted file mode 100644
index 86a927f8c3..0000000000
--- a/contrib/libs/isa-l/erasure_code/ec_base.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-47c47
-< unsigned char gf_mul_erasure(unsigned char a, unsigned char b)
----
-> unsigned char gf_mul(unsigned char a, unsigned char b)
-86c86
-< p = gf_mul_erasure(p, gen);
----
-> p = gf_mul(p, gen);
-88c88
-< gen = gf_mul_erasure(gen, 2);
----
-> gen = gf_mul(gen, 2);
-147,148c147,148
-< in_mat[i * n + j] = gf_mul_erasure(in_mat[i * n + j], temp);
-< out_mat[i * n + j] = gf_mul_erasure(out_mat[i * n + j], temp);
----
-> in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
-> out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
-157,158c157,158
-< out_mat[j * n + k] ^= gf_mul_erasure(temp, out_mat[i * n + k]);
-< in_mat[j * n + k] ^= gf_mul_erasure(temp, in_mat[i * n + k]);
----
-> out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
-> in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
-283c283
-< s ^= gf_mul_erasure(src[j][i], v[j * 32 + 1]);
----
-> s ^= gf_mul(src[j][i], v[j * 32 + 1]);
-296c296
-< s ^= gf_mul_erasure(src[i], v[vec_i * 32 + 1]);
----
-> s ^= gf_mul(src[i], v[vec_i * 32 + 1]);
-311c311
-< s ^= gf_mul_erasure(src[j][i], v[j * 32 + l * srcs * 32 + 1]);
----
-> s ^= gf_mul(src[j][i], v[j * 32 + l * srcs * 32 + 1]);
-327c327
-< s ^= gf_mul_erasure(data[i], v[vec_i * 32 + l * k * 32 + 1]);
----
-> s ^= gf_mul(data[i], v[vec_i * 32 + l * k * 32 + 1]);
-339c339
-< *dest++ = gf_mul_erasure(c, *src++);
----
-> *dest++ = gf_mul(c, *src++);
diff --git a/contrib/libs/isa-l/erasure_code/ec_base_aliases.c b/contrib/libs/isa-l/erasure_code/ec_base_aliases.c
index d046ff61ad..705dfb685c 100644
--- a/contrib/libs/isa-l/erasure_code/ec_base_aliases.c
+++ b/contrib/libs/isa-l/erasure_code/ec_base_aliases.c
@@ -56,6 +56,10 @@ void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *v
int gf_vect_mul(int len, unsigned char *a, void *src, void *dest)
{
- gf_vect_mul_base(len, a, (unsigned char *)src, (unsigned char *)dest);
- return 0;
+ return gf_vect_mul_base(len, a, (unsigned char *)src, (unsigned char *)dest);
+}
+
+/*
+ * Alias entry point: ec_init_tables() forwards its arguments unchanged to
+ * ec_init_tables_base().
+ */
+void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
+{
+	/* Plain call: C does not permit `return <expr>;` in a void function. */
+	ec_init_tables_base(k, rows, a, g_tbls);
}
diff --git a/contrib/libs/isa-l/erasure_code/ec_highlevel_func.c b/contrib/libs/isa-l/erasure_code/ec_highlevel_func.c
index c57d460a61..373cd33726 100644
--- a/contrib/libs/isa-l/erasure_code/ec_highlevel_func.c
+++ b/contrib/libs/isa-l/erasure_code/ec_highlevel_func.c
@@ -1,5 +1,5 @@
/**********************************************************************
- Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -28,6 +28,7 @@
**********************************************************************/
#include <limits.h>
#include "erasure_code.h"
+#include "ec_base.h" /* for GF tables */
#if __x86_64__ || __i386__ || _M_X64 || _M_IX86
void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
@@ -39,13 +40,19 @@ void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigne
return;
}
- while (rows >= 4) {
- gf_4vect_dot_prod_sse(len, k, g_tbls, data, coding);
- g_tbls += 4 * k * 32;
- coding += 4;
- rows -= 4;
+ while (rows >= 6) {
+ gf_6vect_dot_prod_sse(len, k, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
}
switch (rows) {
+ case 5:
+ gf_5vect_dot_prod_sse(len, k, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_dot_prod_sse(len, k, g_tbls, data, coding);
+ break;
case 3:
gf_3vect_dot_prod_sse(len, k, g_tbls, data, coding);
break;
@@ -69,13 +76,19 @@ void ec_encode_data_avx(int len, int k, int rows, unsigned char *g_tbls, unsigne
return;
}
- while (rows >= 4) {
- gf_4vect_dot_prod_avx(len, k, g_tbls, data, coding);
- g_tbls += 4 * k * 32;
- coding += 4;
- rows -= 4;
+ while (rows >= 6) {
+ gf_6vect_dot_prod_avx(len, k, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
}
switch (rows) {
+ case 5:
+ gf_5vect_dot_prod_avx(len, k, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_dot_prod_avx(len, k, g_tbls, data, coding);
+ break;
case 3:
gf_3vect_dot_prod_avx(len, k, g_tbls, data, coding);
break;
@@ -100,13 +113,19 @@ void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsign
return;
}
- while (rows >= 4) {
- gf_4vect_dot_prod_avx2(len, k, g_tbls, data, coding);
- g_tbls += 4 * k * 32;
- coding += 4;
- rows -= 4;
+ while (rows >= 6) {
+ gf_6vect_dot_prod_avx2(len, k, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
}
switch (rows) {
+ case 5:
+ gf_5vect_dot_prod_avx2(len, k, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_dot_prod_avx2(len, k, g_tbls, data, coding);
+ break;
case 3:
gf_3vect_dot_prod_avx2(len, k, g_tbls, data, coding);
break;
@@ -132,6 +151,10 @@ extern int gf_3vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls,
unsigned char **data, unsigned char **coding);
extern int gf_4vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls,
unsigned char **data, unsigned char **coding);
+extern int gf_5vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
+extern int gf_6vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
extern void gf_vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char *dest);
extern void gf_2vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
@@ -140,6 +163,10 @@ extern void gf_3vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftb
unsigned char *src, unsigned char **dest);
extern void gf_4vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_6vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
void ec_encode_data_avx512(int len, int k, int rows, unsigned char *g_tbls,
unsigned char **data, unsigned char **coding)
@@ -150,13 +177,19 @@ void ec_encode_data_avx512(int len, int k, int rows, unsigned char *g_tbls,
return;
}
- while (rows >= 4) {
- gf_4vect_dot_prod_avx512(len, k, g_tbls, data, coding);
- g_tbls += 4 * k * 32;
- coding += 4;
- rows -= 4;
+ while (rows >= 6) {
+ gf_6vect_dot_prod_avx512(len, k, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
}
switch (rows) {
+ case 5:
+ gf_5vect_dot_prod_avx512(len, k, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_dot_prod_avx512(len, k, g_tbls, data, coding);
+ break;
case 3:
gf_3vect_dot_prod_avx512(len, k, g_tbls, data, coding);
break;
@@ -179,13 +212,19 @@ void ec_encode_data_update_avx512(int len, int k, int rows, int vec_i, unsigned
return;
}
- while (rows >= 4) {
- gf_4vect_mad_avx512(len, k, vec_i, g_tbls, data, coding);
- g_tbls += 4 * k * 32;
- coding += 4;
- rows -= 4;
+ while (rows >= 6) {
+ gf_6vect_mad_avx512(len, k, vec_i, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
}
switch (rows) {
+ case 5:
+ gf_5vect_mad_avx512(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_mad_avx512(len, k, vec_i, g_tbls, data, coding);
+ break;
case 3:
gf_3vect_mad_avx512(len, k, vec_i, g_tbls, data, coding);
break;
@@ -200,6 +239,179 @@ void ec_encode_data_update_avx512(int len, int k, int rows, int vec_i, unsigned
}
}
+#if AS_FEATURE_LEVEL >= 10
+
+/*
+ * AVX512+GFNI dot-product kernels, defined outside this file.
+ * The single-vector form takes one destination buffer; the N-vector forms
+ * take an array of destination pointers.
+ */
+extern void gf_vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char *dest);
+extern void gf_2vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
+extern void gf_3vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
+extern void gf_4vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
+extern void gf_5vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
+extern void gf_6vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
+
+/* AVX512+GFNI multiply-add (update) kernels, 1 to 6 destinations. */
+extern void gf_vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_6vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+
+/*
+ * AVX2+GFNI kernels.  Only 1 to 3 destinations are declared for dot-product
+ * and 1 to 5 for multiply-add, which is why the AVX2+GFNI drivers in this
+ * file step by 3 and 5 rows respectively instead of 6.
+ */
+extern void gf_vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char *dest);
+extern void gf_2vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
+extern void gf_3vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
+ unsigned char **data, unsigned char **coding);
+extern void gf_vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+
+/*
+ * Build the GFNI coefficient table: for each of the rows * k coefficient
+ * bytes in `a` (visited in order, k per row) store the matching 64-bit
+ * gf_table_gfni entry into g_tbls.
+ */
+void ec_init_tables_gfni(int k, int rows, unsigned char *a, unsigned char *g_tbls)
+{
+	const unsigned char *coef = a;
+	uint64_t *out = (uint64_t *) g_tbls;
+	int row, col;
+
+	for (row = 0; row < rows; row++) {
+		for (col = 0; col < k; col++) {
+			/* The coefficient byte is the index into the lookup table */
+			*out = gf_table_gfni[*coef];
+			out++;
+			coef++;
+		}
+	}
+}
+
+/*
+ * Produce `rows` coded outputs with the AVX512+GFNI dot-product kernels.
+ * Outputs are generated six at a time while at least six remain; the
+ * remaining 1..5 outputs are handed to the kernel of matching width.
+ * The table pointer advances by k * 8 bytes per output.
+ */
+void ec_encode_data_avx512_gfni(int len, int k, int rows, unsigned char *g_tbls,
+				unsigned char **data, unsigned char **coding)
+{
+	/* Full groups of six outputs */
+	for (; rows >= 6; rows -= 6) {
+		gf_6vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
+		g_tbls += 6 * k * 8;
+		coding += 6;
+	}
+
+	/* Tail: rows <= 5 here; any value outside 1..5 does nothing */
+	if (rows == 5)
+		gf_5vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
+	else if (rows == 4)
+		gf_4vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
+	else if (rows == 3)
+		gf_3vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
+	else if (rows == 2)
+		gf_2vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_dot_prod_avx512_gfni(len, k, g_tbls, data, *coding);
+}
+
+/*
+ * Produce `rows` coded outputs with the AVX2+GFNI dot-product kernels.
+ * Outputs are generated three at a time while at least three remain; the
+ * remaining 1 or 2 outputs go to the narrower kernels.  The table pointer
+ * advances by k * 8 bytes per output.
+ */
+void ec_encode_data_avx2_gfni(int len, int k, int rows, unsigned char *g_tbls,
+			      unsigned char **data, unsigned char **coding)
+{
+	/* Full groups of three outputs */
+	for (; rows >= 3; rows -= 3) {
+		gf_3vect_dot_prod_avx2_gfni(len, k, g_tbls, data, coding);
+		g_tbls += 3 * k * 8;
+		coding += 3;
+	}
+
+	/* Tail: rows <= 2 here; any value other than 1 or 2 does nothing */
+	if (rows == 2)
+		gf_2vect_dot_prod_avx2_gfni(len, k, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_dot_prod_avx2_gfni(len, k, g_tbls, data, *coding);
+}
+
+/*
+ * Apply one source buffer (`data`, index vec_i) to `rows` coded outputs
+ * using the AVX512+GFNI multiply-add kernels.  Outputs are updated six at a
+ * time while at least six remain; the remaining 1..5 are handed to the
+ * kernel of matching width.  The table pointer advances by k * 8 bytes per
+ * output.
+ */
+void ec_encode_data_update_avx512_gfni(int len, int k, int rows, int vec_i,
+				       unsigned char *g_tbls, unsigned char *data,
+				       unsigned char **coding)
+{
+	/* Full groups of six outputs */
+	for (; rows >= 6; rows -= 6) {
+		gf_6vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 6 * k * 8;
+		coding += 6;
+	}
+
+	/* Tail: rows <= 5 here; any value outside 1..5 does nothing */
+	if (rows == 5)
+		gf_5vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 4)
+		gf_4vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 3)
+		gf_3vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 2)
+		gf_2vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_mad_avx512_gfni(len, k, vec_i, g_tbls, data, *coding);
+}
+
+/*
+ * Apply one source buffer (`data`, index vec_i) to `rows` coded outputs
+ * using the AVX2+GFNI multiply-add kernels.  Outputs are updated five at a
+ * time while at least five remain; the remaining 1..4 are handed to the
+ * kernel of matching width.  The table pointer advances by k * 8 bytes per
+ * output.
+ */
+void ec_encode_data_update_avx2_gfni(int len, int k, int rows, int vec_i,
+				     unsigned char *g_tbls, unsigned char *data,
+				     unsigned char **coding)
+{
+	/* Full groups of five outputs */
+	for (; rows >= 5; rows -= 5) {
+		gf_5vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 5 * k * 8;
+		coding += 5;
+	}
+
+	/* Tail: rows <= 4 here; any value outside 1..4 does nothing */
+	if (rows == 4)
+		gf_4vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 3)
+		gf_3vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 2)
+		gf_2vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_mad_avx2_gfni(len, k, vec_i, g_tbls, data, *coding);
+}
+
+#endif // AS_FEATURE_LEVEL >= 10
#endif // HAVE_AS_KNOWS_AVX512
#if __WORDSIZE == 64 || _WIN64 || __x86_64__
@@ -321,16 +533,3 @@ void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned ch
#endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
#endif //__x86_64__ || __i386__ || _M_X64 || _M_IX86
-
-struct slver {
- unsigned short snum;
- unsigned char ver;
- unsigned char core;
-};
-
-// Version info
-struct slver ec_init_tables_slver_00010068;
-struct slver ec_init_tables_slver = { 0x0068, 0x01, 0x00 };
-
-struct slver ec_encode_data_sse_slver_00020069;
-struct slver ec_encode_data_sse_slver = { 0x0069, 0x02, 0x00 };
diff --git a/contrib/libs/isa-l/erasure_code/ec_multibinary.asm b/contrib/libs/isa-l/erasure_code/ec_multibinary.asm
index a07f45d6f8..424687877d 100644
--- a/contrib/libs/isa-l/erasure_code/ec_multibinary.asm
+++ b/contrib/libs/isa-l/erasure_code/ec_multibinary.asm
@@ -53,6 +53,16 @@
extern gf_vect_mad_avx2
%endif
+%if (AS_FEATURE_LEVEL) >= 10
+ extern ec_init_tables_gfni
+ extern ec_encode_data_avx512_gfni
+ extern ec_encode_data_avx2_gfni
+ extern ec_encode_data_update_avx512_gfni
+ extern ec_encode_data_update_avx2_gfni
+%endif
+
+extern ec_init_tables_base
+
extern gf_vect_mul_base
extern ec_encode_data_base
extern ec_encode_data_update_base
@@ -71,6 +81,7 @@ mbin_interface gf_vect_dot_prod
mbin_interface gf_vect_mul
mbin_interface ec_encode_data_update
mbin_interface gf_vect_mad
+mbin_interface ec_init_tables
%ifidn __OUTPUT_FORMAT__, elf32
mbin_dispatch_init5 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2
@@ -78,18 +89,13 @@ mbin_interface gf_vect_mad
mbin_dispatch_init2 gf_vect_mul, gf_vect_mul_base
mbin_dispatch_init2 ec_encode_data_update, ec_encode_data_update_base
mbin_dispatch_init2 gf_vect_mad, gf_vect_mad_base
+ mbin_dispatch_init2 ec_init_tables, ec_init_tables_base
%else
mbin_dispatch_init5 gf_vect_mul, gf_vect_mul_base, gf_vect_mul_sse, gf_vect_mul_avx, gf_vect_mul_avx
- mbin_dispatch_init6 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2, ec_encode_data_avx512
- mbin_dispatch_init6 ec_encode_data_update, ec_encode_data_update_base, ec_encode_data_update_sse, ec_encode_data_update_avx, ec_encode_data_update_avx2, ec_encode_data_update_avx512
+ mbin_dispatch_init8 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2, ec_encode_data_avx512, ec_encode_data_avx2_gfni, ec_encode_data_avx512_gfni
+ mbin_dispatch_init8 ec_encode_data_update, ec_encode_data_update_base, ec_encode_data_update_sse, ec_encode_data_update_avx, ec_encode_data_update_avx2, ec_encode_data_update_avx512, ec_encode_data_update_avx2_gfni, ec_encode_data_update_avx512_gfni
mbin_dispatch_init6 gf_vect_mad, gf_vect_mad_base, gf_vect_mad_sse, gf_vect_mad_avx, gf_vect_mad_avx2, gf_vect_mad_avx512
mbin_dispatch_init6 gf_vect_dot_prod, gf_vect_dot_prod_base, gf_vect_dot_prod_sse, gf_vect_dot_prod_avx, gf_vect_dot_prod_avx2, gf_vect_dot_prod_avx512
+ mbin_dispatch_init8 ec_init_tables, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_gfni, ec_init_tables_gfni
%endif
-
-;;; func core, ver, snum
-slversion ec_encode_data, 00, 06, 0133
-slversion gf_vect_mul, 00, 05, 0134
-slversion ec_encode_data_update, 00, 05, 0212
-slversion gf_vect_dot_prod, 00, 05, 0138
-slversion gf_vect_mad, 00, 04, 0213
diff --git a/contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm b/contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm
index 8c2537f562..c05ff5b720 100644
--- a/contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm
+++ b/contrib/libs/isa-l/erasure_code/ec_multibinary_darwin.asm
@@ -53,6 +53,16 @@
extern _gf_vect_mad_avx2
%endif
+%if (AS_FEATURE_LEVEL) >= 10
+ extern _ec_init_tables_gfni
+ extern _ec_encode_data_avx512_gfni
+ extern _ec_encode_data_avx2_gfni
+ extern _ec_encode_data_update_avx512_gfni
+ extern _ec_encode_data_update_avx2_gfni
+%endif
+
+extern _ec_init_tables_base
+
extern _gf_vect_mul_base
extern _ec_encode_data_base
extern _ec_encode_data_update_base
@@ -71,6 +81,7 @@ mbin_interface _gf_vect_dot_prod
mbin_interface _gf_vect_mul
mbin_interface _ec_encode_data_update
mbin_interface _gf_vect_mad
+mbin_interface _ec_init_tables
%ifidn __OUTPUT_FORMAT__, elf32
mbin_dispatch_init5 _ec_encode_data, _ec_encode_data_base, _ec_encode_data_sse, _ec_encode_data_avx, _ec_encode_data_avx2
@@ -78,19 +89,13 @@ mbin_interface _gf_vect_mad
mbin_dispatch_init2 _gf_vect_mul, _gf_vect_mul_base
mbin_dispatch_init2 _ec_encode_data_update, _ec_encode_data_update_base
mbin_dispatch_init2 _gf_vect_mad, _gf_vect_mad_base
+ mbin_dispatch_init2 _ec_init_tables, _ec_init_tables_base
%else
mbin_dispatch_init5 _gf_vect_mul, _gf_vect_mul_base, _gf_vect_mul_sse, _gf_vect_mul_avx, _gf_vect_mul_avx
- mbin_dispatch_init6 _ec_encode_data, _ec_encode_data_base, _ec_encode_data_sse, _ec_encode_data_avx, _ec_encode_data_avx2, _ec_encode_data_avx512
- mbin_dispatch_init6 _ec_encode_data_update, _ec_encode_data_update_base, _ec_encode_data_update_sse, _ec_encode_data_update_avx, _ec_encode_data_update_avx2, _ec_encode_data_update_avx512
+ mbin_dispatch_init8 _ec_encode_data, _ec_encode_data_base, _ec_encode_data_sse, _ec_encode_data_avx, _ec_encode_data_avx2, _ec_encode_data_avx512, _ec_encode_data_avx2_gfni, _ec_encode_data_avx512_gfni
+ mbin_dispatch_init8 _ec_encode_data_update, _ec_encode_data_update_base, _ec_encode_data_update_sse, _ec_encode_data_update_avx, _ec_encode_data_update_avx2, _ec_encode_data_update_avx512, _ec_encode_data_update_avx2_gfni, _ec_encode_data_update_avx512_gfni
mbin_dispatch_init6 _gf_vect_mad, _gf_vect_mad_base, _gf_vect_mad_sse, _gf_vect_mad_avx, _gf_vect_mad_avx2, _gf_vect_mad_avx512
mbin_dispatch_init6 _gf_vect_dot_prod, _gf_vect_dot_prod_base, _gf_vect_dot_prod_sse, _gf_vect_dot_prod_avx, _gf_vect_dot_prod_avx2, _gf_vect_dot_prod_avx512
+ mbin_dispatch_init8 _ec_init_tables, _ec_init_tables_base, _ec_init_tables_base, _ec_init_tables_base, _ec_init_tables_base, _ec_init_tables_base, _ec_init_tables_gfni, _ec_init_tables_gfni
%endif
-
-
-;;; func core, ver, snum
-slversion ec_encode_data, 00, 06, 0133
-slversion gf_vect_mul, 00, 05, 0134
-slversion ec_encode_data_update, 00, 05, 0212
-slversion gf_vect_dot_prod, 00, 05, 0138
-slversion gf_vect_mad, 00, 04, 0213
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c b/contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c
index 9587788d86..4fca10599d 100644
--- a/contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c
+++ b/contrib/libs/isa-l/erasure_code/erasure_code_base_perf.c
@@ -30,25 +30,26 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
+#include <assert.h>
#include "erasure_code.h"
#include "test.h"
-//#define CACHED_TEST
-#ifdef CACHED_TEST
+#ifndef GT_L3_CACHE
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+#endif
+
+#if !defined(COLD_TEST) && !defined(TEST_CUSTOM)
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
# define TEST_TYPE_STR "_warm"
-#else
-# ifndef TEST_CUSTOM
+#elif defined (COLD_TEST)
// Uncached test. Pull from large mem base.
-# define TEST_SOURCES 32
-# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
-# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
-# define TEST_TYPE_STR "_cold"
-# else
-# define TEST_TYPE_STR "_cus"
-# endif
+# define TEST_SOURCES 32
+# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
+# define TEST_TYPE_STR "_cold"
+#elif defined (TEST_CUSTOM)
+# define TEST_TYPE_STR "_cus"
#endif
#define MMAX TEST_SOURCES
@@ -60,7 +61,7 @@ typedef unsigned char u8;
void ec_encode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs)
{
- ec_init_tables(k, m - k, &a[k * k], g_tbls);
+ ec_init_tables_base(k, m - k, &a[k * k], g_tbls);
ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
}
@@ -88,7 +89,7 @@ int ec_decode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs, u8 * src_in_e
c[k * i + j] = d[k * src_err_list[i] + j];
// Recover data
- ec_init_tables(k, nerrs, c, g_tbls);
+ ec_init_tables_base(k, nerrs, c, g_tbls);
ec_encode_data_base(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
return 0;
@@ -112,10 +113,8 @@ int main(int argc, char *argv[])
printf("erasure_code_base_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
- if (m > MMAX || k > KMAX || nerrs > (m - k)) {
- printf(" Input test parameter error\n");
- return -1;
- }
+ // check input parameters
+ assert(!(m > MMAX || k > KMAX || nerrs > (m - k)));
memcpy(src_err_list, err_list, nerrs);
memset(src_in_err, 0, TEST_SOURCES);
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_base_test.c b/contrib/libs/isa-l/erasure_code/erasure_code_base_test.c
index a87f33f9f4..ad48d8e448 100644
--- a/contrib/libs/isa-l/erasure_code/erasure_code_base_test.c
+++ b/contrib/libs/isa-l/erasure_code/erasure_code_base_test.c
@@ -30,10 +30,11 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
+#include <assert.h>
#include "erasure_code.h"
-// #include "types.h"
+#include "test.h"
-#define TEST_LEN 512
+#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
@@ -264,8 +265,7 @@ int main(int argc, char *argv[])
// Pick a first test
m = 9;
k = 5;
- if (m > MMAX || k > KMAX)
- return -1;
+ assert((m <= MMAX) && (k <= KMAX));
// Make random data
for (i = 0; i < k; i++)
@@ -278,7 +278,7 @@ int main(int argc, char *argv[])
gf_gen_rs_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
- ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
+ ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
@@ -304,7 +304,7 @@ int main(int argc, char *argv[])
}
// Recover data
- ec_init_tables(k, nerrs, decode_matrix, g_tbls);
+ ec_init_tables_base(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
@@ -346,7 +346,7 @@ int main(int argc, char *argv[])
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
- ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
+ ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
@@ -372,7 +372,7 @@ int main(int argc, char *argv[])
}
// Recover data
- ec_init_tables(k, nerrs, decode_matrix, g_tbls);
+ ec_init_tables_base(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
@@ -417,7 +417,7 @@ int main(int argc, char *argv[])
// Make parity vects
// Generate g_tbls from encode matrix a
- ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
+ ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
@@ -442,7 +442,7 @@ int main(int argc, char *argv[])
}
// Recover data
- ec_init_tables(k, nerrs, decode_matrix, g_tbls);
+ ec_init_tables_base(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
@@ -470,7 +470,9 @@ int main(int argc, char *argv[])
return -1;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Run tests at end of buffer for Electric Fence
@@ -500,7 +502,7 @@ int main(int argc, char *argv[])
// Make parity vects
// Generate g_tbls from encode matrix a
- ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
+ ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_base(size, k, m - k, g_tbls, efence_buffs,
@@ -526,7 +528,7 @@ int main(int argc, char *argv[])
}
// Recover data
- ec_init_tables(k, nerrs, decode_matrix, g_tbls);
+ ec_init_tables_base(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
@@ -593,7 +595,7 @@ int main(int argc, char *argv[])
// Make parity vects
// Generate g_tbls from encode matrix a
- ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
+ ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_base(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
@@ -618,7 +620,7 @@ int main(int argc, char *argv[])
}
// Recover data
- ec_init_tables(k, nerrs, decode_matrix, g_tbls);
+ ec_init_tables_base(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);
for (i = 0; i < nerrs; i++) {
@@ -681,7 +683,9 @@ int main(int argc, char *argv[])
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Test size alignment
@@ -705,7 +709,7 @@ int main(int argc, char *argv[])
// Make parity vects
// Generate g_tbls from encode matrix a
- ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
+ ec_init_tables_base(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_base(size, k, m - k, g_tbls, buffs, &buffs[k]);
@@ -729,7 +733,7 @@ int main(int argc, char *argv[])
}
// Recover data
- ec_init_tables(k, nerrs, decode_matrix, g_tbls);
+ ec_init_tables_base(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_base_test.patch b/contrib/libs/isa-l/erasure_code/erasure_code_base_test.patch
deleted file mode 100644
index 0d84217177..0000000000
--- a/contrib/libs/isa-l/erasure_code/erasure_code_base_test.patch
+++ /dev/null
@@ -1,12 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
-36c36
-< #define TEST_LEN 512
----
-> #define TEST_LEN 8192
-204c204
-< s ^= gf_mul_erasure(invert_matrix[j * k + i],
----
-> s ^= gf_mul(invert_matrix[j * k + i],
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_perf.c b/contrib/libs/isa-l/erasure_code/erasure_code_perf.c
index c4cad880f1..25c8774507 100644
--- a/contrib/libs/isa-l/erasure_code/erasure_code_perf.c
+++ b/contrib/libs/isa-l/erasure_code/erasure_code_perf.c
@@ -33,22 +33,25 @@
#include "erasure_code.h"
#include "test.h"
-//#define CACHED_TEST
-#ifdef CACHED_TEST
+#ifndef GT_L3_CACHE
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+#endif
+
+#if !defined(COLD_TEST) && !defined(TEST_CUSTOM)
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
# define TEST_TYPE_STR "_warm"
-#else
-# ifndef TEST_CUSTOM
+#elif defined (COLD_TEST)
// Uncached test. Pull from large mem base.
-# define TEST_SOURCES 32
-# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
-# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
-# define TEST_TYPE_STR "_cold"
-# else
-# define TEST_TYPE_STR "_cus"
-# endif
+# define TEST_SOURCES 32
+# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
+# define TEST_TYPE_STR "_cold"
+#elif defined (TEST_CUSTOM)
+# define TEST_TYPE_STR "_cus"
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
#endif
#define MMAX TEST_SOURCES
@@ -58,14 +61,26 @@
typedef unsigned char u8;
-void ec_encode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs)
+/* Write the command-line help text for this benchmark to stderr. */
+void usage(const char *app_name)
+{
+	/* Header line carries the program name ... */
+	fprintf(stderr, "Usage: %s [options]\n", app_name);
+
+	/* ... followed by the fixed option list. */
+	fputs(" -h Help\n"
+	      " -k <val> Number of source buffers\n"
+	      " -p <val> Number of parity buffers\n"
+	      " -e <val> Number of simulated buffers with errors (cannot be higher than p or k)\n",
+	      stderr);
+}
+
+void ec_encode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs, struct perf *start)
{
ec_init_tables(k, m - k, &a[k * k], g_tbls);
- ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
+ BENCHMARK(start, BENCHMARK_TIME,
+ ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]));
}
int ec_decode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs, u8 * src_in_err,
- u8 * src_err_list, int nerrs, u8 ** temp_buffs)
+ u8 * src_err_list, int nerrs, u8 ** temp_buffs, struct perf *start)
{
int i, j, r;
u8 b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
@@ -89,34 +104,109 @@ int ec_decode_perf(int m, int k, u8 * a, u8 * g_tbls, u8 ** buffs, u8 * src_in_e
// Recover data
ec_init_tables(k, nerrs, c, g_tbls);
- ec_encode_data(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
+ BENCHMARK(start, BENCHMARK_TIME,
+ ec_encode_data(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs));
return 0;
}
int main(int argc, char *argv[])
{
- int i, j, m, k, nerrs, check;
+ int i, j, m, k, p, nerrs, check, ret = -1;
void *buf;
- u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
+ u8 *temp_buffs[TEST_SOURCES] = { NULL };
+ u8 *buffs[TEST_SOURCES] = { NULL };
u8 a[MMAX * KMAX];
u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 src_err_list[TEST_SOURCES];
struct perf start;
- // Pick test parameters
- m = 14;
- k = 10;
+ /* Set default parameters */
+ k = 8;
+ p = 6;
nerrs = 4;
- const u8 err_list[] = { 2, 4, 5, 7 };
- printf("erasure_code_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
+	/*
+	 * Parse arguments.
+	 * -k, -p and -e each consume the next argv entry as their value, so a
+	 * value must actually be present: argv[argc] is a null pointer and
+	 * passing it to atoi() is undefined behaviour.
+	 */
+	for (i = 1; i < argc; i++) {
+		int *target;
+
+		if (strcmp(argv[i], "-h") == 0) {
+			usage(argv[0]);
+			return 0;
+		}
+
+		/* Select which parameter this option sets */
+		if (strcmp(argv[i], "-k") == 0) {
+			target = &k;
+		} else if (strcmp(argv[i], "-p") == 0) {
+			target = &p;
+		} else if (strcmp(argv[i], "-e") == 0) {
+			target = &nerrs;
+		} else {
+			/* Unknown option */
+			usage(argv[0]);
+			return -1;
+		}
+
+		/* Reject a trailing option with no value after it */
+		if (i + 1 >= argc) {
+			fprintf(stderr, "Option %s requires a value\n", argv[i]);
+			usage(argv[0]);
+			return -1;
+		}
+		*target = atoi(argv[++i]);
+	}
+
+ if (nerrs > k) {
+ printf
+ ("Number of errors (%d) cannot be higher than number of data buffers (%d)\n",
+ nerrs, k);
+ return -1;
+ }
+
+ if (k <= 0) {
+ printf("Number of source buffers (%d) must be > 0\n", k);
+ return -1;
+ }
+
+ if (p <= 0) {
+ printf("Number of parity buffers (%d) must be > 0\n", p);
+ return -1;
+ }
+
+ if (nerrs <= 0) {
+ printf("Number of errors (%d) must be > 0\n", nerrs);
+ return -1;
+ }
- if (m > MMAX || k > KMAX || nerrs > (m - k)) {
- printf(" Input test parameter error\n");
+ if (nerrs > p) {
+ printf
+ ("Number of errors (%d) cannot be higher than number of parity buffers (%d)\n",
+ nerrs, p);
return -1;
}
+ m = k + p;
+
+ if (m > MMAX) {
+ printf("Number of total buffers (data and parity) cannot be higher than %d\n",
+ MMAX);
+ return -1;
+ }
+
+ u8 *err_list = malloc((size_t)nerrs);
+ if (err_list == NULL) {
+ printf("Error allocating list of array of error indices\n");
+ return -1;
+ }
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < nerrs;) {
+ u8 next_err = rand() % k;
+ for (j = 0; j < i; j++)
+ if (next_err == err_list[j])
+ break;
+ if (j != i)
+ continue;
+ err_list[i++] = next_err;
+ }
+
+ printf("Testing with %u data buffers and %u parity buffers (num errors = %u, in [ ", k,
+ p, nerrs);
+ for (i = 0; i < nerrs; i++)
+ printf("%d ", (int)err_list[i]);
+
+ printf("])\n");
+
+ printf("erasure_code_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
+
memcpy(src_err_list, err_list, nerrs);
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0; i < nerrs; i++)
@@ -125,16 +215,16 @@ int main(int argc, char *argv[])
// Allocate the arrays
for (i = 0; i < m; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
- printf("alloc error: Fail\n");
- return -1;
+ printf("Error allocating buffers\n");
+ goto exit;
}
buffs[i] = buf;
}
- for (i = 0; i < (m - k); i++) {
+ for (i = 0; i < p; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
- printf("alloc error: Fail\n");
- return -1;
+ printf("Error allocating buffers\n");
+ goto exit;
}
temp_buffs[i] = buf;
}
@@ -147,24 +237,24 @@ int main(int argc, char *argv[])
gf_gen_rs_matrix(a, m, k);
// Start encode test
- BENCHMARK(&start, BENCHMARK_TIME, ec_encode_perf(m, k, a, g_tbls, buffs));
+ ec_encode_perf(m, k, a, g_tbls, buffs, &start);
printf("erasure_code_encode" TEST_TYPE_STR ": ");
perf_print(start, (long long)(TEST_LEN(m)) * (m));
// Start decode test
- BENCHMARK(&start, BENCHMARK_TIME, check =
- ec_decode_perf(m, k, a, g_tbls, buffs, src_in_err, src_err_list, nerrs,
- temp_buffs));
+ check = ec_decode_perf(m, k, a, g_tbls, buffs, src_in_err, src_err_list, nerrs,
+ temp_buffs, &start);
if (check == BAD_MATRIX) {
printf("BAD MATRIX\n");
- return check;
+ ret = check;
+ goto exit;
}
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
- return -1;
+ goto exit;
}
}
@@ -172,5 +262,14 @@ int main(int argc, char *argv[])
perf_print(start, (long long)(TEST_LEN(m)) * (k + nerrs));
printf("done all: Pass\n");
- return 0;
+
+ ret = 0;
+
+ exit:
+ free(err_list);
+ for (i = 0; i < TEST_SOURCES; i++) {
+ free(buffs[i]);
+ free(temp_buffs[i]);
+ }
+ return ret;
}
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_test.c b/contrib/libs/isa-l/erasure_code/erasure_code_test.c
index 16a6457e4e..f45b38a06a 100644
--- a/contrib/libs/isa-l/erasure_code/erasure_code_test.c
+++ b/contrib/libs/isa-l/erasure_code/erasure_code_test.c
@@ -30,8 +30,9 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
+#include <assert.h>
#include "erasure_code.h"
-// #include "types.h"
+#include "test.h"
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
@@ -215,13 +216,14 @@ static int gf_gen_decode_matrix(unsigned char *encode_matrix,
int main(int argc, char *argv[])
{
- int re = 0;
+ int re = -1;
int i, j, p, rtest, m, k;
int nerrs, nsrcerrs;
void *buf;
unsigned int decode_index[MMAX];
- unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
- unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
+ unsigned char *temp_buffs[TEST_SOURCES] = { NULL }, *buffs[TEST_SOURCES] = { NULL };
+ unsigned char *encode_matrix = NULL, *decode_matrix = NULL, *invert_matrix =
+ NULL, *g_tbls = NULL;
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
unsigned char *recov[TEST_SOURCES];
@@ -238,7 +240,7 @@ int main(int argc, char *argv[])
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
- return -1;
+ goto exit;
}
buffs[i] = buf;
}
@@ -246,7 +248,7 @@ int main(int argc, char *argv[])
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
- return -1;
+ goto exit;
}
temp_buffs[i] = buf;
}
@@ -260,13 +262,12 @@ int main(int argc, char *argv[])
if (encode_matrix == NULL || decode_matrix == NULL
|| invert_matrix == NULL || g_tbls == NULL) {
printf("Test failure! Error with malloc\n");
- return -1;
+ goto exit;
}
// Pick a first test
m = 9;
k = 5;
- if (m > MMAX || k > KMAX)
- return -1;
+ assert((m <= MMAX) && (k <= KMAX));
// Make random data
for (i = 0; i < k; i++)
@@ -295,7 +296,7 @@ int main(int argc, char *argv[])
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -327,15 +328,18 @@ int main(int argc, char *argv[])
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
// Pick a first test
m = 9;
k = 5;
- if (m > MMAX || k > KMAX)
- return -1;
+ if (m > MMAX || k > KMAX) {
+ re = -1;
+ goto exit;
+ }
// Make random data
for (i = 0; i < k; i++)
@@ -363,7 +367,7 @@ int main(int argc, char *argv[])
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -395,7 +399,8 @@ int main(int argc, char *argv[])
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -433,7 +438,7 @@ int main(int argc, char *argv[])
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -468,22 +473,29 @@ int main(int argc, char *argv[])
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Run tests at end of buffer for Electric Fence
k = 16;
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
- if (k > KMAX)
- return -1;
+ if (k > KMAX) {
+ re = -1;
+ goto exit;
+ }
for (rows = 1; rows <= 16; rows++) {
m = k + rows;
- if (m > MMAX)
- return -1;
+ if (m > MMAX) {
+ re = -1;
+ goto exit;
+ }
// Make random data
for (i = 0; i < k; i++)
@@ -516,7 +528,7 @@ int main(int argc, char *argv[])
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -556,7 +568,8 @@ int main(int argc, char *argv[])
dump(temp_buffs[k + i], align);
printf("orig :");
dump(efence_buffs[src_err_list[i]], align);
- return -1;
+ re = -1;
+ goto exit;
}
}
}
@@ -608,7 +621,7 @@ int main(int argc, char *argv[])
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -643,7 +656,8 @@ int main(int argc, char *argv[])
dump(ubuffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_ubuffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -656,13 +670,15 @@ int main(int argc, char *argv[])
if (memcmp(buffs[i], temp_buffs[0], offset)) {
printf("Fail rand ualign encode pad start\n");
- return -1;
+ re = -1;
+ goto exit;
}
if (memcmp
(buffs[i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign encode pad end\n");
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -671,17 +687,21 @@ int main(int argc, char *argv[])
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
printf("Fail rand ualign decode pad start\n");
- return -1;
+ re = -1;
+ goto exit;
}
if (memcmp
(temp_buffs[k + i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign decode pad end\n");
- return -1;
+ re = -1;
+ goto exit;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Test size alignment
@@ -719,7 +739,7 @@ int main(int argc, char *argv[])
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -754,11 +774,26 @@ int main(int argc, char *argv[])
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
}
printf("done EC tests: Pass\n");
- return 0;
+ re = 0;
+
+ exit:
+ for (i = 0; i < TEST_SOURCES; i++) {
+ if (buffs[i])
+ aligned_free(buffs[i]);
+ if (temp_buffs[i])
+ aligned_free(temp_buffs[i]);
+ }
+ free(encode_matrix);
+ free(decode_matrix);
+ free(invert_matrix);
+ free(g_tbls);
+
+ return re;
}
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_test.patch b/contrib/libs/isa-l/erasure_code/erasure_code_test.patch
deleted file mode 100644
index 0bf88ff23b..0000000000
--- a/contrib/libs/isa-l/erasure_code/erasure_code_test.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
-205c205
-< s ^= gf_mul_erasure(invert_matrix[j * k + i],
----
-> s ^= gf_mul(invert_matrix[j * k + i],
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c b/contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c
index 909e894149..e74a217cb3 100644
--- a/contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c
+++ b/contrib/libs/isa-l/erasure_code/erasure_code_update_perf.c
@@ -31,7 +31,6 @@
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
-#include "types.h"
#include "test.h"
//By default, test multibinary version
@@ -48,22 +47,25 @@
#define str(s) #s
#define xstr(s) str(s)
-//#define CACHED_TEST
-#ifdef CACHED_TEST
+#ifndef GT_L3_CACHE
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+#endif
+
+#if !defined(COLD_TEST) && !defined(TEST_CUSTOM)
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
# define TEST_TYPE_STR "_warm"
-#else
-# ifndef TEST_CUSTOM
+#elif defined (COLD_TEST)
// Uncached test. Pull from large mem base.
-# define TEST_SOURCES 32
-# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
-# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
-# define TEST_TYPE_STR "_cold"
-# else
-# define TEST_TYPE_STR "_cus"
-# endif
+# define TEST_SOURCES 32
+# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
+# define TEST_TYPE_STR "_cold"
+#elif defined (TEST_CUSTOM)
+# define TEST_TYPE_STR "_cus"
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
#endif
#define MMAX TEST_SOURCES
@@ -71,6 +73,17 @@
typedef unsigned char u8;
+void usage(const char *app_name)
+{
+ fprintf(stderr,
+ "Usage: %s [options]\n"
+ " -h Help\n"
+ " -k <val> Number of source buffers\n"
+ " -p <val> Number of parity buffers\n"
+ " -e <val> Number of simulated buffers with errors (cannot be higher than p or k)\n",
+ app_name);
+}
+
void dump(unsigned char *buf, int len)
{
int i;
@@ -134,29 +147,103 @@ int decode_test(int m, int k, u8 ** update_buffs, u8 ** recov, u8 * a, u8 * src_
int main(int argc, char *argv[])
{
- int i, j, check, m, k, nerrs;
+ int i, j, check, m, k, p, nerrs, ret = -1;
void *buf;
- u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
- u8 *update_buffs[TEST_SOURCES];
- u8 *perf_update_buffs[TEST_SOURCES];
+ u8 *temp_buffs[TEST_SOURCES] = { NULL };
+ u8 *buffs[TEST_SOURCES] = { NULL };
+ u8 *update_buffs[TEST_SOURCES] = { NULL };
+ u8 *perf_update_buffs[TEST_SOURCES] = { NULL };
u8 a[MMAX * KMAX];
u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
struct perf start;
- // Pick test parameters
+ /* Set default parameters */
k = 10;
- m = k + VECT;
+ p = VECT;
nerrs = VECT;
- const u8 err_list[] = { 0, 2, 4, 5, 7, 8 };
- printf(xstr(FUNCTION_UNDER_TEST) "_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
+ /* Parse arguments; an option given without its value falls through to usage() */
+ for (i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "-k") == 0 && i + 1 < argc) {
+ k = atoi(argv[++i]);
+ } else if (strcmp(argv[i], "-p") == 0 && i + 1 < argc) {
+ p = atoi(argv[++i]);
+ } else if (strcmp(argv[i], "-e") == 0 && i + 1 < argc) {
+ nerrs = atoi(argv[++i]);
+ } else if (strcmp(argv[i], "-h") == 0) {
+ usage(argv[0]);
+ return 0;
+ } else {
+ usage(argv[0]);
+ return -1;
+ }
+ }
+
+ if (nerrs > k) {
+ printf
+ ("Number of errors (%d) cannot be higher than number of data buffers (%d)\n",
+ nerrs, k);
+ return -1;
+ }
+
+ if (k <= 0) {
+ printf("Number of source buffers (%d) must be > 0\n", k);
+ return -1;
+ }
+
+ if (p <= 0) {
+ printf("Number of parity buffers (%d) must be > 0\n", p);
+ return -1;
+ }
+
+ if (nerrs > p) {
+ printf
+ ("Number of errors (%d) cannot be higher than number of parity buffers (%d)\n",
+ nerrs, p);
+ return -1;
+ }
+
+ if (nerrs <= 0) {
+ printf("Number of errors (%d) must be > 0\n", nerrs);
+ return -1;
+ }
+
+ m = k + p;
- if (m > MMAX || k > KMAX || nerrs > (m - k)) {
- printf(" Input test parameter error\n");
+ if (m > MMAX) {
+ printf("Number of total buffers (data and parity) cannot be higher than %d\n",
+ MMAX);
return -1;
}
+ u8 *err_list = malloc((size_t)nerrs);
+ if (err_list == NULL) {
+ printf("Error allocating list of array of error indices\n");
+ return -1;
+ }
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < nerrs;) {
+ u8 next_err = rand() % k;
+ for (j = 0; j < i; j++)
+ if (next_err == err_list[j])
+ break;
+ if (j != i)
+ continue;
+ err_list[i++] = next_err;
+ }
+
+ printf("Testing with %u data buffers and %u parity buffers (num errors = %u, in [ ", k,
+ p, nerrs);
+ for (i = 0; i < nerrs; i++)
+ printf("%d ", err_list[i]);
+
+ printf("])\n");
+
+ printf(xstr(FUNCTION_UNDER_TEST) "_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
+
memcpy(src_err_list, err_list, nerrs);
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0; i < nerrs; i++)
@@ -165,16 +252,16 @@ int main(int argc, char *argv[])
// Allocate the arrays
for (i = 0; i < m; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
- printf("alloc error: Fail\n");
- return -1;
+ printf("Error allocating buffers\n");
+ goto exit;
}
buffs[i] = buf;
}
for (i = 0; i < (m - k); i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
- printf("alloc error: Fail\n");
- return -1;
+ printf("Error allocating buffers\n");
+ goto exit;
}
temp_buffs[i] = buf;
memset(temp_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function
@@ -182,16 +269,16 @@ int main(int argc, char *argv[])
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
- printf("alloc error: Fail");
- return -1;
+ printf("Error allocating buffers\n");
+ goto exit;
}
update_buffs[i] = buf;
memset(update_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function
}
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
- printf("alloc error: Fail");
- return -1;
+ printf("Error allocating buffers\n");
+ goto exit;
}
perf_update_buffs[i] = buf;
memset(perf_update_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function
@@ -214,7 +301,7 @@ int main(int argc, char *argv[])
dump(update_buffs[k + i], 25);
printf("buffs%d :", i);
dump(buffs[k + i], 25);
- return -1;
+ goto exit;
}
}
@@ -263,13 +350,14 @@ int main(int argc, char *argv[])
nerrs, g_tbls, perf_update_buffs));
if (check) {
printf("BAD_MATRIX\n");
- return -1;
+ ret = check;
+ goto exit;
}
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[i], update_buffs[src_err_list[i]], TEST_LEN(m))) {
printf("Fail error recovery (%d, %d, %d) - \n", m, k, nerrs);
- return -1;
+ goto exit;
}
}
@@ -277,5 +365,16 @@ int main(int argc, char *argv[])
perf_print(start, (long long)(TEST_LEN(m)) * (k + nerrs));
printf("done all: Pass\n");
- return 0;
+
+ ret = 0;
+
+ exit:
+ free(err_list);
+ for (i = 0; i < TEST_SOURCES; i++) {
+ free(buffs[i]);
+ free(temp_buffs[i]);
+ free(update_buffs[i]);
+ free(perf_update_buffs[i]);
+ }
+ return ret;
}
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_update_test.c b/contrib/libs/isa-l/erasure_code/erasure_code_update_test.c
index 9014da7890..b13485cd72 100644
--- a/contrib/libs/isa-l/erasure_code/erasure_code_update_test.c
+++ b/contrib/libs/isa-l/erasure_code/erasure_code_update_test.c
@@ -30,8 +30,9 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
+#include <assert.h>
#include "erasure_code.h"
-// #include "types.h"
+#include "test.h"
#ifndef ALIGN_SIZE
# define ALIGN_SIZE 16
@@ -227,14 +228,15 @@ static int gf_gen_decode_matrix(unsigned char *encode_matrix,
int main(int argc, char *argv[])
{
- int re = 0;
+ int re = -1;
int i, j, p, rtest, m, k;
int nerrs, nsrcerrs;
void *buf;
unsigned int decode_index[MMAX];
- unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
- unsigned char *update_buffs[TEST_SOURCES];
- unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
+ unsigned char *temp_buffs[TEST_SOURCES] = { NULL }, *buffs[TEST_SOURCES] = { NULL };
+ unsigned char *update_buffs[TEST_SOURCES] = { NULL };
+ unsigned char *encode_matrix = NULL, *decode_matrix = NULL, *invert_matrix =
+ NULL, *g_tbls = NULL;
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
unsigned char *recov[TEST_SOURCES];
@@ -253,7 +255,7 @@ int main(int argc, char *argv[])
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
- return -1;
+ goto exit;
}
buffs[i] = buf;
}
@@ -261,7 +263,7 @@ int main(int argc, char *argv[])
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
- return -1;
+ goto exit;
}
temp_buffs[i] = buf;
memset(temp_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function
@@ -270,7 +272,7 @@ int main(int argc, char *argv[])
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
- return -1;
+ goto exit;
}
update_buffs[i] = buf;
memset(update_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function
@@ -284,13 +286,12 @@ int main(int argc, char *argv[])
if (encode_matrix == NULL || decode_matrix == NULL
|| invert_matrix == NULL || g_tbls == NULL) {
printf("Test failure! Error with malloc\n");
- return -1;
+ goto exit;
}
// Pick a first test
m = 14;
k = 10;
- if (m > MMAX || k > KMAX)
- return -1;
+ assert(!(m > MMAX || k > KMAX));
// Make random data
for (i = 0; i < k; i++) {
@@ -321,7 +322,7 @@ int main(int argc, char *argv[])
dump(update_buffs[k + i], 25);
printf("buffs%d :", i);
dump(buffs[k + i], 25);
- return -1;
+ goto exit;
}
}
@@ -335,7 +336,7 @@ int main(int argc, char *argv[])
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -367,16 +368,21 @@ int main(int argc, char *argv[])
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(update_buffs[src_err_list[i]], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
// Pick a first test
m = 7;
k = 5;
- if (m > MMAX || k > KMAX)
- return -1;
+ if (m > MMAX || k > KMAX) {
+ re = -1;
+ goto exit;
+ }
// Zero the destination buffer for update function
for (i = k; i < TEST_SOURCES; i++) {
@@ -411,7 +417,8 @@ int main(int argc, char *argv[])
dump(update_buffs[k + i], 25);
printf("buffs%d :", i);
dump(buffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -425,7 +432,7 @@ int main(int argc, char *argv[])
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -462,10 +469,13 @@ int main(int argc, char *argv[])
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(update_buffs[src_err_list[i]], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
@@ -508,7 +518,8 @@ int main(int argc, char *argv[])
dump(update_buffs[k + i], 25);
printf("buffs%d :", i);
dump(buffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -522,7 +533,7 @@ int main(int argc, char *argv[])
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -565,22 +576,29 @@ int main(int argc, char *argv[])
dump(update_buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Run tests at end of buffer for Electric Fence
k = 16;
align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
- if (k > KMAX)
- return -1;
+ if (k > KMAX) {
+ re = -1;
+ goto exit;
+ }
for (rows = 1; rows <= 16; rows++) {
m = k + rows;
- if (m > MMAX)
- return -1;
+ if (m > MMAX) {
+ re = -1;
+ goto exit;
+ }
for (i = k; i < TEST_SOURCES; i++) {
memset(buffs[i], 0, TEST_LEN);
@@ -628,7 +646,8 @@ int main(int argc, char *argv[])
dump(efence_update_buffs[k + i], 25);
printf("efence_buffs%d :", i);
dump(efence_buffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -642,7 +661,7 @@ int main(int argc, char *argv[])
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -688,11 +707,14 @@ int main(int argc, char *argv[])
dump(temp_buffs[k + i], align);
printf("orig :");
dump(efence_update_buffs[src_err_list[i]], align);
- return -1;
+ re = -1; /* keep the -1 failure code that this line replaced */
+ goto exit;
}
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
@@ -752,7 +774,8 @@ int main(int argc, char *argv[])
dump(update_ubuffs[k + i], 25);
printf("ubuffs%d :", i);
dump(ubuffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -766,7 +789,7 @@ int main(int argc, char *argv[])
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -808,7 +831,8 @@ int main(int argc, char *argv[])
dump(update_ubuffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_ubuffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -821,13 +845,15 @@ int main(int argc, char *argv[])
if (memcmp(update_buffs[i], temp_buffs[0], offset)) {
printf("Fail rand ualign encode pad start\n");
- return -1;
+ re = -1;
+ goto exit;
}
if (memcmp
(update_buffs[i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign encode pad end\n");
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -836,17 +862,21 @@ int main(int argc, char *argv[])
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
printf("Fail rand ualign decode pad start\n");
- return -1;
+ re = -1;
+ goto exit;
}
if (memcmp
(temp_buffs[k + i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign decode pad end\n");
- return -1;
+ re = -1;
+ goto exit;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Test size alignment
@@ -893,7 +923,8 @@ int main(int argc, char *argv[])
dump(update_buffs[k + i], 25);
printf("buffs%d (size=%d) :", i, size);
dump(buffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
@@ -906,7 +937,7 @@ int main(int argc, char *argv[])
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
- return -1;
+ goto exit;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
@@ -948,12 +979,30 @@ int main(int argc, char *argv[])
dump(update_buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
- return -1;
+ re = -1;
+ goto exit;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
printf("done EC tests: Pass\n");
+ re = 0;
+
+ exit:
+ for (i = 0; i < TEST_SOURCES; i++) {
+ if (buffs[i])
+ aligned_free(buffs[i]);
+ if (temp_buffs[i])
+ aligned_free(temp_buffs[i]);
+ if (update_buffs[i])
+ aligned_free(update_buffs[i]);
+ }
+ free(encode_matrix);
+ free(decode_matrix);
+ free(invert_matrix);
+ free(g_tbls);
- return 0;
+ return re; /* report the status set before 'goto exit' instead of always succeeding */
}
diff --git a/contrib/libs/isa-l/erasure_code/erasure_code_update_test.patch b/contrib/libs/isa-l/erasure_code/erasure_code_update_test.patch
deleted file mode 100644
index 3726f2d805..0000000000
--- a/contrib/libs/isa-l/erasure_code/erasure_code_update_test.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
-217c217
-< s ^= gf_mul_erasure(invert_matrix[j * k + i],
----
-> s ^= gf_mul(invert_matrix[j * k + i],
diff --git a/contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c b/contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c
index 85061484bc..18a559088d 100644
--- a/contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c
+++ b/contrib/libs/isa-l/erasure_code/gen_rs_matrix_limits.c
@@ -9,7 +9,7 @@
#define ROWS M_MAX
#define COLS K_MAX
-static inline int min(int a, int b)
+static inline uint64_t min(const uint64_t a, const uint64_t b)
{
if (a <= b)
return a;
@@ -17,10 +17,11 @@ static inline int min(int a, int b)
return b;
}
-void gen_sub_matrix(unsigned char *out_matrix, int dim, unsigned char *in_matrix, int rows,
- int cols, uint64_t row_indicator, uint64_t col_indicator)
+void gen_sub_matrix(unsigned char *out_matrix, const uint64_t dim, unsigned char *in_matrix,
+ const uint64_t rows, const uint64_t cols, const uint64_t row_indicator,
+ const uint64_t col_indicator)
{
- int i, j, r, s;
+ uint64_t i, j, r, s;
for (i = 0, r = 0; i < rows; i++) {
if (!(row_indicator & ((uint64_t) 1 << i)))
@@ -51,23 +52,23 @@ uint64_t next_subset(uint64_t * subset, uint64_t element_count, uint64_t subsize
return 0;
}
-int are_submatrices_singular(unsigned char *vmatrix, int rows, int cols)
+int are_submatrices_singular(unsigned char *vmatrix, const uint64_t rows, const uint64_t cols)
{
unsigned char matrix[COLS * COLS];
unsigned char invert_matrix[COLS * COLS];
- uint64_t row_indicator, col_indicator, subset_init, subsize;
+ uint64_t subsize;
/* Check all square subsize x subsize submatrices of the rows x cols
* vmatrix for singularity*/
for (subsize = 1; subsize <= min(rows, cols); subsize++) {
- subset_init = (1 << subsize) - 1;
- col_indicator = subset_init;
+ const uint64_t subset_init = (1ULL << subsize) - 1ULL;
+ uint64_t col_indicator = subset_init;
do {
- row_indicator = subset_init;
+ uint64_t row_indicator = subset_init;
do {
gen_sub_matrix(matrix, subsize, vmatrix, rows,
cols, row_indicator, col_indicator);
- if (gf_invert_matrix(matrix, invert_matrix, subsize))
+ if (gf_invert_matrix(matrix, invert_matrix, (int)subsize))
return 1;
} while (next_subset(&row_indicator, rows, subsize) == 0);
@@ -80,7 +81,7 @@ int are_submatrices_singular(unsigned char *vmatrix, int rows, int cols)
int main(int argc, char **argv)
{
unsigned char vmatrix[(ROWS + COLS) * COLS];
- int rows, cols;
+ uint64_t rows, cols;
if (K_MAX > MAX_CHECK) {
printf("K_MAX too large for this test\n");
@@ -108,7 +109,7 @@ int main(int argc, char **argv)
break;
}
- printf(" k = %2d, m <= %2d \n", cols, rows + cols - 1);
+ printf(" k = %2u, m <= %2u \n", (unsigned)cols, (unsigned)(rows + cols - 1));
}
return 0;
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm
index 6233d42e5d..b5dcb0e112 100644
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm
@@ -52,7 +52,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
%endmacro
@@ -84,9 +84,9 @@
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
- save_xmm128 xmm6, 0*16
- save_xmm128 xmm7, 1*16
- save_xmm128 xmm8, 2*16
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
save_reg r12, 3*16 + 0*8
save_reg r13, 3*16 + 1*8
save_reg r14, 3*16 + 2*8
@@ -127,7 +127,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -238,13 +238,9 @@ section .text
%endif
align 16
-global gf_2vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-func(gf_2vect_dot_prod_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_2vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_2vect_dot_prod_avx)
-%endif
+global gf_2vect_dot_prod_avx, function
+func(gf_2vect_dot_prod_avx)
FUNC_SAVE
SLDR len, len_m
sub len, 16
@@ -336,6 +332,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_2vect_dot_prod_avx, 02, 05, 0191
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.patch
deleted file mode 100644
index bca96af58e..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-242,246d241
-< func(gf_2vect_dot_prod_avx)
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_2vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_2vect_dot_prod_avx)
-< %endif
-247a243
-> func(gf_2vect_dot_prod_avx)
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm
index 53052d56e0..3d13300528 100644
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm
@@ -54,7 +54,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
%endmacro
@@ -131,7 +131,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -248,13 +248,9 @@ section .text
%endif
align 16
-global gf_2vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-func(gf_2vect_dot_prod_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_2vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_2vect_dot_prod_avx2)
-%endif
+global gf_2vect_dot_prod_avx2, function
+func(gf_2vect_dot_prod_avx2)
FUNC_SAVE
SLDR len, len_m
sub len, 32
@@ -353,8 +349,3 @@ func(_gf_2vect_dot_prod_avx2)
ret
endproc_frame
-
-section .data
-
-;;; func core, ver, snum
-slversion gf_2vect_dot_prod_avx2, 04, 05, 0196
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.patch
deleted file mode 100644
index cee2de5a58..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-252,256d251
-< func(gf_2vect_dot_prod_avx2)
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_2vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_2vect_dot_prod_avx2)
-< %endif
-257a253
-> func(gf_2vect_dot_prod_avx2)
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2_gfni.asm
new file mode 100644
index 0000000000..bdf03442e0
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx2_gfni.asm
@@ -0,0 +1,362 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_dot_prod_avx2_gfni(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+%include "memcpy.asm"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r12 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+
+ %define stack_size 3*8
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; reserve stack_size (3*8) bytes for the three registers spilled below
+ mov [rsp + 0*8], r12 ; r12 is aliased as tmp4
+ mov [rsp + 1*8], r13 ; r13 is aliased as tmp3 (dest2)
+ mov [rsp + 2*8], r14 ; r14 is aliased as tmp5 (dest1)
+ %endmacro
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8] ; reload the registers from the same slots FUNC_SAVE wrote
+ mov r13, [rsp + 1*8]
+ mov r14, [rsp + 2*8]
+ add rsp, stack_size ; release the spill area reserved by FUNC_SAVE
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r14 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define stack_size 7*16 + 5*8 ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + 8 + 8*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size ; alloc_stack is not defined in this file; presumably lowers rsp by stack_size and records it for unwinding - confirm in reg_sizes.asm
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6..xmm12 are the low halves of ymm6..ymm12 (xtmp2, xp1l..xp2x) used by the ENCODE macros
+ vmovdqa [rsp + 1*16], xmm7 ; aligned stores: rely on stack_size being an odd multiple of 8 (see its %define)
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ mov [rsp + 7*16 + 0*8], r12 ; r12 is aliased as arg4 (dest)
+ mov [rsp + 7*16 + 1*8], r13 ; r13 is aliased as tmp3 (dest2)
+ mov [rsp + 7*16 + 2*8], r14 ; r14 is aliased as tmp4
+ mov [rsp + 7*16 + 3*8], r15 ; r15 is aliased as arg5 (ptr)
+ mov [rsp + 7*16 + 4*8], rdi ; rdi is aliased as tmp5 (dest1)
+ end_prolog
+ mov arg4, arg(4) ; fifth argument is read from the stack slot arg(4) = [rsp + stack_size + 8 + 8*4]
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; reload xmm6..xmm12 from the slots FUNC_SAVE wrote
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ mov r12, [rsp + 7*16 + 0*8] ; general-purpose registers follow the seven 16-byte xmm slots
+ mov r13, [rsp + 7*16 + 1*8]
+ mov r14, [rsp + 7*16 + 2*8]
+ mov r15, [rsp + 7*16 + 3*8]
+ mov rdi, [rsp + 7*16 + 4*8]
+ add rsp, stack_size ; release the frame reserved by FUNC_SAVE
+ %endmacro
+%endif
+
+
+%define len arg0
+%define vec arg1
+%define mul_array arg2
+%define src arg3
+%define dest arg4
+%define ptr arg5
+%define vec_i tmp2
+%define dest2 tmp3
+%define dest1 tmp5
+%define pos rax
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use aligned load/store (non-temporal unless NO_NT_LDST is defined)
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define x0l ymm0
+%define x0h ymm1
+%define x0x ymm2
+
+%define xgft1 ymm3
+%define xgft2 ymm4
+
+%define xtmp1 ymm5
+%define xtmp2 ymm6
+
+%define xp1l ymm7
+%define xp2l ymm8
+
+%define xp1h ymm9
+%define xp2h ymm10
+
+%define xp1x ymm11
+%define xp2x ymm12
+
+%define x0 x0l
+%define xp1 xp1l
+%define xp2 xp2l
+
+default rel
+[bits 64]
+
+section .text
+
+;; ENCODE_96B_2
+;; Encodes 96 bytes (at offset pos) of all "k" sources into 2x 96 bytes (parity disks)
+;; Expects vec already scaled by 8; overwrites tmp, vec_i, ptr and ymm0-ymm12
+%macro ENCODE_96B_2 0
+ vpxor xp1l, xp1l, xp1l ; zero the three 32-byte accumulators that end up in dest1
+ vpxor xp1h, xp1h, xp1h
+ vpxor xp1x, xp1x, xp1x
+
+ vpxor xp2l, xp2l, xp2l ; zero the three 32-byte accumulators that end up in dest2
+ vpxor xp2h, xp2h, xp2h
+ vpxor xp2x, xp2x, xp2x
+ mov tmp, mul_array ; walk the table through tmp so mul_array is left unchanged
+ xor vec_i, vec_i ; vec_i = byte offset into the array of source pointers
+
+%%next_vect:
+ ;; load next source vector
+ mov ptr, [src + vec_i] ; ptr = base of the current source buffer
+ XLDR x0l, [ptr + pos] ; source bytes 0..31
+ XLDR x0h, [ptr + pos + 32] ; source bytes 32..63
+ XLDR x0x, [ptr + pos + 64] ; source bytes 64..95
+ add vec_i, 8 ; step to the next 8-byte source pointer
+
+ vbroadcastsd xgft1, [tmp] ; replicate this source's 8-byte table entry for output 1
+ vbroadcastsd xgft2, [tmp + vec] ; entry for output 2 lies vec bytes further into the table
+
+ GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l ; GF_MUL_XOR comes from gf_vect_gfni.inc (not shown); presumably xpN ^= gfmul(x0, xgftN) with xtmpN as scratch - confirm there
+ GF_MUL_XOR VEX, x0h, xgft1, xtmp1, xp1h, xgft2, xtmp2, xp2h
+ GF_MUL_XOR VEX, x0x, xgft1, xtmp1, xp1x, xgft2, xtmp2, xp2x
+ add tmp, 8 ; next source's table entry
+
+ cmp vec_i, vec
+ jl %%next_vect ; repeat until vec_i reaches vec (signed compare)
+
+ XSTR [dest1 + pos], xp1l ; write the 96 accumulated bytes of output 1
+ XSTR [dest1 + pos + 32], xp1h
+ XSTR [dest1 + pos + 64], xp1x
+ XSTR [dest2 + pos], xp2l ; write the 96 accumulated bytes of output 2
+ XSTR [dest2 + pos + 32], xp2h
+ XSTR [dest2 + pos + 64], xp2x
+%endmacro
+
+;; ENCODE_64B_2
+;; Encodes 64 bytes (at offset pos) of all "k" sources into 2x 64 bytes (parity disks)
+;; Expects vec already scaled by 8; overwrites tmp, vec_i, ptr and the ymm registers named below
+%macro ENCODE_64B_2 0
+ vpxor xp1l, xp1l, xp1l ; zero both 32-byte accumulators of output 1 ...
+ vpxor xp1h, xp1h, xp1h
+ vpxor xp2l, xp2l, xp2l ; ... and of output 2
+ vpxor xp2h, xp2h, xp2h
+ mov tmp, mul_array ; walk the table through tmp so mul_array is left unchanged
+ xor vec_i, vec_i ; vec_i = byte offset into the array of source pointers
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = base of the current source buffer
+ XLDR x0l, [ptr + pos] ;; Get next source vector low 32 bytes
+ XLDR x0h, [ptr + pos + 32] ;; Get next source vector high 32 bytes
+ add vec_i, 8 ; step to the next 8-byte source pointer
+
+ vbroadcastsd xgft1, [tmp] ; replicate this source's 8-byte table entry for output 1
+ vbroadcastsd xgft2, [tmp + vec] ; entry for output 2 lies vec bytes further into the table
+ add tmp, 8 ; next source's table entry
+
+ GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l ; GF_MUL_XOR is defined in gf_vect_gfni.inc (not shown here)
+ GF_MUL_XOR VEX, x0h, xgft1, xgft1, xp1h, xgft2, xgft2, xp2h ; xgft1/xgft2 fill the positions xtmp1/xtmp2 held above; both are reloaded next iteration
+
+ cmp vec_i, vec
+ jl %%next_vect ; repeat until vec_i reaches vec (signed compare)
+
+ XSTR [dest1 + pos], xp1l ; write the 64 accumulated bytes of output 1
+ XSTR [dest1 + pos + 32], xp1h
+ XSTR [dest2 + pos], xp2l ; write the 64 accumulated bytes of output 2
+ XSTR [dest2 + pos + 32], xp2h
+%endmacro
+
+;; ENCODE_32B_2
+;; Encodes 32 bytes (at offset pos) of all "k" sources into 2x 32 bytes (parity disks)
+;; Expects vec already scaled by 8; overwrites tmp, vec_i, ptr, x0, xgft1, xgft2, xp1, xp2
+%macro ENCODE_32B_2 0
+ vpxor xp1, xp1, xp1 ; zero the accumulator of output 1 (xp1 is an alias of xp1l)
+ vpxor xp2, xp2, xp2 ; zero the accumulator of output 2 (xp2 is an alias of xp2l)
+ mov tmp, mul_array ; walk the table through tmp so mul_array is left unchanged
+ xor vec_i, vec_i ; vec_i = byte offset into the array of source pointers
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = base of the current source buffer
+ XLDR x0, [ptr + pos] ;Get next source vector (32 bytes)
+ add vec_i, 8 ; step to the next 8-byte source pointer
+
+ vbroadcastsd xgft1, [tmp] ; replicate this source's 8-byte table entry for output 1
+ vbroadcastsd xgft2, [tmp + vec] ; entry for output 2 lies vec bytes further into the table
+ add tmp, 8 ; next source's table entry
+
+ GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2 ; GF_MUL_XOR is defined in gf_vect_gfni.inc (not shown); xgft1/xgft2 are passed twice and reloaded next iteration
+
+ cmp vec_i, vec
+ jl %%next_vect ; repeat until vec_i reaches vec (signed compare)
+
+ XSTR [dest1 + pos], xp1 ; write the 32 accumulated bytes of output 1
+ XSTR [dest2 + pos], xp2 ; write the 32 accumulated bytes of output 2
+%endmacro
+
+;; ENCODE_LT_32B_2 LEN
+;; Encodes less than 32 bytes of all "k" sources into 2 parity disks
+;; LEN = register holding the byte count; advances mul_array itself and overwrites tmp, tmp4, vec_i, ptr
+%macro ENCODE_LT_32B_2 1
+%define %%LEN %1
+
+ vpxor xp1, xp1, xp1 ; zero the accumulator of output 1
+ vpxor xp2, xp2, xp2 ; zero the accumulator of output 2
+ xor vec_i, vec_i ; vec_i = byte offset into the array of source pointers
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = base of the current source buffer
+ simd_load_avx2 x0, ptr + pos, %%LEN, tmp, tmp4 ;Get next source vector; simd_load_avx2 is not defined here (presumably memcpy.asm), tmp/tmp4 presumably scratch - confirm
+ add vec_i, 8 ; step to the next 8-byte source pointer
+
+ vbroadcastsd xgft1, [mul_array] ; table is indexed through mul_array directly because tmp is handed to simd_load_avx2
+ vbroadcastsd xgft2, [mul_array + vec] ; entry for output 2 lies vec bytes further into the table
+ add mul_array, 8 ; next source's table entry (mul_array is not restored afterwards)
+
+ GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2 ; GF_MUL_XOR is defined in gf_vect_gfni.inc (not shown here)
+
+ cmp vec_i, vec
+ jl %%next_vect ; repeat until vec_i reaches vec (signed compare)
+
+ ;Store updated encoded data
+ lea ptr, [dest1 + pos] ; ptr = address of the tail inside output 1
+ simd_store_avx2 ptr, xp1, %%LEN, tmp, tmp4 ; simd_store_avx2 is not defined here; presumably writes only LEN bytes - confirm
+
+ lea ptr, [dest2 + pos] ; ptr = address of the tail inside output 2
+ simd_store_avx2 ptr, xp2, %%LEN, tmp, tmp4
+%endmacro
+
+align 16
+global gf_2vect_dot_prod_avx2_gfni, function
+func(gf_2vect_dot_prod_avx2_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ; pos = number of bytes already encoded, used as offset into every buffer
+ shl vec, 3 ;; vec *= 8. Make vec_i count by 8 (source pointers and table entries are both stepped by 8 bytes)
+ mov dest1, [dest] ; dest is an array of two output pointers: first output ...
+ mov dest2, [dest + 8] ; ... and second output
+
+ cmp len, 96
+ jl .len_lt_96 ; fewer than 96 bytes: skip the widest loop (signed compare)
+
+.loop96: ; main loop, runs while len is at least 96
+ ENCODE_96B_2
+
+ add pos, 96 ;; advance past the 96 bytes just encoded
+ sub len, 96
+ cmp len, 96
+ jge .loop96
+
+.len_lt_96: ; len is below 96 here
+ cmp len, 64
+ jl .len_lt_64
+
+ ENCODE_64B_2
+
+ add pos, 64 ;; 64 more bytes encoded; fewer than 32 remain
+ sub len, 64
+
+.len_lt_64:
+ cmp len, 32
+ jl .len_lt_32
+
+ ENCODE_32B_2
+
+ add pos, 32 ;; 32 more bytes encoded; fewer than 32 remain
+ sub len, 32
+
+.len_lt_32:
+ cmp len, 0
+ jle .exit ; nothing left to encode
+
+ ENCODE_LT_32B_2 len ;; encode remaining bytes (between 1 and 31)
+
+.exit:
+ vzeroupper ; clear upper ymm halves before returning to the caller
+
+ FUNC_RESTORE
+ ret ; no return value is set up by this routine
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm
index 0fe2f434a1..1593c963a3 100644
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.asm
@@ -50,7 +50,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
%endmacro
@@ -73,7 +73,7 @@
%define return rax
%define PS 8
%define LOG_PS 3
- %define stack_size 9*16 + 5*8 ; must be an odd multiple of 8
+ %define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
@@ -82,16 +82,9 @@
vmovdqa [rsp + 0*16], xmm6
vmovdqa [rsp + 1*16], xmm7
vmovdqa [rsp + 2*16], xmm8
- vmovdqa [rsp + 3*16], xmm9
- vmovdqa [rsp + 4*16], xmm10
- vmovdqa [rsp + 5*16], xmm11
- vmovdqa [rsp + 6*16], xmm12
- vmovdqa [rsp + 7*16], xmm13
- vmovdqa [rsp + 8*16], xmm14
- save_reg r12, 9*16 + 0*8
- save_reg r13, 9*16 + 1*8
- save_reg r14, 9*16 + 2*8
- save_reg r15, 9*16 + 3*8
+ save_reg r12, 3*16 + 0*8
+ save_reg r13, 3*16 + 1*8
+ save_reg r15, 3*16 + 2*8
end_prolog
mov arg4, arg(4)
%endmacro
@@ -100,16 +93,9 @@
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
- vmovdqa xmm9, [rsp + 3*16]
- vmovdqa xmm10, [rsp + 4*16]
- vmovdqa xmm11, [rsp + 5*16]
- vmovdqa xmm12, [rsp + 6*16]
- vmovdqa xmm13, [rsp + 7*16]
- vmovdqa xmm14, [rsp + 8*16]
- mov r12, [rsp + 9*16 + 0*8]
- mov r13, [rsp + 9*16 + 1*8]
- mov r14, [rsp + 9*16 + 2*8]
- mov r15, [rsp + 9*16 + 3*8]
+ mov r12, [rsp + 3*16 + 0*8]
+ mov r13, [rsp + 3*16 + 1*8]
+ mov r15, [rsp + 3*16 + 2*8]
add rsp, stack_size
%endmacro
%endif
@@ -133,8 +119,8 @@
%else
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
- %define XLDR vmovdqa
- %define XSTR vmovdqa
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
@@ -160,13 +146,8 @@ default rel
section .text
align 16
-global gf_2vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
+global gf_2vect_dot_prod_avx512, function
func(gf_2vect_dot_prod_avx512)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_2vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
-func(_gf_2vect_dot_prod_avx512)
-%endif
-
FUNC_SAVE
sub len, 64
jl .return_fail
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.patch
deleted file mode 100644
index b00998d4b6..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-165,169d164
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_2vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_2vect_dot_prod_avx512)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512_gfni.asm
new file mode 100644
index 0000000000..33967b2928
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_avx512_gfni.asm
@@ -0,0 +1,209 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12 ; must be saved and restored
+
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ push r12 ; r12 is aliased as tmp3 (dest2) and is marked "must be saved and restored"
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r12 ; undo the push done by FUNC_SAVE
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r14 ; must be saved and restored
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define stack_size 3*8 ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + 8 + 8*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size ; alloc_stack is not defined in this file; presumably lowers rsp by stack_size (3*8) - confirm in reg_sizes.asm
+ mov [rsp + 0*8], r12 ; r12 is aliased as arg4 (dest1)
+ mov [rsp + 1*8], r13 ; r13 is aliased as tmp3 (dest2)
+ mov [rsp + 2*8], r14 ; r14 is aliased as arg5 (ptr)
+ end_prolog
+ mov arg4, arg(4) ; fifth argument is read from the stack slot arg(4) = [rsp + stack_size + 8 + 8*4]
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8] ; reload the registers from the same slots FUNC_SAVE wrote
+ mov r13, [rsp + 1*8]
+ mov r14, [rsp + 2*8]
+ add rsp, stack_size ; release the frame reserved by FUNC_SAVE
+ %endmacro
+%endif
+
+
+%define len arg0
+%define vec arg1
+%define mul_array arg2
+%define src arg3
+%define dest1 arg4
+%define ptr arg5
+%define vec_i tmp2
+%define dest2 tmp3
+%define pos rax
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store (non-temporal unless NO_NT_LDST is defined)
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xgft1 zmm3
+%define xgft2 zmm4
+
+%define x0 zmm0
+%define xp1 zmm1
+%define xp2 zmm2
+
+default rel
+[bits 64]
+
+section .text
+
+;; ENCODE_64B_2 [kmask]
+;; Encodes 64 bytes (at offset pos) of all "k" sources into 2x 64 bytes (parity disks)
+;; With the optional opmask register argument, the source load and both stores are byte-masked by it
+%macro ENCODE_64B_2 0-1
+%define %%KMASK %1
+
+ vpxorq xp1, xp1, xp1 ; zero the accumulator of output 1
+ vpxorq xp2, xp2, xp2 ; zero the accumulator of output 2
+ mov tmp, mul_array ; walk the table through tmp so mul_array is left unchanged
+ xor vec_i, vec_i ; vec_i = byte offset into the array of source pointers
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = base of the current source buffer
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes); merge-masked, so only bytes selected by the mask are loaded
+%else
+ XLDR x0, [ptr + pos] ;Get next source vector (64 bytes)
+%endif
+ add vec_i, 8 ; step to the next 8-byte source pointer
+
+ vbroadcastf32x2 xgft1, [tmp] ; replicate this source's 8-byte table entry for output 1
+ vbroadcastf32x2 xgft2, [tmp + vec] ; entry for output 2 lies vec bytes further into the table
+ add tmp, 8 ; next source's table entry
+
+ GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2 ; GF_MUL_XOR is defined in gf_vect_gfni.inc (not shown); presumably xpN ^= gfmul(x0, xgftN) - confirm there
+
+ cmp vec_i, vec
+ jl %%next_vect ; repeat until vec_i reaches vec (signed compare)
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 ; masked store: only the bytes selected by the mask are written
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
+%else
+ XSTR [dest1 + pos], xp1 ; write the 64 accumulated bytes of output 1
+ XSTR [dest2 + pos], xp2 ; write the 64 accumulated bytes of output 2
+%endif
+%endmacro
+
+align 16
+global gf_2vect_dot_prod_avx512_gfni, function
+func(gf_2vect_dot_prod_avx512_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ; pos = number of bytes already encoded, used as offset into every buffer
+ shl vec, 3 ;vec *= 8. Make vec_i count by 8
+ mov dest2, [dest1 + 8] ; dest1 still holds the array of output pointers here, so fetch the second one first ...
+ mov dest1, [dest1] ; ... then replace the array pointer with the first output pointer
+
+ cmp len, 64
+ jl .len_lt_64 ; fewer than 64 bytes: go straight to the masked tail (signed compare)
+
+.loop64: ; main loop, runs while len is at least 64
+
+ ENCODE_64B_2
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit ; nothing left to encode
+
+ xor tmp, tmp ; build a byte mask with the low len bits set:
+ bts tmp, len ; tmp = 1 shifted left by len (len is between 1 and 63 here)
+ dec tmp ; tmp = that value minus one
+ kmovq k1, tmp ; k1 selects the remaining len bytes
+
+ ENCODE_64B_2 k1
+
+.exit:
+ vzeroupper ; clear upper vector state before returning to the caller
+
+ FUNC_RESTORE
+ ret ; no return value is set up by this routine
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm
index ad61093471..986160204d 100644
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm
@@ -52,7 +52,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
%endmacro
@@ -127,7 +127,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -238,13 +238,9 @@ section .text
%endif
align 16
-global gf_2vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-func(gf_2vect_dot_prod_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_2vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_2vect_dot_prod_sse)
-%endif
+global gf_2vect_dot_prod_sse, function
+func(gf_2vect_dot_prod_sse)
FUNC_SAVE
SLDR len, len_m
sub len, 16
@@ -338,6 +334,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_2vect_dot_prod_sse, 00, 04, 0062
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.patch
deleted file mode 100644
index 439a2b1ac9..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-242,246d241
-< func(gf_2vect_dot_prod_sse)
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_2vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_2vect_dot_prod_sse)
-< %endif
-247a243
-> func(gf_2vect_dot_prod_sse)
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.c
deleted file mode 100644
index 406183bc30..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.c
+++ /dev/null
@@ -1,480 +0,0 @@
-/**********************************************************************
- Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-**********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> // for memset, memcmp
-#include "erasure_code.h"
-// #include "types.h"
-
-#ifndef FUNCTION_UNDER_TEST
-# define FUNCTION_UNDER_TEST gf_2vect_dot_prod_sse
-#endif
-#ifndef TEST_MIN_SIZE
-# define TEST_MIN_SIZE 16
-#endif
-
-#define str(s) #s
-#define xstr(s) str(s)
-
-#define TEST_LEN 2048
-#define TEST_SIZE (TEST_LEN/2)
-#define TEST_MEM TEST_SIZE
-#define TEST_LOOPS 1000
-#define TEST_TYPE_STR ""
-
-#ifndef TEST_SOURCES
-# define TEST_SOURCES 16
-#endif
-#ifndef RANDOMS
-# define RANDOMS 20
-#endif
-
-#ifdef EC_ALIGNED_ADDR
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 0
-# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
-#else
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 32
-# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
-#endif
-
-typedef unsigned char u8;
-
-extern void FUNCTION_UNDER_TEST(int len, int vlen, unsigned char *gftbls,
- unsigned char **src, unsigned char **dest);
-
-void dump(unsigned char *buf, int len)
-{
- int i;
- for (i = 0; i < len;) {
- printf(" %2x", 0xff & buf[i++]);
- if (i % 32 == 0)
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_matrix(unsigned char **s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", s[i][j]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_u8xu8(unsigned char *s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", 0xff & s[j + (i * m)]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-int main(int argc, char *argv[])
-{
- int i, j, rtest, srcs;
- void *buf;
- u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2 * TEST_SOURCES * 32];
- u8 *dest1, *dest2, *dest_ref1, *dest_ref2, *dest_ptrs[2];
- u8 *buffs[TEST_SOURCES];
-
- int align, size;
- unsigned char *efence_buffs[TEST_SOURCES];
- unsigned int offset;
- u8 *ubuffs[TEST_SOURCES];
- u8 *udest_ptrs[2];
-
- printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
-
- // Allocate the arrays
- for (i = 0; i < TEST_SOURCES; i++) {
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- buffs[i] = buf;
- }
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref2 = buf;
-
- dest_ptrs[0] = dest1;
- dest_ptrs[1] = dest2;
-
- // Test of all zeros
- for (i = 0; i < TEST_SOURCES; i++)
- memset(buffs[i], 0, TEST_LEN);
-
- memset(dest1, 0, TEST_LEN);
- memset(dest2, 0, TEST_LEN);
- memset(dest_ref1, 0, TEST_LEN);
- memset(dest_ref2, 0, TEST_LEN);
- memset(g1, 2, TEST_SOURCES);
- memset(g2, 1, TEST_SOURCES);
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
- dest_ref2);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
-
- putchar('.');
-
- // Rand data test
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- buffs, dest_ref2);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
-
- putchar('.');
- }
-
- // Rand data test with varied parameters
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
- for (i = 0; i < srcs; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
- dest_ref2);
-
- FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test1 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test2 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
-
- putchar('.');
- }
- }
-
- // Run tests at end of buffer for Electric Fence
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
- for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
- efence_buffs[i] = buffs[i] + TEST_LEN - size;
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- efence_buffs, dest_ref2);
-
- FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, align);
- printf("dprod_dut:");
- dump(dest1, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref2, dest2, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, align);
- printf("dprod_dut:");
- dump(dest2, align);
- return -1;
- }
-
- putchar('.');
- }
-
- // Test rand ptr alignment if available
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
- srcs = rand() % TEST_SOURCES;
- if (srcs == 0)
- continue;
-
- offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
- // Add random offsets
- for (i = 0; i < srcs; i++)
- ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- memset(dest1, 0, TEST_LEN); // zero pad to check write-over
- memset(dest2, 0, TEST_LEN);
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- ubuffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
-
- if (memcmp(dest_ref1, udest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, udest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[1], 25);
- return -1;
- }
- // Confirm that padding around dests is unchanged
- memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
- offset = udest_ptrs[0] - dest1;
-
- if (memcmp(dest1, dest_ref1, offset)) {
- printf("Fail rand ualign pad1 start\n");
- return -1;
- }
- if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad1 end\n");
- return -1;
- }
-
- offset = udest_ptrs[1] - dest2;
- if (memcmp(dest2, dest_ref1, offset)) {
- printf("Fail rand ualign pad2 start\n");
- return -1;
- }
- if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad2 end\n");
- return -1;
- }
-
- putchar('.');
- }
-
- // Test all size alignment
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
-
- for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
- srcs = TEST_SOURCES;
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
-
- if (memcmp(dest_ref1, dest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, dest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[1], 25);
- return -1;
- }
- }
-
- printf("Pass\n");
- return 0;
-
-}
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.patch
deleted file mode 100644
index 21bbfaa667..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_dot_prod_sse_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm
index 2d51dad33f..08e9a7f040 100644
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.asm
@@ -97,7 +97,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -155,13 +155,9 @@ section .text
align 16
-global gf_2vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-func(gf_2vect_mad_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_2vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_2vect_mad_avx)
-%endif
+global gf_2vect_mad_avx, function
+func(gf_2vect_mad_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -235,6 +231,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_2vect_mad_avx, 02, 01, 0204
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.patch
deleted file mode 100644
index b2bb2f2c3d..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-159,163d158
-< func(gf_2vect_mad_avx)
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_2vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_2vect_mad_avx)
-< %endif
-164a160
-> func(gf_2vect_mad_avx)
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm
index 2b0fd8ea2d..aa6a61c949 100644
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.asm
@@ -104,7 +104,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -163,13 +163,9 @@ section .text
%define xtmpd2 ymm9
align 16
-global gf_2vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-func(gf_2vect_mad_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_2vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_2vect_mad_avx2)
-%endif
+global gf_2vect_mad_avx2, function
+func(gf_2vect_mad_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
@@ -244,8 +240,3 @@ func(_gf_2vect_mad_avx2)
ret
endproc_frame
-
-section .data
-
-;;; func core, ver, snum
-slversion gf_2vect_mad_avx2, 04, 01, 0205
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.patch
deleted file mode 100644
index 6f00af6393..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-167,171d166
-< func(gf_2vect_mad_avx2)
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_2vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_2vect_mad_avx2)
-< %endif
-172a168
-> func(gf_2vect_mad_avx2)
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2_gfni.asm
new file mode 100644
index 0000000000..0445555419
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx2_gfni.asm
@@ -0,0 +1,298 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+%include "memcpy.asm"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64 integer argument registers, in call order
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11 ; scratch (caller-saved on System V)
+ %define tmp2 r10 ; scratch (caller-saved on System V)
+ %define func(x) x: endbranch ; entry label followed by endbranch (defined outside this file)
+ %define FUNC_SAVE ; empty: no callee-saved register is named on the elf64 path of this file
+ %define FUNC_RESTORE ; empty: nothing to restore
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64 register arguments 1-4
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r13 ; must be saved, loaded and restored
+ %define tmp r11
+ %define tmp2 r10
+ %define stack_size 16*9 + 3*8 ; 9 xmm slots (xmm6-xmm14) + 3 qwords (two hold r12/r13); an odd multiple of 8
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; x-th argument slot as seen after FUNC_SAVE: skips the local frame and the return address
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; rsp becomes 16-byte aligned here (entry rsp % 16 == 8), as vmovdqa requires
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6-xmm14 are callee-saved on Windows x64 and back ymm6-ymm14 used below
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ mov [rsp + 9*16 + 0*8], r12 ; r12/r13 are callee-saved and are about to carry arg4/arg5
+ mov [rsp + 9*16 + 1*8], r13
+ end_prolog
+ mov arg4, arg(4) ; 5th argument is passed on the stack
+ mov arg5, arg(5) ; 6th argument is passed on the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; mirror image of FUNC_SAVE
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ mov r12, [rsp + 9*16 + 0*8]
+ mov r13, [rsp + 9*16 + 1*8]
+ add rsp, stack_size ; release the save area
+ %endmacro
+%endif
+
+%define len arg0 ; bytes still to process; decremented as chunks are encoded
+%define vec arg1 ; multiplied by 8 on entry and used as a byte offset between the two table entries
+%define vec_i arg2 ; multiplied by 8 on entry and used as a byte offset into mul_array
+%define mul_array arg3 ; base of the coefficient table
+%define src arg4 ; base of the single source buffer
+%define dest1 arg5 ; on entry: pointer to the array of dest pointers; then the first dest buffer
+%define pos rax ; running byte offset into src/dest1/dest2
+%define dest2 mul_array ; shares arg3's register: loaded only after the table entries have been read
+%define dest3 vec_i ; not referenced in the visible code of this file
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Aligned addresses: use non-temporal load/store, or plain aligned moves when NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0l ymm0 ; source bytes [pos, pos+32)
+%define x0h ymm1 ; source bytes [pos+32, pos+64)
+%define x0x ymm2 ; source bytes [pos+64, pos+96)
+
+%define xgft1 ymm3 ; 8-byte table entry for dest1, broadcast to every qword
+%define xgft2 ymm4 ; 8-byte table entry for dest2, broadcast to every qword
+%define xd1l ymm5 ; dest1 data, same l/h/x lane split as x0*
+%define xd1h ymm6
+%define xd1x ymm7
+
+%define xd2l ymm8 ; dest2 data, same l/h/x lane split as x0*
+%define xd2h ymm9
+%define xd2x ymm10
+
+%define xret1l ymm11 ; passed to GF_MUL_XOR as the register between table and dest (presumably a temporary -- confirm in gf_vect_gfni.inc)
+%define xret1h ymm12
+%define xret2l ymm13
+%define xret2h ymm14
+
+%define x0 x0l ; single-register aliases used by the 32-byte and tail macros
+%define xd1 xd1l
+%define xd2 xd2l
+%define xret1 xret1l
+%define xret2 xret2l
+
+;; In: src/dest1/dest2 = buffer bases, pos = byte offset, xgft1/xgft2 = broadcast table entries
+;; Encodes 96 bytes of a single source into 2x 96 bytes (parity disks)
+;; Read-modify-write: both dest buffers are loaded, updated by GF_MUL_XOR and stored back
+%macro ENCODE_96B_2 0
+
+ ;Get next source vector
+ XLDR x0l, [src + pos]
+ XLDR x0h, [src + pos + 32]
+ XLDR x0x, [src + pos + 64]
+ ;Get next dest vectors
+ XLDR xd1l, [dest1 + pos]
+ XLDR xd1h, [dest1 + pos + 32]
+ XLDR xd1x, [dest1 + pos + 64]
+ XLDR xd2l, [dest2 + pos]
+ XLDR xd2h, [dest2 + pos + 32]
+ XLDR xd2x, [dest2 + pos + 64]
+
+ GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l, xgft2, xret2l, xd2l
+ GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h, xgft2, xret2h, xd2h
+ GF_MUL_XOR VEX, x0x, xgft1, xret1l, xd1x, xgft2, xret2l, xd2x ; xret1l/xret2l are passed again: no xret*x registers are defined
+
+ XSTR [dest1 + pos], xd1l
+ XSTR [dest1 + pos + 32], xd1h
+ XSTR [dest1 + pos + 64], xd1x
+ XSTR [dest2 + pos], xd2l
+ XSTR [dest2 + pos + 32], xd2h
+ XSTR [dest2 + pos + 64], xd2x
+%endmacro
+
+;; In: src/dest1/dest2 = buffer bases, pos = byte offset, xgft1/xgft2 = broadcast table entries
+;; Encodes 64 bytes of a single source into 2x 64 bytes (parity disks)
+;; Read-modify-write: both dest buffers are loaded, updated by GF_MUL_XOR and stored back
+%macro ENCODE_64B_2 0
+
+ ;Get next source vector
+ XLDR x0l, [src + pos]
+ XLDR x0h, [src + pos + 32]
+ ;Get next dest vectors
+ XLDR xd1l, [dest1 + pos]
+ XLDR xd1h, [dest1 + pos + 32]
+ XLDR xd2l, [dest2 + pos]
+ XLDR xd2h, [dest2 + pos + 32]
+
+ GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l, xgft2, xret2l, xd2l
+ GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h, xgft2, xret2h, xd2h
+
+ XSTR [dest1 + pos], xd1l
+ XSTR [dest1 + pos + 32], xd1h
+ XSTR [dest2 + pos], xd2l
+ XSTR [dest2 + pos + 32], xd2h
+%endmacro
+
+;; In: src/dest1/dest2 = buffer bases, pos = byte offset, xgft1/xgft2 = broadcast table entries
+;; Encodes 32 bytes of a single source into 2x 32 bytes (parity disks)
+;; Read-modify-write: both dest buffers are loaded, updated by GF_MUL_XOR and stored back
+%macro ENCODE_32B_2 0
+
+ ;Get next source vector
+ XLDR x0, [src + pos]
+ ;Get next dest vectors
+ XLDR xd1, [dest1 + pos]
+ XLDR xd2, [dest2 + pos]
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2
+
+ XSTR [dest1 + pos], xd1
+ XSTR [dest2 + pos], xd2
+%endmacro
+
+;; %1 = register holding the remaining byte count; simd_load_avx2/simd_store_avx2 come from memcpy.asm (not shown here)
+;; Encodes less than 32 bytes of a single source into 2x parity disks
+;; Clobbers tmp and tmp2, and overwrites dest1/dest2 with their final addresses -- must be the last encode step
+%macro ENCODE_LT_32B_2 1
+%define %%LEN %1
+
+ ;Get next source vector
+ simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2
+ ;Get next dest vectors
+ simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd2, dest2 + pos, %%LEN, tmp, tmp2
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2
+
+ lea dest1, [dest1 + pos] ; the store macro is handed a plain register, so fold pos into dest1
+ simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2
+ lea dest2, [dest2 + pos] ; same for dest2
+ simd_store_avx2 dest2, xd2, %%LEN, tmp, tmp2
+%endmacro
+
+align 16
+global gf_2vect_mad_avx2_gfni, function
+func(gf_2vect_mad_avx2_gfni)
+ FUNC_SAVE
+;; Args: len, vec, vec_i, mul_array, src, dest (dest points to two buffer pointers). Processes 96/64/32-byte chunks, then a <32-byte tail.
+ xor pos, pos ; pos = 0
+ shl vec_i, 3 ;Multiply by 8
+ shl vec, 3 ;Multiply by 8
+ lea tmp, [mul_array + vec_i] ; tmp = address of the 8-byte table entry for dest1
+ vbroadcastsd xgft1, [tmp] ; replicate that 8-byte entry into every qword of xgft1
+ vbroadcastsd xgft2, [tmp + vec] ; entry for dest2 lies vec*8 bytes further on
+ mov dest2, [dest1 + 8] ; reuse mul_array: its register now holds the second dest pointer
+ mov dest1, [dest1] ; dest1 = first dest pointer
+
+ cmp len, 96 ; signed compares throughout: a negative len falls through every check to .exit
+ jl .len_lt_96
+
+.loop96:
+ ENCODE_96B_2
+ add pos, 96 ;; loop on 96 bytes at a time
+ sub len, 96
+ cmp len, 96
+ jge .loop96
+
+.len_lt_96:
+ cmp len, 64 ; here len < 96
+ jl .len_lt_64
+ ENCODE_64B_2 ;; encode next 64 bytes
+
+ add pos, 64
+ sub len, 64
+
+.len_lt_64:
+ cmp len, 32 ; here len < 64
+ jl .len_lt_32
+
+ ENCODE_32B_2 ;; encode next 32 bytes
+
+ add pos, 32
+ sub len, 32
+
+.len_lt_32:
+ cmp len, 0 ; here len < 32
+ jle .exit
+
+ ENCODE_LT_32B_2 len ;; encode final bytes
+
+.exit:
+ vzeroupper ; clear upper ymm state before returning to the caller
+
+ FUNC_RESTORE
+ ret ; no status code is loaded into rax; it still holds pos
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm
index acb67e4334..1b76432eb7 100644
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.asm
@@ -45,7 +45,7 @@
%define tmp r11
%define tmp2 r10
%define return rax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -118,8 +118,8 @@
%else
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
- %define XLDR vmovdqa
- %define XSTR vmovdqa
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
@@ -149,13 +149,8 @@ section .text
%define xmask0f zmm14
align 16
-global gf_2vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
+global gf_2vect_mad_avx512, function
func(gf_2vect_mad_avx512)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_2vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
-func(_gf_2vect_mad_avx512)
-%endif
-
FUNC_SAVE
sub len, 64
jl .return_fail
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.patch
deleted file mode 100644
index 6b3d2e6d23..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-154,158d153
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_2vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_2vect_mad_avx512)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512_gfni.asm
new file mode 100644
index 0000000000..41343305b1
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_avx512_gfni.asm
@@ -0,0 +1,189 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64 integer argument registers, in call order
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11 ; scratch (caller-saved on System V)
+ %define tmp2 r10 ; scratch (caller-saved on System V); carries dest2 in this file
+ %define func(x) x: endbranch ; entry label followed by endbranch (defined outside this file)
+ %define FUNC_SAVE ; empty: no callee-saved register is named on the elf64 path of this file
+ %define FUNC_RESTORE ; empty: nothing to restore
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64 register arguments 1-4
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; callee-saved: saved, loaded from the stack and restored by FUNC_SAVE/FUNC_RESTORE
+ %define arg5 r13 ; callee-saved: saved, loaded from the stack and restored by FUNC_SAVE/FUNC_RESTORE
+ %define tmp r11
+ %define tmp2 r10
+ %define stack_size 16 + 3*8 ; must be an odd multiple of 8 (one xmm slot + 3 qwords, two of them used)
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; x-th argument slot as seen after FUNC_SAVE: skips the local frame and the return address
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; rsp becomes 16-byte aligned here (entry rsp % 16 == 8), as vmovdqa requires
+ vmovdqa [rsp + 16*0], xmm6 ; xmm6 is callee-saved on Windows x64 and backs zmm6 (xret2)
+ mov [rsp + 16 + 0*8], r12
+ mov [rsp + 16 + 1*8], r13
+ end_prolog
+ mov arg4, arg(4) ; 5th argument is passed on the stack
+ mov arg5, arg(5) ; 6th argument is passed on the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 16*0] ; mirror image of FUNC_SAVE
+ mov r12, [rsp + 16 + 0*8]
+ mov r13, [rsp + 16 + 1*8]
+ add rsp, stack_size ; release the save area
+ %endmacro
+%endif
+
+%define len arg0 ; bytes still to process; decremented as chunks are encoded
+%define vec arg1 ; multiplied by 8 on entry and used as a byte offset between the two table entries
+%define vec_i arg2 ; multiplied by 8 on entry and used as a byte offset into mul_array
+%define mul_array arg3 ; base of the coefficient table
+%define src arg4 ; base of the single source buffer
+%define dest1 arg5 ; on entry: pointer to the array of dest pointers; then the first dest buffer
+%define pos rax ; running byte offset into src/dest1/dest2
+%define dest2 tmp2 ; second dest buffer lives in the tmp2 scratch register
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Aligned addresses: use non-temporal load/store, or plain aligned moves when NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 zmm0 ; 64 bytes of source data
+%define xd1 zmm1 ; 64 bytes of dest1 data (loaded, updated, stored back)
+%define xd2 zmm2 ; 64 bytes of dest2 data (loaded, updated, stored back)
+%define xgft1 zmm3 ; 8-byte table entry for dest1, broadcast across the register
+%define xgft2 zmm4 ; 8-byte table entry for dest2, broadcast across the register
+%define xret1 zmm5 ; passed to GF_MUL_XOR between table and dest (presumably a temporary -- confirm in gf_vect_gfni.inc)
+%define xret2 zmm6 ; same role for dest2; xmm6 is why win64 FUNC_SAVE spills one xmm register
+
+;; Optional %1 = opmask register: when given, loads and stores touch only the bytes selected by the mask
+;; Encodes 64 bytes of a single source into 2x 64 bytes (parity disks)
+;; In: src/dest1/dest2 = buffer bases, pos = byte offset, xgft1/xgft2 = broadcast table entries
+%macro ENCODE_64B_2 0-1
+%define %%KMASK %1
+
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector
+ vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector
+ vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector
+%else
+ XLDR x0, [src + pos] ;Get next source vector
+ XLDR xd1, [dest1 + pos] ;Get next dest vector
+ XLDR xd2, [dest2 + pos] ;Get next dest vector
+%endif
+
+ GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 ; masked store: bytes outside the mask are left untouched in memory
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xd2
+%else
+ XSTR [dest1 + pos], xd1
+ XSTR [dest2 + pos], xd2
+%endif
+%endmacro
+
+align 16
+global gf_2vect_mad_avx512_gfni, function
+func(gf_2vect_mad_avx512_gfni)
+ FUNC_SAVE
+;; Args: len, vec, vec_i, mul_array, src, dest (dest points to two buffer pointers). Processes 64-byte chunks, then one masked tail.
+ xor pos, pos ; pos = 0
+ shl vec_i, 3 ;Multiply by 8
+ shl vec, 3 ;Multiply by 8
+ lea tmp, [mul_array + vec_i] ; tmp = address of the 8-byte table entry for dest1
+ vbroadcastf32x2 xgft1, [tmp] ; replicate that 8-byte entry across xgft1
+ vbroadcastf32x2 xgft2, [tmp + vec] ; entry for dest2 lies vec*8 bytes further on
+ mov dest2, [dest1 + 8] ; dest2 (tmp2) = second dest pointer
+ mov dest1, [dest1] ; dest1 = first dest pointer
+
+ cmp len, 64 ; signed compares: a negative len skips the loop and exits at the len <= 0 check
+ jl .len_lt_64
+.loop64:
+ ENCODE_64B_2
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 0 ; here len < 64
+ jle .exit
+
+ xor tmp, tmp ; build tmp = (1 << len) - 1, i.e. the low len bits set (0 < len < 64)
+ bts tmp, len
+ dec tmp
+ kmovq k1, tmp ; k1 selects the len tail bytes
+
+ ENCODE_64B_2 k1
+
+.exit:
+ vzeroupper ; clear upper vector state before returning to the caller
+
+ FUNC_RESTORE
+ ret ; no status code is loaded into rax; it still holds pos
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm
index 5bf380df14..1fa6729a6c 100644
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.asm
@@ -97,7 +97,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -154,13 +154,8 @@ section .text
align 16
-global gf_2vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_2vect_mad_sse, function
func(gf_2vect_mad_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_2vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_2vect_mad_sse)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -239,6 +234,3 @@ align 16
mask0f:
dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_2vect_mad_sse, 00, 01, 0203
diff --git a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.patch
deleted file mode 100644
index 1d9e040742..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_2vect_mad_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-159,163d158
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_2vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_2vect_mad_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm
index a2619507b7..7676c56229 100644
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm
@@ -52,7 +52,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -87,12 +87,12 @@
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
- save_xmm128 xmm6, 0*16
- save_xmm128 xmm7, 1*16
- save_xmm128 xmm8, 2*16
- save_xmm128 xmm9, 3*16
- save_xmm128 xmm10, 4*16
- save_xmm128 xmm11, 5*16
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
save_reg r12, 6*16 + 0*8
save_reg r13, 6*16 + 1*8
save_reg r14, 6*16 + 2*8
@@ -139,7 +139,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -261,13 +261,8 @@ section .text
%endif
align 16
-global gf_3vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_3vect_dot_prod_avx, function
func(gf_3vect_dot_prod_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_3vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_3vect_dot_prod_avx)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 16
@@ -377,6 +372,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_3vect_dot_prod_avx, 02, 05, 0192
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.patch
deleted file mode 100644
index 8689356763..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-266,270d265
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_3vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_3vect_dot_prod_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm
index 26b6b82e21..d06ccc30d2 100644
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm
@@ -54,7 +54,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -143,7 +143,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -269,13 +269,8 @@ section .text
%endif
align 16
-global gf_3vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_3vect_dot_prod_avx2, function
func(gf_3vect_dot_prod_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_3vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_3vect_dot_prod_avx2)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 32
@@ -395,8 +390,3 @@ func(_gf_3vect_dot_prod_avx2)
ret
endproc_frame
-
-section .data
-
-;;; func core, ver, snum
-slversion gf_3vect_dot_prod_avx2, 04, 05, 0197
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.patch
deleted file mode 100644
index 9c59162877..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-274,278d273
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_3vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_3vect_dot_prod_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2_gfni.asm
new file mode 100644
index 0000000000..76a19763a3
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx2_gfni.asm
@@ -0,0 +1,335 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_dot_prod_avx2_gfni(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+%include "memcpy.asm"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64 integer argument registers, in call order
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9 ; the documented signature has five arguments; this register is used as scratch (ptr)
+
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r12 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+
+ %define stack_size 4*8 ; four qword slots, one per callee-saved register r12-r15
+ %define func(x) x: endbranch ; entry label followed by endbranch (defined outside this file)
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; spill r12-r15 with sub/mov instead of push
+ mov [rsp + 0*8], r12
+ mov [rsp + 1*8], r13
+ mov [rsp + 2*8], r14
+ mov [rsp + 3*8], r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8] ; mirror image of FUNC_SAVE
+ mov r13, [rsp + 1*8]
+ mov r14, [rsp + 2*8]
+ mov r15, [rsp + 3*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64 register arguments 1-4
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored (never loaded: used only as the ptr scratch register)
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r14 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define stack_size 8*16 + 7*8 ; must be an odd multiple of 8 (8 xmm slots + 7 qwords, six of them used)
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; x-th argument slot as seen after FUNC_SAVE: skips the local frame and the return address
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6-xmm13 are callee-saved on Windows x64 and back ymm6-ymm13 used below
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ mov [rsp + 8*16 + 0*8], r12
+ mov [rsp + 8*16 + 1*8], r13
+ mov [rsp + 8*16 + 2*8], r14
+ mov [rsp + 8*16 + 3*8], r15
+ mov [rsp + 8*16 + 4*8], rdi ; rdi/rsi are callee-saved on Windows x64
+ mov [rsp + 8*16 + 5*8], rsi
+ end_prolog
+ mov arg4, arg(4) ; 5th argument is passed on the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; mirror image of FUNC_SAVE
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ mov r12, [rsp + 8*16 + 0*8]
+ mov r13, [rsp + 8*16 + 1*8]
+ mov r14, [rsp + 8*16 + 2*8]
+ mov r15, [rsp + 8*16 + 3*8]
+ mov rdi, [rsp + 8*16 + 4*8]
+ mov rsi, [rsp + 8*16 + 5*8]
+ add rsp, stack_size ; release the save area
+ %endmacro
+%endif
+
+
+%define len arg0 ; bytes still to process; decremented as chunks are encoded
+%define vec arg1 ; multiplied by 8 on entry: loop bound for vec_i and byte stride between table rows
+%define mul_array arg2 ; base of the coefficient table
+%define src arg3 ; array of source buffer pointers, indexed by vec_i
+%define dest arg4 ; array of three dest buffer pointers
+%define ptr arg5 ; current source buffer, or a store address in the tail macro
+%define vec_i tmp2 ; byte index into src, stepping by 8
+%define dest2 tmp3
+%define dest3 tmp4
+%define dest1 tmp5
+%define pos rax ; running byte offset into the source and dest buffers
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Aligned addresses: use non-temporal load/store, or plain aligned moves when NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define x0l ymm0 ; source bytes [pos, pos+32)
+%define x0h ymm1 ; source bytes [pos+32, pos+64)
+
+%define xgft1 ymm8 ; 8-byte table entries for dest1..dest3, broadcast to every qword
+%define xgft2 ymm9
+%define xgft3 ymm10
+
+%define xtmp1 ymm11 ; passed to GF_MUL_XOR between table and accumulator (presumably temporaries -- confirm in gf_vect_gfni.inc)
+%define xtmp2 ymm12
+%define xtmp3 ymm13
+
+%define xp1l ymm2 ; accumulators for dest1..dest3, low 32 bytes
+%define xp2l ymm3
+%define xp3l ymm4
+%define xp1h ymm5 ; accumulators for dest1..dest3, high 32 bytes
+%define xp2h ymm6
+%define xp3h ymm7
+
+%define x0 x0l ; single-register aliases used by the 32-byte and tail macros
+%define xp1 xp1l
+%define xp2 xp2l
+%define xp3 xp3l
+
+default rel
+[bits 64]
+
+section .text
+
+;; In: src = source pointer array, vec = 8 * source count, mul_array = table base, dest1..dest3, pos = byte offset
+;; Encodes 64 bytes of all "k" sources into 3x 64 bytes (parity disks)
+;; Clobbers tmp, vec_i and ptr; the accumulators start at zero, so the dest buffers are overwritten rather than updated
+%macro ENCODE_64B_3 0
+ vpxor xp1l, xp1l, xp1l
+ vpxor xp1h, xp1h, xp1h
+ vpxor xp2l, xp2l, xp2l
+ vpxor xp2h, xp2h, xp2h
+ vpxor xp3l, xp3l, xp3l
+ vpxor xp3h, xp3h, xp3h
+ mov tmp, mul_array ; tmp walks the table so mul_array stays intact for the next chunk
+ xor vec_i, vec_i
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = next source buffer
+ XLDR x0l, [ptr + pos] ;; Get next source vector low 32 bytes
+ XLDR x0h, [ptr + pos + 32] ;; Get next source vector high 32 bytes
+ add vec_i, 8
+
+ vbroadcastsd xgft1, [tmp] ; table rows for dest1/dest2/dest3 are vec bytes apart
+ vbroadcastsd xgft2, [tmp + vec]
+ vbroadcastsd xgft3, [tmp + vec*2]
+ add tmp, 8
+
+ GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l, xgft3, xtmp3, xp3l
+ GF_MUL_XOR VEX, x0h, xgft1, xgft1, xp1h, xgft2, xgft2, xp2h, xgft3, xgft3, xp3h ; xgft* double as the temporaries: they are reloaded at the top of every iteration
+
+ cmp vec_i, vec
+ jl %%next_vect
+
+ XSTR [dest1 + pos], xp1l
+ XSTR [dest1 + pos + 32], xp1h
+ XSTR [dest2 + pos], xp2l
+ XSTR [dest2 + pos + 32], xp2h
+ XSTR [dest3 + pos], xp3l
+ XSTR [dest3 + pos + 32], xp3h
+%endmacro
+
+;; In: src = source pointer array, vec = 8 * source count, mul_array = table base, dest1..dest3, pos = byte offset
+;; Encodes 32 bytes of all "k" sources into 3x 32 bytes (parity disks)
+;; Clobbers tmp, vec_i and ptr; the accumulators start at zero, so the dest buffers are overwritten rather than updated
+%macro ENCODE_32B_3 0
+ vpxor xp1, xp1, xp1
+ vpxor xp2, xp2, xp2
+ vpxor xp3, xp3, xp3
+ mov tmp, mul_array ; tmp walks the table so mul_array stays intact for the tail
+ xor vec_i, vec_i
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = next source buffer
+ XLDR x0, [ptr + pos] ;Get next source vector (32 bytes)
+ add vec_i, 8
+
+ vbroadcastsd xgft1, [tmp] ; table rows for dest1/dest2/dest3 are vec bytes apart
+ vbroadcastsd xgft2, [tmp + vec]
+ vbroadcastsd xgft3, [tmp + vec*2]
+ add tmp, 8
+
+ GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3 ; xgft* double as the temporaries: they are reloaded every iteration
+
+ cmp vec_i, vec
+ jl %%next_vect
+
+ XSTR [dest1 + pos], xp1
+ XSTR [dest2 + pos], xp2
+ XSTR [dest3 + pos], xp3
+%endmacro
+
+;; %1 = register holding the remaining byte count; simd_load_avx2/simd_store_avx2 come from memcpy.asm (not shown here)
+;; Encodes less than 32 bytes of all "k" sources into 3 parity disks
+;; Clobbers tmp, tmp6, vec_i, ptr and advances mul_array itself -- must be the last encode step
+%macro ENCODE_LT_32B_3 1
+%define %%LEN %1
+
+ vpxor xp1, xp1, xp1
+ vpxor xp2, xp2, xp2
+ vpxor xp3, xp3, xp3
+ xor vec_i, vec_i
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = next source buffer
+ simd_load_avx2 x0, ptr + pos, %%LEN, tmp, tmp6 ;Get next source vector
+ add vec_i, 8
+
+ vbroadcastsd xgft1, [mul_array] ; tmp is handed to simd_load_avx2 here, so mul_array is walked directly
+ vbroadcastsd xgft2, [mul_array + vec]
+ vbroadcastsd xgft3, [mul_array + vec*2]
+ add mul_array, 8
+
+ GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3
+
+ cmp vec_i, vec
+ jl %%next_vect
+
+ ;Store updated encoded data
+ lea ptr, [dest1 + pos]
+ simd_store_avx2 ptr, xp1, %%LEN, tmp, vec_i ; the loop is finished, so vec_i is passed in place of tmp6
+
+ lea ptr, [dest2 + pos]
+ simd_store_avx2 ptr, xp2, %%LEN, tmp, vec_i
+
+ lea ptr, [dest3 + pos]
+ simd_store_avx2 ptr, xp3, %%LEN, tmp, vec_i
+%endmacro
+
+align 16
+global gf_3vect_dot_prod_avx2_gfni, function
+func(gf_3vect_dot_prod_avx2_gfni)
+ FUNC_SAVE
+;; Args: len, vec, g_tbls, buffs, dests. Processes 64-byte chunks, then one 32-byte chunk, then a <32-byte tail.
+ xor pos, pos ; pos = 0
+ shl vec, 3 ;; vec *= 8. Make vec_i count by 8
+ mov dest1, [dest] ; unpack the three dest buffer pointers
+ mov dest2, [dest + 8]
+ mov dest3, [dest + 2*8]
+
+ cmp len, 64 ; signed compares throughout: a negative len falls through every check to .exit
+ jl .len_lt_64
+
+.loop64:
+ ENCODE_64B_3
+
+ add pos, 64 ;; Loop on 64 bytes at a time first
+ sub len, 64
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 32 ; here len < 64
+ jl .len_lt_32
+
+ ENCODE_32B_3 ;; encode next 32 bytes
+
+ add pos, 32 ;; advance past the 32 bytes just encoded
+ sub len, 32
+
+.len_lt_32:
+ cmp len, 0 ; here len < 32
+ jle .exit
+
+ ENCODE_LT_32B_3 len ;; encode final bytes
+
+.exit:
+ vzeroupper ; clear upper ymm state before returning to the caller
+
+ FUNC_RESTORE
+ ret ; no status code is loaded into rax; it still holds pos
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm
index 16a90eb2af..fcd919367d 100644
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.asm
@@ -44,8 +44,6 @@
%define arg5 r9
%define tmp r11
- %define tmp.w r11d
- %define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
@@ -53,7 +51,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -73,15 +71,13 @@
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
- %define tmp.w r11d
- %define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
- %define stack_size 9*16 + 5*8 ; must be an odd multiple of 8
+ %define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
@@ -93,13 +89,10 @@
vmovdqa [rsp + 3*16], xmm9
vmovdqa [rsp + 4*16], xmm10
vmovdqa [rsp + 5*16], xmm11
- vmovdqa [rsp + 6*16], xmm12
- vmovdqa [rsp + 7*16], xmm13
- vmovdqa [rsp + 8*16], xmm14
- save_reg r12, 9*16 + 0*8
- save_reg r13, 9*16 + 1*8
- save_reg r14, 9*16 + 2*8
- save_reg r15, 9*16 + 3*8
+ save_reg r12, 6*16 + 0*8
+ save_reg r13, 6*16 + 1*8
+ save_reg r14, 6*16 + 2*8
+ save_reg r15, 6*16 + 3*8
end_prolog
mov arg4, arg(4)
%endmacro
@@ -111,13 +104,10 @@
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
- vmovdqa xmm12, [rsp + 6*16]
- vmovdqa xmm13, [rsp + 7*16]
- vmovdqa xmm14, [rsp + 8*16]
- mov r12, [rsp + 9*16 + 0*8]
- mov r13, [rsp + 9*16 + 1*8]
- mov r14, [rsp + 9*16 + 2*8]
- mov r15, [rsp + 9*16 + 3*8]
+ mov r12, [rsp + 6*16 + 0*8]
+ mov r13, [rsp + 6*16 + 1*8]
+ mov r14, [rsp + 6*16 + 2*8]
+ mov r15, [rsp + 6*16 + 3*8]
add rsp, stack_size
%endmacro
%endif
@@ -142,8 +132,8 @@
%else
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
- %define XLDR vmovdqa
- %define XSTR vmovdqa
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
@@ -173,13 +163,8 @@ default rel
section .text
align 16
-global gf_3vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
+global gf_3vect_dot_prod_avx512, function
func(gf_3vect_dot_prod_avx512)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_3vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
-func(_gf_3vect_dot_prod_avx512)
-%endif
-
FUNC_SAVE
sub len, 64
jl .return_fail
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.patch
deleted file mode 100644
index 8397eb6861..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-178,182d177
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_3vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_3vect_dot_prod_avx512)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512_gfni.asm
new file mode 100644
index 0000000000..39ee6382a2
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_avx512_gfni.asm
@@ -0,0 +1,225 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64: integer arguments arrive in rdi, rsi, rdx, rcx, r8, r9
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9 ; the function takes five arguments, so this register is free for scratch use
+
+ %define tmp r11 ; volatile scratch
+ %define tmp2 r10 ; volatile scratch
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r12 ; must be saved and restored
+
+ %define func(x) x: endbranch ; entry label followed by the endbranch macro (presumably a CET landing pad -- confirm in reg_sizes.asm)
+ %macro FUNC_SAVE 0
+ push r12 ; preserve the two callee-saved registers used as tmp4 / tmp3
+ push r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r13 ; restore in reverse push order
+ pop r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64: first four integer arguments arrive in rcx, rdx, r8, r9
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+ %define tmp r11 ; volatile scratch
+ %define tmp2 r10 ; volatile scratch
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r14 ; must be saved and restored
+ %define stack_size 1*16 + 5*8 ; must be an odd multiple of 8; one 16-byte slot for xmm6 plus five 8-byte slots (four hold r12-r15)
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; caller stack slot of argument x: above our frame and the 8-byte return address
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6 is callee-saved on Win64 and is used as xgft3 (zmm6)
+ mov [rsp + 1*16 + 0*8], r12 ; NOTE(review): plain mov rather than save_reg, so presumably no unwind info is emitted for r12-r15 -- confirm this is intended
+ mov [rsp + 1*16 + 1*8], r13
+ mov [rsp + 1*16 + 2*8], r14
+ mov [rsp + 1*16 + 3*8], r15
+ end_prolog
+ mov arg4, arg(4) ; fifth argument is passed on the stack; load it into r12
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; reload everything FUNC_SAVE stored, from the same slots
+ mov r12, [rsp + 1*16 + 0*8]
+ mov r13, [rsp + 1*16 + 1*8]
+ mov r14, [rsp + 1*16 + 2*8]
+ mov r15, [rsp + 1*16 + 3*8]
+ add rsp, stack_size ; release the frame allocated by alloc_stack
+ %endmacro
+%endif
+
+
+%define len arg0 ; bytes still to process in each buffer
+%define vec arg1 ; number of source vectors; scaled by 8 at function entry
+%define mul_array arg2 ; g_tbls coefficient table
+%define src arg3 ; array of source buffer pointers
+%define dest1 arg4 ; array of destination pointers on entry, then the first destination buffer
+%define ptr arg5 ; scratch: current source buffer pointer
+%define vec_i tmp2 ; byte offset into the src pointer array
+%define dest2 tmp3 ; second destination buffer
+%define dest3 tmp4 ; third destination buffer
+%define pos rax ; byte offset into every source / destination buffer
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use unaligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Aligned addresses: use non-temporal load/store unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xgft1 zmm4 ; broadcast coefficient entry for output 1
+%define xgft2 zmm5 ; broadcast coefficient entry for output 2
+%define xgft3 zmm6 ; broadcast coefficient entry for output 3
+
+%define x0 zmm0 ; current 64 bytes of source data
+%define xp1 zmm1 ; accumulator stored to dest1
+%define xp2 zmm2 ; accumulator stored to dest2
+%define xp3 zmm3 ; accumulator stored to dest3
+
+default rel
+[bits 64]
+
+section .text
+
+;; ENCODE_64B_3 [kmask]
+;; Encodes 64 bytes of all "k" sources into 3x 64 bytes (parity disks) at offset pos; with an opmask argument, loads and stores are byte-masked by it
+;; Scratch: tmp, vec_i, ptr, x0, xp1-xp3, xgft1-xgft3. Expects vec to be already scaled by 8.
+%macro ENCODE_64B_3 0-1
+%define %%KMASK %1
+
+ vpxorq xp1, xp1, xp1 ; zero the three output accumulators
+ vpxorq xp2, xp2, xp2
+ vpxorq xp3, xp3, xp3
+ mov tmp, mul_array ; tmp walks the coefficient table, 8 bytes per source
+ xor vec_i, vec_i ; start at the first source pointer
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = base of the current source buffer
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes)
+%else
+ XLDR x0, [ptr + pos] ;Get next source vector (64 bytes)
+%endif
+ add vec_i, 8 ; next entry in the src pointer array
+
+ vbroadcastf32x2 xgft1, [tmp] ; replicate this source's 8-byte coefficient entry for output 1
+ vbroadcastf32x2 xgft2, [tmp + vec] ; entry for output 2, vec (already *8) bytes further on
+ vbroadcastf32x2 xgft3, [tmp + vec*2] ; entry for output 3
+ add tmp, 8 ; advance to the next source's coefficient entry
+
+ GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3 ; presumably multiplies x0 by each entry and XORs into xp1..xp3 -- see gf_vect_gfni.inc
+
+ cmp vec_i, vec
+ jl %%next_vect ; loop until every source vector has been consumed
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 ; masked stores: only bytes selected by the mask are written
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
+ vmovdqu8 [dest3 + pos]{%%KMASK}, xp3
+%else
+ XSTR [dest1 + pos], xp1 ; full 64-byte stores
+ XSTR [dest2 + pos], xp2
+ XSTR [dest3 + pos], xp3
+%endif
+%endmacro
+
+align 16
+global gf_3vect_dot_prod_avx512_gfni, function
+func(gf_3vect_dot_prod_avx512_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ;pos = running byte offset, starts at 0
+ shl vec, 3 ;vec *= 8. Make vec_i count by 8
+ mov dest2, [dest1 + 8] ;dest1 still holds the dests array here: fetch entries 1 and 2 first
+ mov dest3, [dest1 + 2*8]
+ mov dest1, [dest1] ;loaded last because it overwrites the array base
+
+ cmp len, 64
+ jl .len_lt_64 ;fewer than 64 bytes in total: only the masked tail runs
+
+.loop64:
+
+ ENCODE_64B_3
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64 ;len = bytes still to process
+ cmp len, 64
+ jge .loop64 ;keep looping while a full 64-byte chunk remains
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit ;nothing left to encode
+
+ xor tmp, tmp ;build k1 = (1 << len) - 1, one mask bit per remaining byte
+ bts tmp, len ;here 0 < len < 64, so the bit index is in range
+ dec tmp
+ kmovq k1, tmp
+
+ ENCODE_64B_3 k1
+
+.exit:
+ vzeroupper ;clear upper vector state before returning to the caller
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm
index 582fac8481..af0875016c 100644
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm
@@ -52,7 +52,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -139,7 +139,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -261,13 +261,8 @@ section .text
%endif
align 16
-global gf_3vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_3vect_dot_prod_sse, function
func(gf_3vect_dot_prod_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_3vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_3vect_dot_prod_sse)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 16
@@ -378,6 +373,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_3vect_dot_prod_sse, 00, 06, 0063
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.patch
deleted file mode 100644
index f21ce0ff9c..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-266,270d265
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_3vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_3vect_dot_prod_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.c
deleted file mode 100644
index b2c19382ff..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.c
+++ /dev/null
@@ -1,586 +0,0 @@
-/**********************************************************************
- Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-**********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> // for memset, memcmp
-#include "erasure_code.h"
-// #include "types.h"
-
-#ifndef FUNCTION_UNDER_TEST
-# define FUNCTION_UNDER_TEST gf_3vect_dot_prod_sse
-#endif
-#ifndef TEST_MIN_SIZE
-# define TEST_MIN_SIZE 16
-#endif
-
-#define str(s) #s
-#define xstr(s) str(s)
-
-#define TEST_LEN 2048
-#define TEST_SIZE (TEST_LEN/2)
-#define TEST_MEM TEST_SIZE
-#define TEST_LOOPS 1000
-#define TEST_TYPE_STR ""
-
-#ifndef TEST_SOURCES
-# define TEST_SOURCES 16
-#endif
-#ifndef RANDOMS
-# define RANDOMS 20
-#endif
-
-#ifdef EC_ALIGNED_ADDR
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 0
-# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
-#else
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 32
-# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
-#endif
-
-typedef unsigned char u8;
-
-extern void FUNCTION_UNDER_TEST(int len, int vlen, unsigned char *gftbls,
- unsigned char **src, unsigned char **dest);
-
-void dump(unsigned char *buf, int len)
-{
- int i;
- for (i = 0; i < len;) {
- printf(" %2x", 0xff & buf[i++]);
- if (i % 32 == 0)
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_matrix(unsigned char **s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", s[i][j]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_u8xu8(unsigned char *s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", 0xff & s[j + (i * m)]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-int main(int argc, char *argv[])
-{
- int i, j, rtest, srcs;
- void *buf;
- u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
- u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES];
- u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3;
-
- int align, size;
- unsigned char *efence_buffs[TEST_SOURCES];
- unsigned int offset;
- u8 *ubuffs[TEST_SOURCES];
- u8 *udest_ptrs[3];
- printf(xstr(FUNCTION_UNDER_TEST) "_test: %dx%d ", TEST_SOURCES, TEST_LEN);
-
- // Allocate the arrays
- for (i = 0; i < TEST_SOURCES; i++) {
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- buffs[i] = buf;
- }
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest3 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");;
- return -1;
- }
- dest_ref2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref3 = buf;
-
- dest_ptrs[0] = dest1;
- dest_ptrs[1] = dest2;
- dest_ptrs[2] = dest3;
-
- // Test of all zeros
- for (i = 0; i < TEST_SOURCES; i++)
- memset(buffs[i], 0, TEST_LEN);
-
- memset(dest1, 0, TEST_LEN);
- memset(dest2, 0, TEST_LEN);
- memset(dest3, 0, TEST_LEN);
- memset(dest_ref1, 0, TEST_LEN);
- memset(dest_ref2, 0, TEST_LEN);
- memset(dest_ref3, 0, TEST_LEN);
- memset(g1, 2, TEST_SOURCES);
- memset(g2, 1, TEST_SOURCES);
- memset(g3, 7, TEST_SOURCES);
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
- dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
- dest_ref3);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail zero" xstr(FUNCTION_UNDER_TEST) " test1\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
-
- putchar('.');
-
- // Rand data test
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- buffs, dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
- buffs, dest_ref3);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
-
- putchar('.');
- }
-
- // Rand data test with varied parameters
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
- for (i = 0; i < srcs; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
- dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
- dest_ref3);
-
- FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test1 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test2 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test3 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
-
- putchar('.');
- }
- }
-
- // Run tests at end of buffer for Electric Fence
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
- for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
- efence_buffs[i] = buffs[i] + TEST_LEN - size;
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- efence_buffs, dest_ref2);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
- efence_buffs, dest_ref3);
-
- FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, align);
- printf("dprod_dut:");
- dump(dest1, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref2, dest2, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, align);
- printf("dprod_dut:");
- dump(dest2, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref3, dest3, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, align);
- printf("dprod_dut:");
- dump(dest3, align);
- return -1;
- }
-
- putchar('.');
- }
-
- // Test rand ptr alignment if available
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
- srcs = rand() % TEST_SOURCES;
- if (srcs == 0)
- continue;
-
- offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
- // Add random offsets
- for (i = 0; i < srcs; i++)
- ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- memset(dest1, 0, TEST_LEN); // zero pad to check write-over
- memset(dest2, 0, TEST_LEN);
- memset(dest3, 0, TEST_LEN);
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- ubuffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
-
- if (memcmp(dest_ref1, udest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, udest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[1], 25);
- return -1;
- }
- if (memcmp(dest_ref3, udest_ptrs[2], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[2], 25);
- return -1;
- }
- // Confirm that padding around dests is unchanged
- memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
- offset = udest_ptrs[0] - dest1;
-
- if (memcmp(dest1, dest_ref1, offset)) {
- printf("Fail rand ualign pad1 start\n");
- return -1;
- }
- if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad1 end\n");
- return -1;
- }
-
- offset = udest_ptrs[1] - dest2;
- if (memcmp(dest2, dest_ref1, offset)) {
- printf("Fail rand ualign pad2 start\n");
- return -1;
- }
- if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad2 end\n");
- return -1;
- }
-
- offset = udest_ptrs[2] - dest3;
- if (memcmp(dest3, dest_ref1, offset)) {
- printf("Fail rand ualign pad3 start\n");
- return -1;
- }
- if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad3 end\n");;
- return -1;
- }
-
- putchar('.');
- }
-
- // Test all size alignment
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
-
- for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
- srcs = TEST_SOURCES;
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
-
- if (memcmp(dest_ref1, dest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, dest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[1], 25);
- return -1;
- }
- if (memcmp(dest_ref3, dest_ptrs[2], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[2], 25);
- return -1;
- }
- }
-
- printf("Pass\n");
- return 0;
-
-}
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.patch
deleted file mode 100644
index 21bbfaa667..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_dot_prod_sse_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm
index 7cf630558c..4e30d1764e 100644
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.asm
@@ -97,7 +97,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -158,12 +158,8 @@ section .text
%define xd3 xtmph1
align 16
-global gf_3vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_3vect_mad_avx, function
func(gf_3vect_mad_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_3vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_3vect_mad_avx)
-%endif
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -287,6 +283,3 @@ align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
-
-;;; func core, ver, snum
-slversion gf_3vect_mad_avx, 02, 01, 0207
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.patch
deleted file mode 100644
index 983b4fc414..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx.patch
+++ /dev/null
@@ -1,5 +0,0 @@
-163,166d162
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_3vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_3vect_mad_avx)
-< %endif
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm
index c218b4db28..069c5103bc 100644
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.asm
@@ -103,7 +103,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -165,13 +165,8 @@ section .text
%define xd3 ymm10
align 16
-global gf_3vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_3vect_mad_avx2, function
func(gf_3vect_mad_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_3vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_3vect_mad_avx2)
-%endif
-
FUNC_SAVE
sub len, 32
jl .return_fail
@@ -317,6 +312,3 @@ align 32
constip32:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
dq 0xe8e9eaebecedeeef, 0xe0e1e2e3e4e5e6e7
-
-;;; func core, ver, snum
-slversion gf_3vect_mad_avx2, 04, 01, 0208
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.patch
deleted file mode 100644
index 058f09b3c4..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-170,174d169
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_3vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_3vect_mad_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2_gfni.asm
new file mode 100644
index 0000000000..8a04577acd
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx2_gfni.asm
@@ -0,0 +1,276 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+%include "memcpy.asm"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64: integer args 1-6 arrive in rdi, rsi, rdx, rcx, r8, r9
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11 ; scratch; caller-saved under System V, so it is never saved here
+ %define tmp2 r10 ; scratch; caller-saved under System V
+ %define func(x) x: endbranch ; entry label followed by endbranch (defined outside this file; presumably a CET landing pad - confirm in reg_sizes.asm)
+ %define FUNC_SAVE ; expands to nothing: no prologue on this path
+ %define FUNC_RESTORE ; expands to nothing: no epilogue on this path
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64: the first four integer args arrive in rcx, rdx, r8, r9
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r13 ; must be saved and restored
+ %define tmp r11 ; scratch; volatile on Win64
+ %define tmp2 r10 ; scratch; volatile on Win64
+ %define stack_size 16*10 + 3*8 ; 10 xmm slots + 3 qword slots (only two are used) = 184 bytes; 184 = 8 mod 16, so rsp is 16-aligned after the sub
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; stack slot of argument x as seen after FUNC_SAVE: skips this frame and the return address
+ %define func(x) proc_frame x ; proc_frame is defined outside this file (presumably NASM unwind-info support - confirm)
+
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; reserve the save area
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6-xmm15 are callee-saved on Win64; aligned stores rely on the 16-byte aligned rsp
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ mov [rsp + 10*16 + 0*8], r12 ; r12/r13 are callee-saved and are about to hold arg4/arg5
+ mov [rsp + 10*16 + 1*8], r13
+ end_prolog ; defined outside this file (presumably closes the unwind prologue - confirm)
+ mov arg4, arg(4) ; fifth argument is passed on the stack
+ mov arg5, arg(5) ; sixth argument is passed on the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; mirror of FUNC_SAVE: reload every saved register from the same slot
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ add rsp, stack_size ; release the save area; rsp points at the return address again
+ %endmacro
+%endif
+
+%define len arg0 ; bytes still to process; decremented as data is consumed
+%define vec arg1 ; multiplied by 8 in the function and used as the byte stride between the three coefficients
+%define vec_i arg2 ; multiplied by 8 in the function and used as the byte offset of the first coefficient in mul_array
+%define mul_array arg3 ; base address of the 8-byte coefficient entries
+%define src arg4 ; source buffer
+%define dest1 arg5 ; on entry the array of three dest pointers; replaced by its first element in the function
+%define pos rax ; running byte offset into src and the dest buffers
+%define dest2 mul_array ; shares mul_array's register; valid only after the coefficients have been loaded
+%define dest3 vec_i ; shares vec_i's register; valid only after the coefficients have been loaded
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Buffers are declared aligned: use aligned (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0l ymm0 ; source data, low 32 bytes of a 64-byte step
+%define x0h ymm0 ; reuse ymm0
+%define xgft1 ymm1 ; broadcast coefficient for dest1
+%define xgft2 ymm2 ; broadcast coefficient for dest2
+%define xgft3 ymm3 ; broadcast coefficient for dest3
+%define xd1l ymm4 ; dest1 data, low half
+%define xd1h ymm5 ; dest1 data, high half
+%define xd2l ymm6 ; dest2 data, low half
+%define xd2h ymm7 ; dest2 data, high half
+%define xd3l ymm8 ; dest3 data, low half
+%define xd3h ymm9 ; dest3 data, high half
+
+%define xret1l ymm10 ; xret*: passed to GF_MUL_XOR next to each dest register (presumably scratch for the product - confirm in gf_vect_gfni.inc)
+%define xret1h ymm11
+%define xret2l ymm12
+%define xret2h ymm13
+%define xret3l ymm14
+%define xret3h ymm15
+
+%define x0 x0l ; unsuffixed names below are used by the 32-byte and tail macros
+%define xd1 xd1l
+%define xd2 xd2l
+%define xd3 xd3l
+%define xret1 xret1l
+%define xret2 xret2l
+%define xret3 xret3l
+
+;;
+;; Encodes 64 bytes of a single source into 3x 64 bytes (parity disks)
+;; Takes no parameters: reads src and dest1..dest3 at offset pos and rewrites dest1..dest3 in place
+%macro ENCODE_64B_3 0
+ ; get next source vector
+ XLDR x0l, [src + pos] ;; read low 32 bytes
+ ; get next dest vectors
+ XLDR xd1l, [dest1 + pos]
+ XLDR xd1h, [dest1 + pos + 32]
+ XLDR xd2l, [dest2 + pos]
+ XLDR xd2h, [dest2 + pos + 32]
+ XLDR xd3l, [dest3 + pos]
+ XLDR xd3h, [dest3 + pos + 32]
+
+ GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l, xgft2, xret2l, xd2l, xgft3, xret3l, xd3l ; low halves; macro lives in gf_vect_gfni.inc (presumably xdN ^= gf_mul(x0, xgftN) - confirm)
+
+ XLDR x0h, [src + pos + 32] ;; read high 32 bytes (x0h is ymm0 again, so the low half is overwritten)
+
+ GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h, xgft2, xret2h, xd2h, xgft3, xret3h, xd3h ; high halves, same operation
+
+ XSTR [dest1 + pos], xd1l ; write all six updated halves back
+ XSTR [dest1 + pos + 32], xd1h
+ XSTR [dest2 + pos], xd2l
+ XSTR [dest2 + pos + 32], xd2h
+ XSTR [dest3 + pos], xd3l
+ XSTR [dest3 + pos + 32], xd3h
+%endmacro
+
+;;
+;; Encodes 32 bytes of a single source into 3x 32 bytes (parity disks)
+;; Takes no parameters: reads src and dest1..dest3 at offset pos and rewrites dest1..dest3 in place
+%macro ENCODE_32B_3 0
+ ; get next source vector
+ XLDR x0, [src + pos]
+ ; get next dest vectors
+ XLDR xd1, [dest1 + pos]
+ XLDR xd2, [dest2 + pos]
+ XLDR xd3, [dest3 + pos]
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3 ; macro lives in gf_vect_gfni.inc (presumably xdN ^= gf_mul(x0, xgftN) - confirm)
+
+ XSTR [dest1 + pos], xd1 ; write the three updated vectors back
+ XSTR [dest2 + pos], xd2
+ XSTR [dest3 + pos], xd3
+%endmacro
+
+;;
+;; Encodes less than 32 bytes of a single source into 3x parity disks
+;; %1 = byte count. Overwrites dest1..dest3 (pos is folded into them), so it must be the last encode step
+%macro ENCODE_LT_32B_3 1
+%define %%LEN %1
+ ; get next source vector
+ simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2 ; simd_load_avx2 is not defined in this file (presumably a partial load of LEN bytes using tmp/tmp2 as scratch - confirm in memcpy.asm)
+ ; get next dest vectors
+ simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd2, dest2 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd3, dest3 + pos, %%LEN, tmp, tmp2
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3 ; same full-width operation as the 32-byte path
+
+ lea dest1, [dest1 + pos] ; dest1 now holds the final address handed to the store macro
+ simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2 ; simd_store_avx2 is not defined in this file (presumably stores only LEN bytes - confirm in memcpy.asm)
+ lea dest2, [dest2 + pos]
+ simd_store_avx2 dest2, xd2, %%LEN, tmp, tmp2
+ lea dest3, [dest3 + pos]
+ simd_store_avx2 dest3, xd3, %%LEN, tmp, tmp2
+%endmacro
+
+align 16
+global gf_3vect_mad_avx2_gfni, function
+func(gf_3vect_mad_avx2_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ;pos = 0, byte offset into src and the dest buffers
+ shl vec_i, 3 ;vec_i *= 8: byte offset of the first 8-byte coefficient
+ shl vec, 3 ;vec *= 8: byte stride between the three coefficients
+ lea tmp, [mul_array + vec_i] ;tmp = address of the coefficient for dest1
+ vbroadcastsd xgft1, [tmp] ;replicate the 8-byte coefficient into every qword lane
+ vbroadcastsd xgft2, [tmp + vec] ;coefficient for dest2, one stride further
+ vbroadcastsd xgft3, [tmp + vec*2] ;coefficient for dest3, two strides further
+ mov dest2, [dest1 + 8] ; reuse mul_array
+ mov dest3, [dest1 + 2*8] ; reuse vec_i
+ mov dest1, [dest1] ; loaded last: this overwrites the pointer to the dest array
+
+ cmp len, 64
+ jl .len_lt_64 ; signed compare: fewer than 64 bytes, skip the main loop
+
+.loop64:
+ ENCODE_64B_3 ;; loop on 64 bytes at a time
+
+ add pos, 64
+ sub len, 64 ; len = bytes still to process
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 32
+ jl .len_lt_32
+
+ ENCODE_32B_3 ;; encode next 32 bytes
+
+ add pos, 32
+ sub len, 32
+
+.len_lt_32:
+ cmp len, 0
+ jle .exit ; nothing left (also taken when len is negative)
+
+ ENCODE_LT_32B_3 len ;; encode final bytes
+
+.exit:
+ vzeroupper ; clear upper ymm state before returning to the caller
+
+ FUNC_RESTORE
+ ret ; NOTE(review): rax is never written except as pos, so no explicit return value is set - confirm the C prototype
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm
index 53b3eb5afa..567624d273 100644
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.asm
@@ -44,7 +44,7 @@
%define arg5 r9
%define tmp r11
%define return rax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -117,8 +117,8 @@
%else
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
- %define XLDR vmovdqa
- %define XSTR vmovdqa
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
@@ -152,13 +152,8 @@ section .text
%define xmask0f zmm17
align 16
-global gf_3vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
+global gf_3vect_mad_avx512, function
func(gf_3vect_mad_avx512)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_3vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
-func(_gf_3vect_mad_avx512)
-%endif
-
FUNC_SAVE
sub len, 64
jl .return_fail
@@ -209,7 +204,7 @@ func(_gf_3vect_mad_avx512)
vpshufb xtmph3 {k1}{z}, xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3 {k1}{z}, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxorq xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
- vpxorq xd3, xd3, xtmph3 ;xd2 += partial
+ vpxorq xd3, xd3, xtmph3 ;xd3 += partial
XSTR [dest1+pos], xd1
XSTR [dest2+pos], xd2
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.patch
deleted file mode 100644
index d8b12fac96..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-157,161d156
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_3vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_3vect_mad_avx512)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512_gfni.asm
new file mode 100644
index 0000000000..53cc812595
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_avx512_gfni.asm
@@ -0,0 +1,204 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64: integer args 1-6 arrive in rdi, rsi, rdx, rcx, r8, r9
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11 ; scratch; caller-saved under System V, so it is never saved here
+ %define func(x) x: endbranch ; entry label followed by endbranch (defined outside this file; presumably a CET landing pad - confirm in reg_sizes.asm)
+ %define FUNC_SAVE ; expands to nothing: no prologue on this path
+ %define FUNC_RESTORE ; expands to nothing: no epilogue on this path
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64: the first four integer args arrive in rcx, rdx, r8, r9
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r13 ; must be saved and restored
+ %define tmp r11 ; scratch; volatile on Win64
+ %define stack_size 16*4 + 3*8 ; 4 xmm slots + 3 qword slots (only two are used) = 88 bytes; 88 = 8 mod 16, so rsp is 16-aligned after the sub
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; stack slot of argument x as seen after FUNC_SAVE: skips this frame and the return address
+ %define func(x) proc_frame x ; proc_frame is defined outside this file (presumably NASM unwind-info support - confirm)
+
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; reserve the save area
+ vmovdqa [rsp + 16*0], xmm6 ; xmm6-xmm9 are the callee-saved vector registers this file uses (zmm6-zmm9)
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ mov [rsp + 4*16 + 0*8], r12 ; r12/r13 are callee-saved and are about to hold arg4/arg5
+ mov [rsp + 4*16 + 1*8], r13
+ end_prolog ; defined outside this file (presumably closes the unwind prologue - confirm)
+ mov arg4, arg(4) ; fifth argument is passed on the stack
+ mov arg5, arg(5) ; sixth argument is passed on the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 16*0] ; mirror of FUNC_SAVE: reload every saved register from the same slot
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ mov r12, [rsp + 4*16 + 0*8]
+ mov r13, [rsp + 4*16 + 1*8]
+ add rsp, stack_size ; release the save area; rsp points at the return address again
+ %endmacro
+%endif
+
+%define len arg0 ; bytes still to process; decremented as data is consumed
+%define vec arg1 ; multiplied by 8 in the function and used as the byte stride between the three coefficients
+%define vec_i arg2 ; multiplied by 8 in the function and used as the byte offset of the first coefficient in mul_array
+%define mul_array arg3 ; base address of the 8-byte coefficient entries
+%define src arg4 ; source buffer
+%define dest1 arg5 ; on entry the array of three dest pointers; replaced by its first element in the function
+%define pos rax ; running byte offset into src and the dest buffers
+%define dest2 mul_array ; shares mul_array's register; valid only after the coefficients have been loaded
+%define dest3 vec_i ; shares vec_i's register; valid only after the coefficients have been loaded
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Buffers are declared aligned: use aligned (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 zmm0 ; source data
+%define xgft1 zmm1 ; broadcast coefficient for dest1
+%define xgft2 zmm2 ; broadcast coefficient for dest2
+%define xgft3 zmm3 ; broadcast coefficient for dest3
+%define xd1 zmm4 ; dest1 data
+%define xd2 zmm5 ; dest2 data
+%define xd3 zmm6 ; dest3 data
+
+%define xret1 zmm7 ; xret*: passed to GF_MUL_XOR next to each dest register (presumably scratch for the product - confirm in gf_vect_gfni.inc)
+%define xret2 zmm8
+%define xret3 zmm9
+
+;;
+;; Encodes 64 bytes of a single source into 3x 64 bytes (parity disks)
+;; Optional %1 = opmask register: when given, only the bytes selected by the mask are loaded and stored
+%macro ENCODE_64B_3 0-1
+%define %%KMASK %1
+
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector (merge-masked: unselected bytes keep their old register contents)
+ vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector
+ vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector
+ vmovdqu8 xd3{%%KMASK}, [dest3 + pos] ;Get next dest vector
+%else
+ XLDR x0, [src + pos] ;Get next source vector
+ XLDR xd1, [dest1 + pos] ;Get next dest vector
+ XLDR xd2, [dest2 + pos] ;Get next dest vector
+ XLDR xd3, [dest3 + pos] ;Get next dest vector
+%endif
+
+ GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3 ; macro lives in gf_vect_gfni.inc (presumably xdN ^= gf_mul(x0, xgftN) - confirm)
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 ; masked stores: bytes outside the mask are not written
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xd2
+ vmovdqu8 [dest3 + pos]{%%KMASK}, xd3
+%else
+ XSTR [dest1 + pos], xd1
+ XSTR [dest2 + pos], xd2
+ XSTR [dest3 + pos], xd3
+%endif
+%endmacro
+
+align 16
+global gf_3vect_mad_avx512_gfni, function
+func(gf_3vect_mad_avx512_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ;pos = 0, byte offset into src and the dest buffers
+ shl vec_i, 3 ;vec_i *= 8: byte offset of the first 8-byte coefficient
+ shl vec, 3 ;vec *= 8: byte stride between the three coefficients
+ lea tmp, [mul_array + vec_i] ;tmp = address of the coefficient for dest1
+ vbroadcastf32x2 xgft1, [tmp] ;replicate the 8-byte coefficient across the whole zmm register
+ vbroadcastf32x2 xgft2, [tmp + vec] ;coefficient for dest2, one stride further
+ vbroadcastf32x2 xgft3, [tmp + vec*2] ;coefficient for dest3, two strides further
+ mov dest2, [dest1 + 8] ; reuse mul_array
+ mov dest3, [dest1 + 2*8] ; reuse vec_i
+ mov dest1, [dest1] ; loaded last: this overwrites the pointer to the dest array
+
+ cmp len, 64
+ jl .len_lt_64 ; signed compare: fewer than 64 bytes, skip the main loop
+.loop64:
+ ENCODE_64B_3
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64 ; len = bytes still to process
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit ; nothing left (also taken when len is negative)
+
+ xor tmp, tmp ; build a byte mask with the low len bits set (len is 1..63 here)
+ bts tmp, len ; tmp = 2^len
+ dec tmp ; tmp = 2^len - 1
+ kmovq k1, tmp
+
+ ENCODE_64B_3 k1 ; masked pass over the final partial block
+
+.exit:
+ vzeroupper ; clear upper vector state before returning to the caller
+
+ FUNC_RESTORE
+ ret ; NOTE(review): rax is never written except as pos, so no explicit return value is set - confirm the C prototype
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm
index d6dbe8f200..0a4284d53e 100644
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.asm
@@ -96,7 +96,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -156,13 +156,8 @@ section .text
%define xd3 xtmph1
align 16
-global gf_3vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_3vect_mad_sse, function
func(gf_3vect_mad_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_3vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_3vect_mad_sse)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -298,6 +293,3 @@ mask0f:
dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
-
-;;; func core, ver, snum
-slversion gf_3vect_mad_sse, 00, 01, 0206
diff --git a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.patch
deleted file mode 100644
index 83363c45cf..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_3vect_mad_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-161,165d160
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_3vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_3vect_mad_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm
index 30f1e81f6b..077018eefd 100644
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm
@@ -54,7 +54,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -95,15 +95,15 @@
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
- save_xmm128 xmm6, 0*16
- save_xmm128 xmm7, 1*16
- save_xmm128 xmm8, 2*16
- save_xmm128 xmm9, 3*16
- save_xmm128 xmm10, 4*16
- save_xmm128 xmm11, 5*16
- save_xmm128 xmm12, 6*16
- save_xmm128 xmm13, 7*16
- save_xmm128 xmm14, 8*16
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
save_reg r12, 9*16 + 0*8
save_reg r13, 9*16 + 1*8
save_reg r14, 9*16 + 2*8
@@ -159,7 +159,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -294,13 +294,8 @@ section .text
%define xp4 xmm5
%endif
align 16
-global gf_4vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_4vect_dot_prod_avx, function
func(gf_4vect_dot_prod_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_4vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_4vect_dot_prod_avx)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 16
@@ -441,6 +436,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_4vect_dot_prod_avx, 02, 05, 0193
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.patch
deleted file mode 100644
index aa908028bb..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-299,303d298
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_4vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_4vect_dot_prod_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm
index efe2f76de9..8d5febe0fa 100644
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm
@@ -56,7 +56,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -163,7 +163,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -301,15 +301,9 @@ section .text
%define xp3 ymm4
%define xp4 ymm5
%endif
-
align 16
-global gf_4vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_4vect_dot_prod_avx2, function
func(gf_4vect_dot_prod_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_4vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_4vect_dot_prod_avx2)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 32
@@ -459,8 +453,3 @@ func(_gf_4vect_dot_prod_avx2)
ret
endproc_frame
-
-section .data
-
-;;; func core, ver, snum
-slversion gf_4vect_dot_prod_avx2, 04, 05, 0198
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.patch
deleted file mode 100644
index 39cdd548a7..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx2.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-304d303
-<
-308,312d306
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_4vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_4vect_dot_prod_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm
index c810008c85..9bdc1a5670 100644
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.asm
@@ -44,8 +44,6 @@
%define arg5 r9
%define tmp r11
- %define tmp.w r11d
- %define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
@@ -54,19 +52,22 @@
%define return rax
%define PS 8
%define LOG_PS 3
+ %define stack_size 4*8
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
- push r12
- push r13
- push r14
- push r15
+ sub rsp, stack_size
+ mov [rsp + 0*8], r12
+ mov [rsp + 1*8], r13
+ mov [rsp + 2*8], r14
+ mov [rsp + 3*8], r15
%endmacro
%macro FUNC_RESTORE 0
- pop r15
- pop r14
- pop r13
- pop r12
+ mov r12, [rsp + 0*8]
+ mov r13, [rsp + 1*8]
+ mov r14, [rsp + 2*8]
+ mov r15, [rsp + 3*8]
+ add rsp, stack_size
%endmacro
%endif
@@ -79,8 +80,6 @@
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
- %define tmp.w r11d
- %define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
@@ -156,8 +155,8 @@
%else
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
- %define XLDR vmovdqa
- %define XSTR vmovdqa
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
@@ -191,13 +190,8 @@ default rel
section .text
align 16
-global gf_4vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
+global gf_4vect_dot_prod_avx512, function
func(gf_4vect_dot_prod_avx512)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_4vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
-func(_gf_4vect_dot_prod_avx512)
-%endif
-
FUNC_SAVE
sub len, 64
jl .return_fail
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.patch
deleted file mode 100644
index 6ca011bcc3..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-196,200d195
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_4vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_4vect_dot_prod_avx512)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512_gfni.asm
new file mode 100644
index 0000000000..9adb83f196
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_avx512_gfni.asm
@@ -0,0 +1,253 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64: integer args 1-6 arrive in rdi, rsi, rdx, rcx, r8, r9
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11 ; scratch; caller-saved under System V
+ %define tmp2 r10 ; scratch; caller-saved under System V
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r12 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+
+ %define func(x) x: endbranch ; entry label followed by endbranch (defined outside this file; presumably a CET landing pad - confirm in reg_sizes.asm)
+ %macro FUNC_SAVE 0
+ push r12 ; r12-r15 are callee-saved under System V and back tmp3..tmp6
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15 ; pops in reverse order of the pushes in FUNC_SAVE
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64: the first four integer args arrive in rcx, rdx, r8, r9
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+ %define tmp r11 ; scratch; volatile on Win64
+ %define tmp2 r10 ; scratch; volatile on Win64
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r14 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define stack_size 3*16 + 7*8 ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; stack slot of argument x as seen after FUNC_SAVE: skips this frame and the return address
+
+ %define func(x) proc_frame x ; proc_frame is defined outside this file (presumably NASM unwind-info support - confirm)
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size ; defined outside this file (presumably subtracts stack_size from rsp and records it for unwinding - confirm)
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6-xmm8 are the callee-saved vector registers this file uses (zmm6-zmm8)
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ mov [rsp + 3*16 + 0*8], r12 ; six callee-saved GPRs fill six of the seven qword slots
+ mov [rsp + 3*16 + 1*8], r13
+ mov [rsp + 3*16 + 2*8], r14
+ mov [rsp + 3*16 + 3*8], r15
+ mov [rsp + 3*16 + 4*8], rdi
+ mov [rsp + 3*16 + 5*8], rsi
+ end_prolog ; defined outside this file (presumably closes the unwind prologue - confirm)
+ mov arg4, arg(4) ; fifth argument is passed on the stack; arg5 is not loaded because it is only used as scratch (ptr)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; mirror of FUNC_SAVE: reload every saved register from the same slot
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ mov r12, [rsp + 3*16 + 0*8]
+ mov r13, [rsp + 3*16 + 1*8]
+ mov r14, [rsp + 3*16 + 2*8]
+ mov r15, [rsp + 3*16 + 3*8]
+ mov rdi, [rsp + 3*16 + 4*8]
+ mov rsi, [rsp + 3*16 + 5*8]
+ add rsp, stack_size ; release the save area; rsp points at the return address again
+ %endmacro
+%endif
+
+
+%define len arg0 ; bytes still to process; decremented as data is consumed
+%define vec arg1 ; number of sources; multiplied by 8 in the function so it can bound vec_i and stride the coefficients
+%define mul_array arg2 ; base address of the 8-byte coefficient entries
+%define src arg3 ; array of source buffer pointers
+%define dest1 arg4 ; on entry the array of four dest pointers; replaced by its first element in the function
+%define ptr arg5 ; scratch: current source buffer pointer (the function has only five arguments)
+%define vec_i tmp2 ; byte index into the src pointer array, stepped by 8
+%define dest2 tmp3
+%define dest3 tmp4
+%define dest4 tmp5
+%define vskip3 tmp6 ; original vec * 24: byte offset from a dest1 coefficient to the dest4 coefficient
+%define pos rax ; running byte offset into every source and dest buffer
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Buffers are declared aligned: use aligned (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xgft1 zmm5 ; coefficient for dest1; also passed to GF_MUL_XOR in the scratch position
+%define xgft2 zmm6 ; coefficient for dest2 (zmm6-zmm8 overlap callee-saved xmm6-xmm8 on Win64)
+%define xgft3 zmm7 ; coefficient for dest3
+%define xgft4 zmm8 ; coefficient for dest4
+
+%define x0 zmm0 ; current source data
+%define xp1 zmm1 ; accumulator for dest1
+%define xp2 zmm2 ; accumulator for dest2
+%define xp3 zmm3 ; accumulator for dest3
+%define xp4 zmm4 ; accumulator for dest4
+
+default rel
+[bits 64]
+
+section .text
+
+;; Optional %1 = opmask register for a final partial block. Expects vec already multiplied by 8. Clobbers tmp, vec_i, ptr.
+;; Encodes 64 bytes of all "k" sources into 4x 64 bytes (parity disks)
+;; GF_MUL_XOR lives in gf_vect_gfni.inc (presumably xpN ^= gf_mul(x0, xgftN), with xgftN reused as scratch - confirm)
+%macro ENCODE_64B_4 0-1
+%define %%KMASK %1
+
+ vpxorq xp1, xp1, xp1 ; zero the four accumulators
+ vpxorq xp2, xp2, xp2
+ vpxorq xp3, xp3, xp3
+ vpxorq xp4, xp4, xp4
+ mov tmp, mul_array ; tmp walks the coefficients, 8 bytes per source
+ xor vec_i, vec_i ; restart at the first source
+
+%%next_vect: ; one iteration per source buffer
+ mov ptr, [src + vec_i] ; ptr = current source buffer
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes)
+%else
+ XLDR x0, [ptr + pos] ;Get next source vector (64 bytes)
+%endif
+ add vec_i, 8 ; next entry in the src pointer array
+
+ vbroadcastf32x2 xgft1, [tmp] ; reload per iteration: each source has its own coefficient per dest
+ vbroadcastf32x2 xgft2, [tmp + vec] ; dest2 coefficients start vec (already times 8) bytes further
+ vbroadcastf32x2 xgft3, [tmp + vec*2]
+ vbroadcastf32x2 xgft4, [tmp + vskip3] ; vskip3 = three strides
+ add tmp, 8 ; next source's coefficient
+
+ GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \
+ xgft4, xgft4, xp4
+
+ cmp vec_i, vec
+ jl %%next_vect ; loop until every source has been consumed
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 ; masked stores: bytes outside the mask are not written
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
+ vmovdqu8 [dest3 + pos]{%%KMASK}, xp3
+ vmovdqu8 [dest4 + pos]{%%KMASK}, xp4
+%else
+ XSTR [dest1 + pos], xp1
+ XSTR [dest2 + pos], xp2
+ XSTR [dest3 + pos], xp3
+ XSTR [dest4 + pos], xp4
+%endif
+%endmacro
+
+align 16
+global gf_4vect_dot_prod_avx512_gfni, function
+func(gf_4vect_dot_prod_avx512_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ;pos = 0, byte offset into every source and dest buffer
+ mov vskip3, vec
+ imul vskip3, 8*3 ;vskip3 = vec * 24, computed before vec is scaled below
+ shl vec, 3 ;vec *= 8. Make vec_i count by 8
+ mov dest2, [dest1 + 8] ;dest1 still holds the dest pointer array here
+ mov dest3, [dest1 + 2*8]
+ mov dest4, [dest1 + 3*8]
+ mov dest1, [dest1] ;loaded last: this overwrites the pointer to the dest array
+
+ cmp len, 64
+ jl .len_lt_64 ;signed compare: fewer than 64 bytes, skip the main loop
+
+.loop64:
+
+ ENCODE_64B_4
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64 ;len = bytes still to process
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit ;nothing left (also taken when len is negative)
+
+ xor tmp, tmp ;build a byte mask with the low len bits set (len is 1..63 here)
+ bts tmp, len ;tmp = 2^len
+ dec tmp ;tmp = 2^len - 1
+ kmovq k1, tmp
+
+ ENCODE_64B_4 k1 ;masked pass over the final partial block (tmp is reused inside the macro once k1 holds the mask)
+
+.exit:
+ vzeroupper ;clear upper vector state before returning to the caller
+
+ FUNC_RESTORE
+ ret ;NOTE(review): rax is never written except as pos, so no explicit return value is set - confirm the C prototype
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm
index 8a486bf7b5..aadab4b1e4 100644
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm
@@ -54,7 +54,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -159,7 +159,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
@@ -294,13 +294,8 @@ section .text
%define xp4 xmm5
%endif
align 16
-global gf_4vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_4vect_dot_prod_sse, function
func(gf_4vect_dot_prod_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_4vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_4vect_dot_prod_sse)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 16
@@ -443,6 +438,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_4vect_dot_prod_sse, 00, 06, 0064
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.patch
deleted file mode 100644
index 78b6abbe4b..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-299,303d298
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_4vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_4vect_dot_prod_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.c
deleted file mode 100644
index eb6bc986ab..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.c
+++ /dev/null
@@ -1,695 +0,0 @@
-/**********************************************************************
- Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-**********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> // for memset, memcmp
-#include "erasure_code.h"
-// #include "types.h"
-
-#ifndef FUNCTION_UNDER_TEST
-# define FUNCTION_UNDER_TEST gf_4vect_dot_prod_sse
-#endif
-#ifndef TEST_MIN_SIZE
-# define TEST_MIN_SIZE 16
-#endif
-
-#define str(s) #s
-#define xstr(s) str(s)
-
-#define TEST_LEN 2048
-#define TEST_SIZE (TEST_LEN/2)
-#define TEST_MEM TEST_SIZE
-#define TEST_LOOPS 1000
-#define TEST_TYPE_STR ""
-
-#ifndef TEST_SOURCES
-# define TEST_SOURCES 16
-#endif
-#ifndef RANDOMS
-# define RANDOMS 20
-#endif
-
-#ifdef EC_ALIGNED_ADDR
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 0
-# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
-#else
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 32
-# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
-#endif
-
-typedef unsigned char u8;
-
-extern void FUNCTION_UNDER_TEST(int len, int vlen, unsigned char *gftbls,
- unsigned char **src, unsigned char **dest);
-
-void dump(unsigned char *buf, int len)
-{
- int i;
- for (i = 0; i < len;) {
- printf(" %2x", 0xff & buf[i++]);
- if (i % 32 == 0)
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_matrix(unsigned char **s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", s[i][j]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_u8xu8(unsigned char *s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", 0xff & s[j + (i * m)]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-int main(int argc, char *argv[])
-{
- int i, j, rtest, srcs;
- void *buf;
- u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
- u8 g4[TEST_SOURCES], g_tbls[4 * TEST_SOURCES * 32], *buffs[TEST_SOURCES];
- u8 *dest1, *dest2, *dest3, *dest4, *dest_ref1, *dest_ref2, *dest_ref3;
- u8 *dest_ref4, *dest_ptrs[4];
-
- int align, size;
- unsigned char *efence_buffs[TEST_SOURCES];
- unsigned int offset;
- u8 *ubuffs[TEST_SOURCES];
- u8 *udest_ptrs[4];
- printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
-
- // Allocate the arrays
- for (i = 0; i < TEST_SOURCES; i++) {
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- buffs[i] = buf;
- }
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest3 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest4 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref3 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref4 = buf;
-
- dest_ptrs[0] = dest1;
- dest_ptrs[1] = dest2;
- dest_ptrs[2] = dest3;
- dest_ptrs[3] = dest4;
-
- // Test of all zeros
- for (i = 0; i < TEST_SOURCES; i++)
- memset(buffs[i], 0, TEST_LEN);
-
- memset(dest1, 0, TEST_LEN);
- memset(dest2, 0, TEST_LEN);
- memset(dest3, 0, TEST_LEN);
- memset(dest4, 0, TEST_LEN);
- memset(dest_ref1, 0, TEST_LEN);
- memset(dest_ref2, 0, TEST_LEN);
- memset(dest_ref3, 0, TEST_LEN);
- memset(dest_ref4, 0, TEST_LEN);
- memset(g1, 2, TEST_SOURCES);
- memset(g2, 1, TEST_SOURCES);
- memset(g3, 7, TEST_SOURCES);
- memset(g4, 3, TEST_SOURCES);
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
- dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
- dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
- dest_ref4);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
-
- putchar('.');
-
- // Rand data test
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- buffs, dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
- buffs, dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
- buffs, dest_ref4);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
-
- putchar('.');
- }
-
- // Rand data test with varied parameters
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
- for (i = 0; i < srcs; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
- dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
- dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
- dest_ref4);
-
- FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test1 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test2 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test3 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test4 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
-
- putchar('.');
- }
- }
-
- // Run tests at end of buffer for Electric Fence
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32;
- for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
- efence_buffs[i] = buffs[i] + TEST_LEN - size;
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- efence_buffs, dest_ref2);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
- efence_buffs, dest_ref3);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
- efence_buffs, dest_ref4);
-
- FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, align);
- printf("dprod_dut:");
- dump(dest1, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref2, dest2, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, align);
- printf("dprod_dut:");
- dump(dest2, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref3, dest3, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, align);
- printf("dprod_dut:");
- dump(dest3, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref4, dest4, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, align);
- printf("dprod_dut:");
- dump(dest4, align);
- return -1;
- }
-
- putchar('.');
- }
-
- // Test rand ptr alignment if available
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
- srcs = rand() % TEST_SOURCES;
- if (srcs == 0)
- continue;
-
- offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
- // Add random offsets
- for (i = 0; i < srcs; i++)
- ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- memset(dest1, 0, TEST_LEN); // zero pad to check write-over
- memset(dest2, 0, TEST_LEN);
- memset(dest3, 0, TEST_LEN);
- memset(dest4, 0, TEST_LEN);
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- ubuffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
-
- if (memcmp(dest_ref1, udest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, udest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[1], 25);
- return -1;
- }
- if (memcmp(dest_ref3, udest_ptrs[2], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[2], 25);
- return -1;
- }
- if (memcmp(dest_ref4, udest_ptrs[3], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[3], 25);
- return -1;
- }
- // Confirm that padding around dests is unchanged
- memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
- offset = udest_ptrs[0] - dest1;
-
- if (memcmp(dest1, dest_ref1, offset)) {
- printf("Fail rand ualign pad1 start\n");
- return -1;
- }
- if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad1 end\n");
- printf("size=%d offset=%d srcs=%d\n", size, offset, srcs);
- return -1;
- }
-
- offset = udest_ptrs[1] - dest2;
- if (memcmp(dest2, dest_ref1, offset)) {
- printf("Fail rand ualign pad2 start\n");
- return -1;
- }
- if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad2 end\n");
- return -1;
- }
-
- offset = udest_ptrs[2] - dest3;
- if (memcmp(dest3, dest_ref1, offset)) {
- printf("Fail rand ualign pad3 start\n");
- return -1;
- }
- if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad3 end\n");
- return -1;
- }
-
- offset = udest_ptrs[3] - dest4;
- if (memcmp(dest4, dest_ref1, offset)) {
- printf("Fail rand ualign pad4 start\n");
- return -1;
- }
- if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad4 end\n");
- return -1;
- }
-
- putchar('.');
- }
-
- // Test all size alignment
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32;
-
- for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
- srcs = TEST_SOURCES;
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
-
- if (memcmp(dest_ref1, dest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, dest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[1], 25);
- return -1;
- }
- if (memcmp(dest_ref3, dest_ptrs[2], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[2], 25);
- return -1;
- }
- if (memcmp(dest_ref4, dest_ptrs[3], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[3], 25);
- return -1;
- }
- }
-
- printf("Pass\n");
- return 0;
-
-}
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.patch
deleted file mode 100644
index 21bbfaa667..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_dot_prod_sse_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm
index 2d351663c3..870bc1cdaf 100644
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.asm
@@ -103,7 +103,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
%endmacro
@@ -169,13 +169,8 @@ section .text
%define xd4 xtmpl1
align 16
-global gf_4vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_4vect_mad_avx, function
func(gf_4vect_mad_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_4vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_4vect_mad_avx)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -336,6 +331,3 @@ align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
-
-;;; func core, ver, snum
-slversion gf_4vect_mad_avx, 02, 01, 020a
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.patch
deleted file mode 100644
index 5b3ad1f1a9..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-174,178d173
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_4vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_4vect_mad_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm
index 9ec431ff27..4ec710ddac 100644
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.asm
@@ -101,7 +101,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -165,13 +165,8 @@ section .text
%define xd4 ymm10
align 16
-global gf_4vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_4vect_mad_avx2, function
func(gf_4vect_mad_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_4vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_4vect_mad_avx2)
-%endif
-
FUNC_SAVE
sub len, 32
jl .return_fail
@@ -342,6 +337,3 @@ align 32
constip32:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
dq 0xe8e9eaebecedeeef, 0xe0e1e2e3e4e5e6e7
-
-;;; func core, ver, snum
-slversion gf_4vect_mad_avx2, 04, 01, 020b
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.patch
deleted file mode 100644
index e0518326ce..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-170,174d169
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_4vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_4vect_mad_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2_gfni.asm
new file mode 100644
index 0000000000..63efd4decc
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx2_gfni.asm
@@ -0,0 +1,239 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+%include "memcpy.asm"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; arg0..arg5 map onto the six integer argument registers used for elf64
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11 ; scratch
+ %define tmp2 r10 ; scratch
+ %define tmp3 r12 ; callee-saved register: pushed/popped by FUNC_SAVE/FUNC_RESTORE
+ %define func(x) x: endbranch ; entry label; endbranch is not defined in this file (presumably supplied by an include)
+ %macro FUNC_SAVE 0
+ push r12 ; preserve r12 because tmp3 aliases it
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r12 ; restore the caller's r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r13 ; must be saved, loaded and restored (FUNC_SAVE loads it from arg(5))
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r14 ; saved and restored by FUNC_SAVE/FUNC_RESTORE
+ %define stack_size 16*7 + 3*8 ; 7 XMM slots (xmm6-xmm12) + 3 GPR slots (r12-r14) = 136 bytes
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; argument slot x above the return address; only valid after FUNC_SAVE has lowered rsp
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; 136 = 8 mod 16: rsp ends up 16-byte aligned, assuming the usual 16-byte alignment at the call site
+ vmovdqa [rsp + 0*16], xmm6 ; spill xmm6-xmm12: callee-saved on win64 and aliased by xgft2..xret4
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ mov [rsp + 7*16 + 0*8], r12 ; GPR slots sit directly above the seven XMM slots
+ mov [rsp + 7*16 + 1*8], r13
+ mov [rsp + 7*16 + 2*8], r14
+ end_prolog
+ mov arg4, arg(4) ; 5th argument (src) is read from its stack slot
+ mov arg5, arg(5) ; 6th argument (dest) is read from its stack slot
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; reload everything FUNC_SAVE spilled, using the same slots
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ mov r12, [rsp + 7*16 + 0*8]
+ mov r13, [rsp + 7*16 + 1*8]
+ mov r14, [rsp + 7*16 + 2*8]
+ add rsp, stack_size ; release the save area; rsp points at the return address again
+ %endmacro
+%endif
+
+%define len arg0
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos rax
+%define dest2 mul_array
+%define dest3 vec_i
+%define dest4 tmp3
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use aligned load/store (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 ymm0
+%define xd1 ymm1
+%define xd2 ymm2
+%define xd3 ymm3
+%define xd4 ymm4
+%define xgft1 ymm5
+%define xgft2 ymm6
+%define xgft3 ymm7
+%define xgft4 ymm8
+%define xret1 ymm9
+%define xret2 ymm10
+%define xret3 ymm11
+%define xret4 ymm12
+
+;;
+;; Encodes 32 bytes of a single source into 4x 32 bytes (parity disks)
+;;
+%macro ENCODE_32B_4 0
+ ;; get next source vector
+ XLDR x0, [src + pos] ; 32 source bytes at the current offset
+ ;; get next dest vectors
+ XLDR xd1, [dest1 + pos] ; current contents of the four destinations at the same offset
+ XLDR xd2, [dest2 + pos]
+ XLDR xd3, [dest3 + pos]
+ XLDR xd4, [dest4 + pos]
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, \
+ xgft3, xret3, xd3, xgft4, xret4, xd4 ; GF_MUL_XOR is defined outside this file; presumably xdN ^= x0 * xgftN with xretN as scratch - confirm in gf_vect_gfni.inc
+
+ XSTR [dest1 + pos], xd1 ; write the updated destination vectors back in place
+ XSTR [dest2 + pos], xd2
+ XSTR [dest3 + pos], xd3
+ XSTR [dest4 + pos], xd4
+%endmacro
+
+;;
+;; Encodes less than 32 bytes of a single source into 4x parity disks
+;;
+%macro ENCODE_LT_32B_4 1
+%define %%LEN %1 ; %1 = register holding the number of tail bytes (invoked with len)
+ ;; get next source vector
+ simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2 ; helper not defined in this file (presumably memcpy.asm); tmp/tmp2 look like scratch operands - confirm
+ ;; get next dest vectors
+ simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd2, dest2 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd3, dest3 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd4, dest4 + pos, %%LEN, tmp, tmp2
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, \
+ xgft3, xret3, xd3, xgft4, xret4, xd4 ; same multiply/XOR step as in ENCODE_32B_4
+
+ lea dest1, [dest1 + pos] ; dest1 itself is advanced to the tail address before the store
+ simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2
+ lea dest2, [dest2 + pos] ; the dest pointers are modified here, so they no longer pair with pos afterwards
+ simd_store_avx2 dest2, xd2, %%LEN, tmp, tmp2
+ lea dest3, [dest3 + pos]
+ simd_store_avx2 dest3, xd3, %%LEN, tmp, tmp2
+ lea dest4, [dest4 + pos]
+ simd_store_avx2 dest4, xd4, %%LEN, tmp, tmp2
+%endmacro
+
+align 16
+global gf_4vect_mad_avx2_gfni, function
+func(gf_4vect_mad_avx2_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ; pos = 0: running byte offset into src and the four dest buffers
+ shl vec_i, 3 ;Multiply by 8: vec_i becomes a byte offset (8-byte table entries)
+ shl vec, 3 ;Multiply by 8: vec becomes the byte stride between the four entries loaded below
+ lea tmp, [mul_array + vec_i] ; tmp -> entry used for dest1
+ lea tmp2,[vec*3] ; tmp2 = 3 * stride: offset of the entry used for dest4
+ vbroadcastsd xgft1, [tmp] ; each 8-byte entry is replicated across the whole ymm register
+ vbroadcastsd xgft2, [tmp + vec]
+ vbroadcastsd xgft3, [tmp + vec*2]
+ vbroadcastsd xgft4, [tmp + tmp2]
+ mov dest2, [dest1 + 8] ; reuse mul_array
+ mov dest3, [dest1 + 2*8] ; reuse vec_i
+ mov dest4, [dest1 + 3*8] ; 4th destination pointer goes into tmp3
+ mov dest1, [dest1] ; loaded last: dest1 held the base of the pointer array until now
+
+ cmp len, 32
+ jl .len_lt_32 ; fewer than 32 bytes (signed compare): skip the full-vector loop
+
+.loop32:
+ ENCODE_32B_4 ;; loop on 32 bytes at a time
+
+ add pos, 32 ; advance the shared offset
+ sub len, 32 ; len = bytes still to process
+ cmp len, 32
+ jge .loop32 ; another full 32-byte vector remains
+
+.len_lt_32:
+ cmp len, 0
+ jle .exit ; no tail bytes left
+
+ ENCODE_LT_32B_4 len ;; encode final bytes
+
+.exit:
+ vzeroupper ; clear the upper YMM halves before returning to the caller
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame ; pairs with proc_frame from the win64 func(); its definition for other formats is not visible here
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm
index 7a5866fdf0..c09ec387e7 100644
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.asm
@@ -44,7 +44,7 @@
%define arg5 r9
%define tmp r11
%define return rax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -64,16 +64,16 @@
%macro FUNC_SAVE 0
sub rsp, stack_size
- movdqa [rsp+16*0],xmm6
- movdqa [rsp+16*1],xmm7
- movdqa [rsp+16*2],xmm8
- movdqa [rsp+16*3],xmm9
- movdqa [rsp+16*4],xmm10
- movdqa [rsp+16*5],xmm11
- movdqa [rsp+16*6],xmm12
- movdqa [rsp+16*7],xmm13
- movdqa [rsp+16*8],xmm14
- movdqa [rsp+16*9],xmm15
+ vmovdqa [rsp+16*0],xmm6
+ vmovdqa [rsp+16*1],xmm7
+ vmovdqa [rsp+16*2],xmm8
+ vmovdqa [rsp+16*3],xmm9
+ vmovdqa [rsp+16*4],xmm10
+ vmovdqa [rsp+16*5],xmm11
+ vmovdqa [rsp+16*6],xmm12
+ vmovdqa [rsp+16*7],xmm13
+ vmovdqa [rsp+16*8],xmm14
+ vmovdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r15, 10*16 + 1*8
end_prolog
@@ -82,16 +82,16 @@
%endmacro
%macro FUNC_RESTORE 0
- movdqa xmm6, [rsp+16*0]
- movdqa xmm7, [rsp+16*1]
- movdqa xmm8, [rsp+16*2]
- movdqa xmm9, [rsp+16*3]
- movdqa xmm10, [rsp+16*4]
- movdqa xmm11, [rsp+16*5]
- movdqa xmm12, [rsp+16*6]
- movdqa xmm13, [rsp+16*7]
- movdqa xmm14, [rsp+16*8]
- movdqa xmm15, [rsp+16*9]
+ vmovdqa xmm6, [rsp+16*0]
+ vmovdqa xmm7, [rsp+16*1]
+ vmovdqa xmm8, [rsp+16*2]
+ vmovdqa xmm9, [rsp+16*3]
+ vmovdqa xmm10, [rsp+16*4]
+ vmovdqa xmm11, [rsp+16*5]
+ vmovdqa xmm12, [rsp+16*6]
+ vmovdqa xmm13, [rsp+16*7]
+ vmovdqa xmm14, [rsp+16*8]
+ vmovdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r15, [rsp + 10*16 + 1*8]
add rsp, stack_size
@@ -117,8 +117,8 @@
%else
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
- %define XLDR vmovdqa
- %define XSTR vmovdqa
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
@@ -159,13 +159,8 @@ section .text
%define xtmpl5 zmm23
align 16
-global gf_4vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
+global gf_4vect_mad_avx512, function
func(gf_4vect_mad_avx512)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_4vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
-func(_gf_4vect_mad_avx512)
-%endif
-
FUNC_SAVE
sub len, 64
jl .return_fail
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.patch
deleted file mode 100644
index 4c2a0d08c0..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-164,168d163
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_4vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_4vect_mad_avx512)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512_gfni.asm
new file mode 100644
index 0000000000..1a5c4d9804
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_avx512_gfni.asm
@@ -0,0 +1,223 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r13
+ %define tmp r11
+ %define stack_size 7*16 + 3*8
+ %define arg(x) [rsp + stack_size + 8 + 8*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size ;Reserve the spill area (stack_size = 7*16 + 3*8 bytes)
+ vmovdqa [rsp + 16*0], xmm6 ;Spill xmm6..xmm12 into 16-byte slots 0..6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ mov [rsp + 7*16 + 0*8], r12 ;r12/r13 are spilled because they are reused as arg4/arg5 below
+ mov [rsp + 7*16 + 1*8], r13
+ end_prolog
+ mov arg4, arg(4) ;r12 = argument 4, read from [rsp + stack_size + 8 + 8*4]
+ mov arg5, arg(5) ;r13 = argument 5, read from [rsp + stack_size + 8 + 8*5]
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 16*0] ;Reload xmm6..xmm12 from the slots written by FUNC_SAVE
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ mov r12, [rsp + 7*16 + 0*8] ;Reload r12/r13, which held arg4/arg5 during the function
+ mov r13, [rsp + 7*16 + 1*8]
+ add rsp, stack_size ;Release the spill area reserved by FUNC_SAVE
+%endmacro
+%endif
+
+%define len arg0
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos rax
+%define dest2 mul_array
+%define dest3 vec
+%define dest4 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store (non-temporal unless NO_NT_LDST is defined)
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 zmm0
+%define xd1 zmm1
+%define xd2 zmm2
+%define xd3 zmm3
+%define xd4 zmm4
+
+%define xgft1 zmm5
+%define xgft2 zmm6
+%define xgft3 zmm7
+%define xgft4 zmm8
+
+%define xret1 zmm9
+%define xret2 zmm10
+%define xret3 zmm11
+%define xret4 zmm12
+
+;; ENCODE_64B_4 [kmask]
+;; Encodes 64 bytes of a single source into 4x 64 bytes (parity disks); with a k-register argument all loads and stores are byte-masked
+;;
+%macro ENCODE_64B_4 0-1
+%define %%KMASK %1
+
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector
+ vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector
+ vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector
+ vmovdqu8 xd3{%%KMASK}, [dest3 + pos] ;Get next dest vector
+ vmovdqu8 xd4{%%KMASK}, [dest4 + pos] ;Get next dest vector
+%else
+ XLDR x0, [src + pos] ;Get next source vector
+ XLDR xd1, [dest1 + pos] ;Get next dest vector
+ XLDR xd2, [dest2 + pos] ;Get next dest vector
+ XLDR xd3, [dest3 + pos] ;Get next dest vector
+ XLDR xd4, [dest4 + pos] ;Get next dest vector
+%endif
+
+ GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3, \
+ xgft4, xret4, xd4
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 ;Write back only the bytes selected by the mask
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xd2
+ vmovdqu8 [dest3 + pos]{%%KMASK}, xd3
+ vmovdqu8 [dest4 + pos]{%%KMASK}, xd4
+%else
+ XSTR [dest1 + pos], xd1 ;Write back the four updated 64-byte dest vectors
+ XSTR [dest2 + pos], xd2
+ XSTR [dest3 + pos], xd3
+ XSTR [dest4 + pos], xd4
+%endif
+%endmacro
+
+align 16
+global gf_4vect_mad_avx512_gfni, function
+func(gf_4vect_mad_avx512_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ;pos = byte offset into src/dest buffers, starts at 0
+ shl vec_i, 3 ;Multiply by 8
+ shl vec, 3 ;Multiply by 8
+ lea tmp, [mul_array + vec_i] ;tmp = mul_array + 8*vec_i
+ vbroadcastf32x2 xgft1, [tmp] ;xgft1..xgft4 = 8-byte entries at tmp + {0,1,2,3}*vec (vec already scaled by 8)
+ vbroadcastf32x2 xgft2, [tmp + vec]
+ vbroadcastf32x2 xgft3, [tmp + vec*2]
+ add tmp, vec ;Advance one stride so [tmp + vec*2] addresses the fourth entry
+ vbroadcastf32x2 xgft4, [tmp + vec*2]
+ mov dest2, [dest1 + 8] ; reuse mul_array
+ mov dest3, [dest1 + 2*8] ; reuse vec
+ mov dest4, [dest1 + 3*8] ; reuse vec_i
+ mov dest1, [dest1] ;Done last: replaces the pointer-array address with its first entry
+
+ cmp len, 64
+ jl .len_lt_64 ;Less than one full vector: go straight to the masked tail
+.loop64:
+ ENCODE_64B_4
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64 ;len = bytes still to process
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit ;Nothing left to process
+
+ xor tmp, tmp
+ bts tmp, len ;tmp = 1 << len
+ dec tmp ;tmp = (1 << len) - 1: one mask bit per remaining byte
+ kmovq k1, tmp
+
+ ENCODE_64B_4 k1
+
+.exit:
+ vzeroupper
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm
index 32b6cda183..efbe3836a8 100644
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.asm
@@ -103,7 +103,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
%endmacro
@@ -168,13 +168,8 @@ section .text
%define xd4 xtmpl1
align 16
-global gf_4vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_4vect_mad_sse, function
func(gf_4vect_mad_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_4vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_4vect_mad_sse)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -342,6 +337,3 @@ mask0f:
dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
-
-;;; func core, ver, snum
-slversion gf_4vect_mad_sse, 00, 01, 0209
diff --git a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.patch
deleted file mode 100644
index d8610712d2..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_4vect_mad_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-173,177d172
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_4vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_4vect_mad_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm
index 1d8cccf70b..978b4d2720 100644
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm
@@ -51,7 +51,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -89,16 +89,16 @@
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
- save_xmm128 xmm6, 0*16
- save_xmm128 xmm7, 1*16
- save_xmm128 xmm8, 2*16
- save_xmm128 xmm9, 3*16
- save_xmm128 xmm10, 4*16
- save_xmm128 xmm11, 5*16
- save_xmm128 xmm12, 6*16
- save_xmm128 xmm13, 7*16
- save_xmm128 xmm14, 8*16
- save_xmm128 xmm15, 9*16
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
@@ -184,13 +184,8 @@ section .text
%define xp5 xmm6
align 16
-global gf_5vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_5vect_dot_prod_avx, function
func(gf_5vect_dot_prod_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_5vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_5vect_dot_prod_avx)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -303,6 +298,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_5vect_dot_prod_avx, 02, 04, 0194
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.patch
deleted file mode 100644
index e72a2b4857..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-189,193d188
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_5vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_5vect_dot_prod_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm
index 0cdfee906e..11fb36b687 100644
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm
@@ -53,7 +53,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -189,13 +189,8 @@ section .text
%define xp5 ymm6
align 16
-global gf_5vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_5vect_dot_prod_avx2, function
func(gf_5vect_dot_prod_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_5vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_5vect_dot_prod_avx2)
-%endif
-
FUNC_SAVE
sub len, 32
jl .return_fail
@@ -313,8 +308,3 @@ func(_gf_5vect_dot_prod_avx2)
ret
endproc_frame
-
-section .data
-
-;;; func core, ver, snum
-slversion gf_5vect_dot_prod_avx2, 04, 04, 0199
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.patch
deleted file mode 100644
index a898e05522..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-194,198d193
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_5vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_5vect_dot_prod_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512.asm
new file mode 100644
index 0000000000..e790cb69eb
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512.asm
@@ -0,0 +1,334 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r12 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbp ; must be saved and restored
+ %define tmp8 rbx ; must be saved and restored
+ %define return rax
+ %define PS 8
+ %define LOG_PS 3
+ %define stack_size 6*8
+
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ;Reserve stack_size = 6*8 bytes, one slot per saved register
+ mov [rsp + 0*8], r12 ;Save the registers used as tmp3..tmp8 (r13, r12, r14, r15, rbp, rbx)
+ mov [rsp + 1*8], r13
+ mov [rsp + 2*8], r14
+ mov [rsp + 3*8], r15
+ mov [rsp + 4*8], rbp
+ mov [rsp + 5*8], rbx
+ %endmacro
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8] ;Reload the six registers from the slots written by FUNC_SAVE
+ mov r13, [rsp + 1*8]
+ mov r14, [rsp + 2*8]
+ mov r15, [rsp + 3*8]
+ mov rbp, [rsp + 4*8]
+ mov rbx, [rsp + 5*8]
+ add rsp, stack_size ;Release the save area
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r14 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbp ; must be saved and restored
+ %define tmp8 rbx ; must be saved and restored
+ %define return rax
+ %define PS 8
+ %define LOG_PS 3
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size ;stack_size = 10*16 + 9*8 bytes
+ vmovdqa [rsp + 0*16], xmm6 ;Spill xmm6..xmm15 into 16-byte slots 0..9
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ save_reg r12, 10*16 + 0*8 ;Eight general registers follow the xmm area; only 8 of the 9 qword slots are written
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbp, 10*16 + 6*8
+ save_reg rbx, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4) ;r12 = argument 4, read from [rsp + stack_size + PS + PS*4]
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ;Reload xmm6..xmm15 from the slots written by FUNC_SAVE
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8] ;Reload the eight general registers saved after the xmm area
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbp, [rsp + 10*16 + 6*8]
+ mov rbx, [rsp + 10*16 + 7*8]
+ add rsp, stack_size ;Release the save area
+ %endmacro
+%endif
+
+
+%define len arg0
+%define vec arg1
+%define mul_array arg2
+%define src arg3
+%define dest1 arg4
+%define ptr arg5
+%define vec_i tmp2
+%define dest2 tmp3
+%define dest3 tmp4
+%define dest4 tmp5
+%define vskip3 tmp6
+%define dest5 tmp7
+%define vskip1 tmp8
+%define pos return
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store (non-temporal unless NO_NT_LDST is defined)
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xmask0f zmm17
+%define xgft1_lo zmm16
+%define xgft1_loy ymm16
+%define xgft1_hi zmm15
+%define xgft2_lo zmm14
+%define xgft2_loy ymm14
+%define xgft2_hi zmm13
+%define xgft3_lo zmm12
+%define xgft3_loy ymm12
+%define xgft3_hi zmm11
+%define xgft4_lo zmm10
+%define xgft4_loy ymm10
+%define xgft4_hi zmm9
+%define xgft5_lo zmm8
+%define xgft5_loy ymm8
+%define xgft5_hi zmm7
+
+%define x0 zmm0
+%define xtmpa zmm1
+%define xp1 zmm2
+%define xp2 zmm3
+%define xp3 zmm4
+%define xp4 zmm5
+%define xp5 zmm6
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global gf_5vect_dot_prod_avx512, function
+func(gf_5vect_dot_prod_avx512)
+ FUNC_SAVE
+ sub len, 64 ;len holds length-64 from here on
+ jl .return_fail ;Inputs shorter than 64 bytes are rejected (return 1)
+
+ xor pos, pos ;pos (rax) = byte offset into the buffers
+ mov tmp, 0x0f
+ vpbroadcastb xmask0f, tmp ;Construct mask 0x0f0f0f...
+ mov vskip1, vec
+ imul vskip1, 32 ;vskip1 = vec*32
+ mov vskip3, vec
+ imul vskip3, 96 ;vskip3 = vec*96
+ sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ mov dest2, [dest1+PS] ;Load the five output pointers from the dests array
+ mov dest3, [dest1+2*PS]
+ mov dest4, [dest1+3*PS]
+ mov dest5, [dest1+4*PS]
+ mov dest1, [dest1] ;Done last: overwrites the array address with its first entry
+
+.loop64:
+ vpxorq xp1, xp1, xp1 ;Zero the five parity accumulators
+ vpxorq xp2, xp2, xp2
+ vpxorq xp3, xp3, xp3
+ vpxorq xp4, xp4, xp4
+ vpxorq xp5, xp5, xp5
+ mov tmp, mul_array ;tmp walks the tables, 32 bytes per source
+ xor vec_i, vec_i ;vec_i = byte offset into the src pointer array
+
+.next_vect:
+ mov ptr, [src+vec_i] ;ptr = address of the current source buffer
+ XLDR x0, [ptr+pos] ;Get next source vector
+ add vec_i, PS
+
+ vpandq xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpandq x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ vmovdqu8 xgft1_loy, [tmp] ;Load array Ax{00}..{0f}, Ax{00}..{f0}
+ vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)] ;Load array Bx{00}..{0f}, Bx{00}..{f0}
+ vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)] ;Load array Cx{00}..{0f}, Cx{00}..{f0}
+ vmovdqu8 xgft4_loy, [tmp+vskip3] ;Load array Dx{00}..{0f}, Dx{00}..{f0}
+ vmovdqu8 xgft5_loy, [tmp+vskip1*4] ;Load array Ex{00}..{0f}, Ex{00}..{f0}
+ add tmp, 32 ;Advance to the next source's 32-byte table
+
+ vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55 ;Replicate 128-bit lane 1 (high-nibble table) to all lanes
+ vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ;Replicate 128-bit lane 0 (low-nibble table) to all lanes
+ vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+ vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+
+ vpshufb xgft1_hi, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft1_hi, xgft1_hi, xgft1_lo ;GF add high and low partials
+ vpxorq xp1, xp1, xgft1_hi ;xp1 += partial
+
+ vpshufb xgft2_hi, xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft2_hi, xgft2_hi, xgft2_lo ;GF add high and low partials
+ vpxorq xp2, xp2, xgft2_hi ;xp2 += partial
+
+ vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
+ vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
+ vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
+ vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00
+
+ vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft3_hi, xgft3_hi, xgft3_lo ;GF add high and low partials
+ vpxorq xp3, xp3, xgft3_hi ;xp3 += partial
+
+ vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
+ vpxorq xp4, xp4, xgft4_hi ;xp4 += partial
+
+ vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55
+ vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00
+
+ vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
+ vpxorq xp5, xp5, xgft5_hi ;xp5 += partial
+
+ cmp vec_i, vec
+ jl .next_vect ;Repeat until vec_i reaches vec*PS, i.e. every source is consumed
+
+ XSTR [dest1+pos], xp1 ;Store the five 64-byte parity results
+ XSTR [dest2+pos], xp2
+ XSTR [dest3+pos], xp3
+ XSTR [dest4+pos], xp4
+ XSTR [dest5+pos], xp5
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ cmp pos, len
+ jle .loop64 ;Another full block fits while pos <= length-64
+
+ lea tmp, [len + 64] ;tmp = original length
+ cmp pos, tmp
+ je .return_pass ;pos == length: everything was covered exactly
+
+ ;; Tail len
+ mov pos, len ;Overlapped offset length-64
+ jmp .loop64 ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_5vect_dot_prod_avx512
+no_gf_5vect_dot_prod_avx512:
+%endif
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512_gfni.asm
new file mode 100644
index 0000000000..ebb9052368
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_avx512_gfni.asm
@@ -0,0 +1,275 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r12 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbp ; must be saved and restored
+
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ push r12 ;Save the registers used as tmp3..tmp7 (r13, r12, r14, r15, rbp)
+ push r13
+ push r14
+ push r15
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp ;Pop in the reverse of FUNC_SAVE's push order
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r14 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbp ; must be saved and restored
+ %define stack_size 5*16 + 9*8 ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + 8 + 8*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size ;stack_size = 5*16 + 9*8 bytes
+ vmovdqa [rsp + 0*16], xmm6 ;Spill xmm6..xmm10 into 16-byte slots 0..4
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ mov [rsp + 5*16 + 0*8], r12 ;Seven general registers follow the xmm area; only 7 of the 9 qword slots are written
+ mov [rsp + 5*16 + 1*8], r13
+ mov [rsp + 5*16 + 2*8], r14
+ mov [rsp + 5*16 + 3*8], r15
+ mov [rsp + 5*16 + 4*8], rdi
+ mov [rsp + 5*16 + 5*8], rsi
+ mov [rsp + 5*16 + 6*8], rbp
+ end_prolog
+ mov arg4, arg(4) ;r12 = argument 4, read from [rsp + stack_size + 8 + 8*4]
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ;Reload xmm6..xmm10 from the slots written by FUNC_SAVE
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ mov r12, [rsp + 5*16 + 0*8] ;Reload the seven general registers saved after the xmm area
+ mov r13, [rsp + 5*16 + 1*8]
+ mov r14, [rsp + 5*16 + 2*8]
+ mov r15, [rsp + 5*16 + 3*8]
+ mov rdi, [rsp + 5*16 + 4*8]
+ mov rsi, [rsp + 5*16 + 5*8]
+ mov rbp, [rsp + 5*16 + 6*8]
+ add rsp, stack_size ;Release the save area
+ %endmacro
+%endif
+
+
+%define len arg0
+%define vec arg1
+%define mul_array arg2
+%define src arg3
+%define dest1 arg4
+%define ptr arg5
+%define vec_i tmp2
+%define dest2 tmp3
+%define dest3 tmp4
+%define dest4 tmp5
+%define vskip3 tmp6
+%define dest5 tmp7
+%define pos rax
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store (non-temporal unless NO_NT_LDST is defined)
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xgft1 zmm6
+%define xgft2 zmm7
+%define xgft3 zmm8
+%define xgft4 zmm9
+%define xgft5 zmm10
+
+%define x0 zmm0
+%define xp1 zmm1
+%define xp2 zmm2
+%define xp3 zmm3
+%define xp4 zmm4
+%define xp5 zmm5
+
+default rel
+[bits 64]
+
+section .text
+
+;; ENCODE_64B_5 [kmask]
+;; Encodes 64 bytes of all "k" sources into 5x 64 bytes (parity disks); with a k-register argument the source loads and parity stores are byte-masked
+;; Clobbers tmp, ptr, vec_i, x0, xgft1..xgft5 and xp1..xp5; reads pos, src, vec, vskip3, mul_array and dest1..dest5
+%macro ENCODE_64B_5 0-1
+%define %%KMASK %1
+
+ vpxorq xp1, xp1, xp1 ;Zero the five parity accumulators
+ vpxorq xp2, xp2, xp2
+ vpxorq xp3, xp3, xp3
+ vpxorq xp4, xp4, xp4
+ vpxorq xp5, xp5, xp5
+ mov tmp, mul_array ;tmp walks the coefficient table, 8 bytes per source
+ xor vec_i, vec_i ;vec_i = byte offset into the src pointer array
+
+%%next_vect:
+ mov ptr, [src + vec_i] ;ptr = address of the current source buffer
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes)
+%else
+ XLDR x0, [ptr + pos] ;Get next source vector (64 bytes)
+%endif
+ add vec_i, 8
+
+ vbroadcastf32x2 xgft1, [tmp] ;Broadcast this source's 8-byte entry for each of the 5 outputs
+ vbroadcastf32x2 xgft2, [tmp + vec]
+ vbroadcastf32x2 xgft3, [tmp + vec*2]
+ vbroadcastf32x2 xgft4, [tmp + vskip3]
+ vbroadcastf32x2 xgft5, [tmp + vec*4]
+ add tmp, 8 ;Next source's entry
+
+ GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \
+ xgft4, xgft4, xp4, xgft5, xgft5, xp5
+
+ cmp vec_i, vec
+ jl %%next_vect ;Repeat until vec_i reaches vec (pre-scaled by 8 by the caller)
+
+ ;; dest1..dest5 already hold the output buffer addresses (the invoking function
+ ;; loads them from the dests array), so no pointer reload is needed before storing.
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xp1 ;Store only the bytes selected by the mask
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
+ vmovdqu8 [dest3 + pos]{%%KMASK}, xp3
+ vmovdqu8 [dest4 + pos]{%%KMASK}, xp4
+ vmovdqu8 [dest5 + pos]{%%KMASK}, xp5
+%else
+ XSTR [dest1 + pos], xp1 ;Store the five full 64-byte parity vectors
+ XSTR [dest2 + pos], xp2
+ XSTR [dest3 + pos], xp3
+ XSTR [dest4 + pos], xp4
+ XSTR [dest5 + pos], xp5
+%endif
+%endmacro
+
+align 16
+global gf_5vect_dot_prod_avx512_gfni, function
+func(gf_5vect_dot_prod_avx512_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ;pos (rax) = byte offset into the buffers
+ mov vskip3, vec
+ imul vskip3, 8*3 ;vskip3 = vec*24, the offset of the fourth output's entries
+ shl vec, 3 ;vec *= 8. Make vec_i count by 8
+ mov dest2, [dest1 + 8] ;Load the five output pointers from the dests array
+ mov dest3, [dest1 + 2*8]
+ mov dest4, [dest1 + 3*8]
+ mov dest5, [dest1 + 4*8]
+ mov dest1, [dest1] ;Done last: overwrites the array address with its first entry
+
+ cmp len, 64
+ jl .len_lt_64 ;Less than one full vector: go straight to the masked tail
+
+.loop64:
+
+ ENCODE_64B_5
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64 ;len = bytes still to process
+ cmp len, 64
+ jge .loop64
+
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit ;Nothing left to process
+
+ xor tmp, tmp
+ bts tmp, len ;tmp = 1 << len
+ dec tmp ;tmp = (1 << len) - 1: one mask bit per remaining byte
+ kmovq k1, tmp
+
+ ENCODE_64B_5 k1
+
+.exit:
+ vzeroupper
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm
index 577875dbb4..b669ac6464 100644
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm
@@ -51,7 +51,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -184,13 +184,8 @@ section .text
%define xp5 xmm14
align 16
-global gf_5vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_5vect_dot_prod_sse, function
func(gf_5vect_dot_prod_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_5vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_5vect_dot_prod_sse)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -304,6 +299,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_5vect_dot_prod_sse, 00, 05, 0065
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.patch
deleted file mode 100644
index eaa82dcc5d..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-189,193d188
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_5vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_5vect_dot_prod_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.c
deleted file mode 100644
index b1eea664b1..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.c
+++ /dev/null
@@ -1,805 +0,0 @@
-/**********************************************************************
- Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-**********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> // for memset, memcmp
-#include "erasure_code.h"
-// #include "types.h"
-
-#ifndef FUNCTION_UNDER_TEST
-# define FUNCTION_UNDER_TEST gf_5vect_dot_prod_sse
-#endif
-#ifndef TEST_MIN_SIZE
-# define TEST_MIN_SIZE 16
-#endif
-
-#define str(s) #s
-#define xstr(s) str(s)
-
-#define TEST_LEN 2048
-#define TEST_SIZE (TEST_LEN/2)
-#define TEST_MEM TEST_SIZE
-#define TEST_LOOPS 1000
-#define TEST_TYPE_STR ""
-
-#ifndef TEST_SOURCES
-# define TEST_SOURCES 16
-#endif
-#ifndef RANDOMS
-# define RANDOMS 20
-#endif
-
-#ifdef EC_ALIGNED_ADDR
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 0
-# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
-#else
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 32
-# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
-#endif
-
-typedef unsigned char u8;
-
-void dump(unsigned char *buf, int len)
-{
- int i;
- for (i = 0; i < len;) {
- printf(" %2x", 0xff & buf[i++]);
- if (i % 32 == 0)
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_matrix(unsigned char **s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", s[i][j]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_u8xu8(unsigned char *s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", 0xff & s[j + (i * m)]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-int main(int argc, char *argv[])
-{
- int i, j, rtest, srcs;
- void *buf;
- u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
- u8 g4[TEST_SOURCES], g5[TEST_SOURCES], *g_tbls;
- u8 *dest1, *dest2, *dest3, *dest4, *dest5, *buffs[TEST_SOURCES];
- u8 *dest_ref1, *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5;
- u8 *dest_ptrs[5];
-
- int align, size;
- unsigned char *efence_buffs[TEST_SOURCES];
- unsigned int offset;
- u8 *ubuffs[TEST_SOURCES];
- u8 *udest_ptrs[5];
- printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
-
- // Allocate the arrays
- for (i = 0; i < TEST_SOURCES; i++) {
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- buffs[i] = buf;
- }
-
- if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) {
- printf("alloc error: Fail");
- return -1;
- }
- g_tbls = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest3 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest4 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest5 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref3 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref4 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref5 = buf;
-
- dest_ptrs[0] = dest1;
- dest_ptrs[1] = dest2;
- dest_ptrs[2] = dest3;
- dest_ptrs[3] = dest4;
- dest_ptrs[4] = dest5;
-
- // Test of all zeros
- for (i = 0; i < TEST_SOURCES; i++)
- memset(buffs[i], 0, TEST_LEN);
-
- memset(dest1, 0, TEST_LEN);
- memset(dest2, 0, TEST_LEN);
- memset(dest3, 0, TEST_LEN);
- memset(dest4, 0, TEST_LEN);
- memset(dest5, 0, TEST_LEN);
- memset(dest_ref1, 0, TEST_LEN);
- memset(dest_ref2, 0, TEST_LEN);
- memset(dest_ref3, 0, TEST_LEN);
- memset(dest_ref4, 0, TEST_LEN);
- memset(dest_ref5, 0, TEST_LEN);
- memset(g1, 2, TEST_SOURCES);
- memset(g2, 1, TEST_SOURCES);
- memset(g3, 7, TEST_SOURCES);
- memset(g4, 9, TEST_SOURCES);
- memset(g5, 4, TEST_SOURCES);
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
- dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
- dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
- dest_ref4);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
- dest_ref5);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(dest5, 25);
- return -1;
- }
- putchar('.');
-
- // Rand data test
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- buffs, dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
- buffs, dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
- buffs, dest_ref4);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
- buffs, dest_ref5);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(dest5, 25);
- return -1;
- }
-
- putchar('.');
- }
-
- // Rand data test with varied parameters
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
- for (i = 0; i < srcs; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
- dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
- dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
- dest_ref4);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs,
- dest_ref5);
-
- FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test1 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test2 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test3 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test4 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test5 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(dest5, 25);
- return -1;
- }
-
- putchar('.');
- }
- }
-
- // Run tests at end of buffer for Electric Fence
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
- for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
- efence_buffs[i] = buffs[i] + TEST_LEN - size;
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- efence_buffs, dest_ref2);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
- efence_buffs, dest_ref3);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
- efence_buffs, dest_ref4);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
- efence_buffs, dest_ref5);
-
- FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, align);
- printf("dprod_dut:");
- dump(dest1, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref2, dest2, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, align);
- printf("dprod_dut:");
- dump(dest2, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref3, dest3, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, align);
- printf("dprod_dut:");
- dump(dest3, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref4, dest4, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, align);
- printf("dprod_dut:");
- dump(dest4, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref5, dest5, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, align);
- printf("dprod_dut:");
- dump(dest5, align);
- return -1;
- }
-
- putchar('.');
- }
-
- // Test rand ptr alignment if available
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
- srcs = rand() % TEST_SOURCES;
- if (srcs == 0)
- continue;
-
- offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
- // Add random offsets
- for (i = 0; i < srcs; i++)
- ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- memset(dest1, 0, TEST_LEN); // zero pad to check write-over
- memset(dest2, 0, TEST_LEN);
- memset(dest3, 0, TEST_LEN);
- memset(dest4, 0, TEST_LEN);
- memset(dest5, 0, TEST_LEN);
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- ubuffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
-
- if (memcmp(dest_ref1, udest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, udest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[1], 25);
- return -1;
- }
- if (memcmp(dest_ref3, udest_ptrs[2], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[2], 25);
- return -1;
- }
- if (memcmp(dest_ref4, udest_ptrs[3], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[3], 25);
- return -1;
- }
- if (memcmp(dest_ref5, udest_ptrs[4], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[4], 25);
- return -1;
- }
- // Confirm that padding around dests is unchanged
- memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
- offset = udest_ptrs[0] - dest1;
-
- if (memcmp(dest1, dest_ref1, offset)) {
- printf("Fail rand ualign pad1 start\n");
- return -1;
- }
- if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad1 end\n");
- return -1;
- }
-
- offset = udest_ptrs[1] - dest2;
- if (memcmp(dest2, dest_ref1, offset)) {
- printf("Fail rand ualign pad2 start\n");
- return -1;
- }
- if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad2 end\n");
- return -1;
- }
-
- offset = udest_ptrs[2] - dest3;
- if (memcmp(dest3, dest_ref1, offset)) {
- printf("Fail rand ualign pad3 start\n");
- return -1;
- }
- if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad3 end\n");
- return -1;
- }
-
- offset = udest_ptrs[3] - dest4;
- if (memcmp(dest4, dest_ref1, offset)) {
- printf("Fail rand ualign pad4 start\n");
- return -1;
- }
- if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad4 end\n");
- return -1;
- }
-
- offset = udest_ptrs[4] - dest5;
- if (memcmp(dest5, dest_ref1, offset)) {
- printf("Fail rand ualign pad5 start\n");
- return -1;
- }
- if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad5 end\n");
- return -1;
- }
-
- putchar('.');
- }
-
- // Test all size alignment
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
-
- for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
- srcs = TEST_SOURCES;
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
-
- if (memcmp(dest_ref1, dest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[0], 25);
-
- return -1;
- }
- if (memcmp(dest_ref2, dest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[1], 25);
- return -1;
- }
- if (memcmp(dest_ref3, dest_ptrs[2], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[2], 25);
- return -1;
- }
- if (memcmp(dest_ref4, dest_ptrs[3], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[3], 25);
- return -1;
- }
- if (memcmp(dest_ref5, dest_ptrs[4], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[4], 25);
- return -1;
- }
- }
-
- printf("Pass\n");
- return 0;
-
-}
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.patch
deleted file mode 100644
index 21bbfaa667..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_dot_prod_sse_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm
index 8f38a415a1..e72717328a 100644
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.asm
@@ -107,7 +107,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -178,13 +178,8 @@ section .text
align 16
-global gf_5vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_5vect_mad_avx, function
func(gf_5vect_mad_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_5vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_5vect_mad_avx)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -365,6 +360,3 @@ align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
-
-;;; func core, ver, snum
-slversion gf_5vect_mad_avx, 02, 01, 020d
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.patch
deleted file mode 100644
index d1a3e09445..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-183,187d182
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_5vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_5vect_mad_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm
index 9029f9287e..927cbcdd1a 100644
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.asm
@@ -103,7 +103,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -166,13 +166,8 @@ section .text
%define xd5 ymm9
align 16
-global gf_5vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_5vect_mad_avx2, function
func(gf_5vect_mad_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_5vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_5vect_mad_avx2)
-%endif
-
FUNC_SAVE
sub len, 32
jl .return_fail
@@ -363,6 +358,3 @@ align 32
constip32:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
dq 0xe8e9eaebecedeeef, 0xe0e1e2e3e4e5e6e7
-
-;;; func core, ver, snum
-slversion gf_5vect_mad_avx2, 04, 01, 020e
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.patch
deleted file mode 100644
index 1960386b1c..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-171,175d170
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_5vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_5vect_mad_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2_gfni.asm
new file mode 100644
index 0000000000..7ff768528e
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx2_gfni.asm
@@ -0,0 +1,265 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+%include "memcpy.asm"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64: integer args 1-6 arrive in rdi, rsi, rdx, rcx, r8, r9
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11 ; volatile scratch, no save needed
+ %define tmp2 r10 ; volatile scratch, no save needed
+ %define tmp3 r12 ; callee-saved: preserved by FUNC_SAVE / FUNC_RESTORE
+ %define tmp4 r13 ; callee-saved: preserved by FUNC_SAVE / FUNC_RESTORE
+ %define func(x) x: endbranch
+ %define stack_size 2*8 ; two 8-byte save slots (r12, r13)
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; reserve the save area
+ mov [rsp + 0*8], r12 ; slot 0 = r12 (tmp3)
+ mov [rsp + 1*8], r13 ; slot 1 = r13 (tmp4)
+ %endmacro
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8] ; reload from the slots written by FUNC_SAVE
+ mov r13, [rsp + 1*8]
+ add rsp, stack_size ; release the save area; rsp is back at the return address
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64: args 1-4 arrive in rcx, rdx, r8, r9
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r13 ; must be saved and restored
+ %define tmp r11 ; volatile scratch
+ %define tmp2 r10 ; volatile scratch
+ %define tmp3 r14 ; callee-saved: preserved by FUNC_SAVE / FUNC_RESTORE
+ %define tmp4 r15 ; callee-saved: preserved by FUNC_SAVE / FUNC_RESTORE
+ %define stack_size 16*10 + 5*8 ; 10 xmm slots + 5 qword slots (4 used); total is 8 mod 16, so rsp ends up 16-byte aligned for vmovdqa
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; caller's slot for argument x, valid after "sub rsp, stack_size"; the +8 skips the return address
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ; reserve the save area
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6-xmm15 are callee-saved on Win64
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ mov [rsp + 10*16 + 0*8], r12 ; GPR slots start after the ten xmm slots
+ mov [rsp + 10*16 + 1*8], r13
+ mov [rsp + 10*16 + 2*8], r14
+ mov [rsp + 10*16 + 3*8], r15
+ end_prolog
+ mov arg4, arg(4) ; 5th argument is passed on the stack; load it into r12
+ mov arg5, arg(5) ; 6th argument is passed on the stack; load it into r13
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; mirror of FUNC_SAVE: same slots, same order
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ add rsp, stack_size ; release the save area; rsp is back at the return address
+ %endmacro
+%endif
+
+%define len arg0 ; bytes still to process; decremented by 32 per main-loop iteration
+%define vec arg1 ; scaled by 8 on entry and then used as the byte stride between the five table entries
+%define vec_i arg2 ; scaled by 8 on entry and used as a byte offset into mul_array
+%define mul_array arg3 ; base of the table the five 8-byte entries are broadcast from
+%define src arg4 ; source buffer, read at [src + pos]
+%define dest1 arg5 ; holds the dest pointer array on entry, then the first dest buffer
+%define pos rax ; running byte offset shared by src and all dests
+%define dest2 mul_array ; aliases mul_array: only valid once the table entries have been loaded
+%define dest3 vec_i ; aliases vec_i: only valid once the table address has been formed
+%define dest4 tmp3 ; callee-saved register (see FUNC_SAVE)
+%define dest5 tmp4 ; callee-saved register (see FUNC_SAVE)
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu ; vector load used by the encode macros
+ %define XSTR vmovdqu ; vector store used by the encode macros
+%else
+;;; Use Non-temporal load/store (or plain aligned moves when NO_NT_LDST is defined)
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa ; aligned load, regular cache behaviour
+ %define XSTR vmovdqa ; aligned store, regular cache behaviour
+ %else
+ %define XLDR vmovntdqa ; non-temporal aligned load
+ %define XSTR vmovntdq ; non-temporal aligned store
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 ymm0 ; current 32 bytes of source data
+%define xd1 ymm1 ; current 32 bytes of dest1 (loaded, updated, stored back)
+%define xd2 ymm2 ; same for dest2
+%define xd3 ymm3 ; same for dest3
+%define xd4 ymm4 ; same for dest4
+%define xd5 ymm5 ; same for dest5
+%define xgft1 ymm6 ; 8-byte table entry for dest1, broadcast to every qword lane
+%define xgft2 ymm7 ; same for dest2
+%define xgft3 ymm8 ; same for dest3
+%define xgft4 ymm9 ; same for dest4
+%define xgft5 ymm10 ; same for dest5
+%define xret1 ymm11 ; passed to GF_MUL_XOR (gf_vect_gfni.inc) - presumably a scratch/product register, confirm there
+%define xret2 ymm12 ; as xret1, for dest2
+%define xret3 ymm13 ; as xret1, for dest3
+%define xret4 ymm14 ; as xret1, for dest4
+%define xret5 ymm15 ; as xret1, for dest5
+
+;;
+;; Encodes 32 bytes of a single source into 5x 32 bytes (parity disks)
+;;
+%macro ENCODE_32B_5 0
+ ;; get next source vector
+ XLDR x0, [src + pos] ; 32 source bytes at the current offset
+ ;; get next dest vectors
+ XLDR xd1, [dest1 + pos] ; current contents of each dest are read first, since this is an update of existing data
+ XLDR xd2, [dest2 + pos]
+ XLDR xd3, [dest3 + pos]
+ XLDR xd4, [dest4 + pos]
+ XLDR xd5, [dest5 + pos]
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, \
+ xgft3, xret3, xd3, xgft4, xret4, xd4, xgft5, xret5, xd5
+
+ XSTR [dest1 + pos], xd1 ; write back xdN as left by GF_MUL_XOR (presumably xdN ^= x0 * xgftN in GF(2^8) - see gf_vect_gfni.inc)
+ XSTR [dest2 + pos], xd2
+ XSTR [dest3 + pos], xd3
+ XSTR [dest4 + pos], xd4
+ XSTR [dest5 + pos], xd5
+%endmacro
+
+;;
+;; Encodes less than 32 bytes of a single source into 5x parity disks
+;;
+%macro ENCODE_LT_32B_5 1
+%define %%LEN %1 ; number of remaining bytes (the caller passes len, which is 1..31 at that point)
+ ;; get next source vector
+ simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2 ; helper from memcpy.asm; presumably loads only %%LEN bytes using tmp/tmp2 as scratch - confirm there
+ ;; get next dest vectors
+ simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd2, dest2 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd3, dest3 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd4, dest4 + pos, %%LEN, tmp, tmp2
+ simd_load_avx2 xd5, dest5 + pos, %%LEN, tmp, tmp2
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, \
+ xgft3, xret3, xd3, xgft4, xret4, xd4, xgft5, xret5, xd5
+
+ lea dest1, [dest1 + pos] ; fold pos into the pointer; overwrites dest1, so the macro must be the last use of it
+ simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2 ; helper from memcpy.asm; presumably stores only %%LEN bytes - confirm there
+ lea dest2, [dest2 + pos] ; same pointer fold for each remaining dest
+ simd_store_avx2 dest2, xd2, %%LEN, tmp, tmp2
+ lea dest3, [dest3 + pos]
+ simd_store_avx2 dest3, xd3, %%LEN, tmp, tmp2
+ lea dest4, [dest4 + pos]
+ simd_store_avx2 dest4, xd4, %%LEN, tmp, tmp2
+ lea dest5, [dest5 + pos]
+ simd_store_avx2 dest5, xd5, %%LEN, tmp, tmp2
+%endmacro
+
+align 16
+global gf_5vect_mad_avx2_gfni, function
+func(gf_5vect_mad_avx2_gfni)
+ FUNC_SAVE ; save callee-saved registers (and, on win64, load the two stack arguments)
+
+ xor pos, pos ; pos = 0: start at the beginning of src and every dest
+ shl vec_i, 3 ;Multiply by 8: byte offset of entry vec_i (entries are read as 8-byte qwords)
+ shl vec, 3 ;Multiply by 8: byte stride between the entries used for successive dests
+ lea tmp, [mul_array + vec_i] ; tmp -> entry used for dest1
+ lea tmp2, [vec*3] ; 3 * stride, precomputed because a scale of 3 on top of the tmp base is not encodable
+ vbroadcastsd xgft1, [tmp] ; broadcast each 8-byte entry to all four qword lanes
+ vbroadcastsd xgft2, [tmp + vec] ; entry for dest2 (1 * stride)
+ vbroadcastsd xgft3, [tmp + vec*2] ; entry for dest3 (2 * stride)
+ vbroadcastsd xgft4, [tmp + tmp2] ; entry for dest4 (3 * stride)
+ vbroadcastsd xgft5, [tmp + vec*4] ; entry for dest5 (4 * stride)
+ mov dest2, [dest1 + 1*8] ; reuse mul_array
+ mov dest3, [dest1 + 2*8] ; reuse vec_i
+ mov dest4, [dest1 + 3*8] ; dest1 still holds the dest pointer array here
+ mov dest5, [dest1 + 4*8]
+ mov dest1, [dest1] ; loaded last because it overwrites the pointer-array base
+
+ cmp len, 32
+ jl .len_lt_32 ; signed compare: fewer than 32 bytes goes straight to the tail
+
+.loop32: ; invariant: len >= 32 on every entry
+ ENCODE_32B_5 ;; loop on 32 bytes at a time
+
+ add pos, 32 ; advance the shared offset
+ sub len, 32 ; bytes remaining
+ cmp len, 32
+ jge .loop32 ; another full 32-byte block is available
+
+.len_lt_32:
+ cmp len, 0
+ jle .exit ; nothing left (zero or negative length)
+
+ ENCODE_LT_32B_5 len ;; encode final bytes
+
+.exit:
+ vzeroupper ; clear upper ymm halves before returning to code that may use legacy SSE
+
+ FUNC_RESTORE ; undo FUNC_SAVE; must precede ret so rsp points at the return address
+ ret
+
+endproc_frame ; pairs with proc_frame in the win64 func(x); presumably a no-op for elf64 (defined outside this file - confirm in reg_sizes.asm)
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512.asm
new file mode 100644
index 0000000000..26f0964b94
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512.asm
@@ -0,0 +1,287 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_avx512(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define return rax
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define return rax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size ;Room for xmm6-xmm15 and the saved GPRs
+ vmovdqa [rsp+16*0],xmm6 ;xmm6-xmm15 are callee-saved in the Windows x64 ABI
+ vmovdqa [rsp+16*1],xmm7
+ vmovdqa [rsp+16*2],xmm8
+ vmovdqa [rsp+16*3],xmm9
+ vmovdqa [rsp+16*4],xmm10
+ vmovdqa [rsp+16*5],xmm11
+ vmovdqa [rsp+16*6],xmm12
+ vmovdqa [rsp+16*7],xmm13
+ vmovdqa [rsp+16*8],xmm14
+ vmovdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8 ;r12 is used as arg4 (src)
+ save_reg r15, 10*16 + 1*8 ;r15 is used as arg5 (dest1)
+ end_prolog
+ mov arg4, arg(4) ;5th argument (src) is passed on the stack
+ mov arg5, arg(5) ;6th argument (dest pointer array) is passed on the stack
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp+16*0] ;Reload xmm6-xmm15 from the slots written by FUNC_SAVE
+ vmovdqa xmm7, [rsp+16*1]
+ vmovdqa xmm8, [rsp+16*2]
+ vmovdqa xmm9, [rsp+16*3]
+ vmovdqa xmm10, [rsp+16*4]
+ vmovdqa xmm11, [rsp+16*5]
+ vmovdqa xmm12, [rsp+16*6]
+ vmovdqa xmm13, [rsp+16*7]
+ vmovdqa xmm14, [rsp+16*8]
+ vmovdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8] ;Restore the GPRs that carried arg4/arg5
+ mov r15, [rsp + 10*16 + 1*8]
+ add rsp, stack_size ;Release the save area
+%endmacro
+%endif
+
+%define PS 8
+%define len arg0
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define dest2 tmp2
+%define dest3 mul_array
+%define dest4 vec
+%define dest5 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 zmm0
+%define xtmpa zmm1
+%define xtmpl1 zmm2
+%define xtmph1 zmm3
+%define xtmph2 zmm4
+%define xtmph3 zmm5
+%define xgft1_hi zmm6
+%define xgft1_lo zmm7
+%define xgft1_loy ymm7
+%define xgft2_hi zmm8
+%define xgft2_lo zmm9
+%define xgft2_loy ymm9
+%define xgft3_hi zmm10
+%define xgft3_lo zmm11
+%define xgft3_loy ymm11
+%define xgft4_hi zmm12
+%define xgft4_lo zmm13
+%define xgft4_loy ymm13
+%define xgft5_hi zmm14
+%define xgft5_lo zmm15
+%define xgft5_loy ymm15
+%define xd1 zmm16
+%define xd2 zmm17
+%define xd3 zmm18
+%define xd4 zmm19
+%define xd5 zmm20
+%define xmask0f zmm21
+%define xtmpl2 zmm22
+%define xtmpl3 zmm23
+%define xtmpl4 zmm24
+%define xtmpl5 zmm25
+%define xtmph4 zmm26
+%define xtmph5 zmm27
+
+align 16
+global gf_5vect_mad_avx512, function
+func(gf_5vect_mad_avx512)
+ FUNC_SAVE
+ sub len, 64 ;len = length - 64 from here on
+ jl .return_fail ;Lengths below 64 bytes are rejected
+ xor pos, pos ;pos = 0: byte offset into src and dest buffers
+ mov tmp, 0x0f
+ vpbroadcastb xmask0f, tmp ;Construct mask 0x0f0f0f...
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5 ;Multiply by 32
+ lea tmp, [mul_array + vec_i] ;tmp = 32-byte table of this source for dest1
+ vmovdqu xgft1_loy, [tmp] ;Load array Ax{00}..{0f}, Ax{00}..{f0}
+ vmovdqu xgft2_loy, [tmp+vec] ;Load array Bx{00}..{0f}, Bx{00}..{f0}
+ vmovdqu xgft3_loy, [tmp+2*vec] ;Load array Cx{00}..{0f}, Cx{00}..{f0}
+ vmovdqu xgft5_loy, [tmp+4*vec] ;Load array Ex{00}..{0f}, Ex{00}..{f0}
+ add tmp, vec ;3*vec is not an encodable scale: step tmp so tmp+2*vec reaches table D
+ vmovdqu xgft4_loy, [tmp+2*vec] ;Load array Dx{00}..{0f}, Dx{00}..{f0}
+ vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55 ;Replicate 128-bit lane 1 (high-nibble table) to all lanes
+ vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ;Replicate 128-bit lane 0 (low-nibble table) to all lanes
+ vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+ vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+ vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
+ vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
+ vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
+ vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00
+ vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55
+ vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00
+ mov dest2, [dest1+PS]
+ mov dest3, [dest1+2*PS] ; reuse mul_array
+ mov dest4, [dest1+3*PS] ; reuse vec
+ mov dest5, [dest1+4*PS] ; reuse vec_i
+ mov dest1, [dest1] ;Loaded last: dest1 held the pointer array until here
+ mov tmp, -1
+ kmovq k1, tmp ;k1 = all ones: every byte lane active in full passes
+
+.loop64:
+ XLDR x0, [src+pos] ;Get next source vector
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+ XLDR xd3, [dest3+pos] ;Get next dest vector
+ XLDR xd4, [dest4+pos] ;Get next dest vector
+ XLDR xd5, [dest5+pos] ;Get next dest vector
+
+ vpandq xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpandq x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xtmph1 {k1}{z}, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1 {k1}{z}, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxorq xd1, xd1, xtmph1 ;xd1 += partial
+
+ ; dest2
+ vpshufb xtmph2 {k1}{z}, xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2 {k1}{z}, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpxorq xd2, xd2, xtmph2 ;xd2 += partial
+
+ ; dest3
+ vpshufb xtmph3 {k1}{z}, xgft3_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3 {k1}{z}, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
+ vpxorq xd3, xd3, xtmph3 ;xd3 += partial
+
+ ; dest4
+ vpshufb xtmph4 {k1}{z}, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl4 {k1}{z}, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph4, xtmph4, xtmpl4 ;GF add high and low partials
+ vpxorq xd4, xd4, xtmph4 ;xd4 += partial
+
+ ; dest5
+ vpshufb xtmph5 {k1}{z}, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl5 {k1}{z}, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph5, xtmph5, xtmpl5 ;GF add high and low partials
+ vpxorq xd5, xd5, xtmph5 ;xd5 += partial
+
+ XSTR [dest1+pos], xd1
+ XSTR [dest2+pos], xd2
+ XSTR [dest3+pos], xd3
+ XSTR [dest4+pos], xd4
+ XSTR [dest5+pos], xd5
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ cmp pos, len ;len holds length-64: another full block fits while pos <= len
+ jle .loop64
+
+ lea tmp, [len + 64] ;tmp = original length
+ cmp pos, tmp
+ je .return_pass ;Length was a multiple of 64 (or the tail pass just finished)
+
+ ;; Tail len
+ mov pos, (1 << 63)
+ lea tmp, [len + 64 - 1] ;tmp = original length - 1
+ and tmp, 63 ;tmp = (number of tail bytes) - 1
+ sarx pos, pos, tmp ;Arithmetic shift: sets the top tmp+1 bits
+ kmovq k1, pos ;k1 enables only the not-yet-processed top lanes; {z} zeroes the rest so their dest bytes are XORed with 0
+ mov pos, len ;Overlapped offset length-64
+ jmp .loop64 ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0 ;Return 0 on success
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1 ;Return 1: length was below 64 bytes
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_5vect_mad_avx512
+no_gf_5vect_mad_avx512:
+%endif
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512_gfni.asm
new file mode 100644
index 0000000000..d89ecca970
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_avx512_gfni.asm
@@ -0,0 +1,240 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r13
+ %define tmp r11
+ %define tmp2 r10
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + 8 + 8*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size ;Room for xmm6-xmm15 and the saved GPRs
+ vmovdqa [rsp + 16*0], xmm6 ;xmm6-xmm15 are callee-saved in the Windows x64 ABI
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+ mov [rsp + 10*16 + 0*8], r12 ;r12 is used as arg4 (src)
+ mov [rsp + 10*16 + 1*8], r13 ;r13 is used as arg5 (dest1)
+ end_prolog
+ mov arg4, arg(4) ;5th argument (src) is passed on the stack
+ mov arg5, arg(5) ;6th argument (dest pointer array) is passed on the stack
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 16*0] ;Reload xmm6-xmm15 from the slots written by FUNC_SAVE
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov r12, [rsp + 10*16 + 0*8] ;Restore the GPRs that carried arg4/arg5
+ mov r13, [rsp + 10*16 + 1*8]
+ add rsp, stack_size ;Release the save area
+%endmacro
+%endif
+
+%define len arg0
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos rax
+%define dest2 tmp2
+%define dest3 mul_array
+%define dest4 vec
+%define dest5 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 zmm0
+%define xd1 zmm1
+%define xd2 zmm2
+%define xd3 zmm3
+%define xd4 zmm4
+%define xd5 zmm5
+
+%define xgft1 zmm6
+%define xgft2 zmm7
+%define xgft3 zmm8
+%define xgft4 zmm9
+%define xgft5 zmm10
+
+%define xret1 zmm11
+%define xret2 zmm12
+%define xret3 zmm13
+%define xret4 zmm14
+%define xret5 zmm15
+
+;;
+;; Encodes 64 bytes of a single source into 5x 64 bytes (parity disks)
+;;
+%macro ENCODE_64B_5 0-1
+%define %%KMASK %1
+
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector (mask-selected bytes only)
+ vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector (mask-selected bytes only)
+ vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector (mask-selected bytes only)
+ vmovdqu8 xd3{%%KMASK}, [dest3 + pos] ;Get next dest vector (mask-selected bytes only)
+ vmovdqu8 xd4{%%KMASK}, [dest4 + pos] ;Get next dest vector (mask-selected bytes only)
+ vmovdqu8 xd5{%%KMASK}, [dest5 + pos] ;Get next dest vector (mask-selected bytes only)
+%else
+ XLDR x0, [src + pos] ;Get next source vector
+ XLDR xd1, [dest1 + pos] ;Get next dest vector
+ XLDR xd2, [dest2 + pos] ;Get next dest vector
+ XLDR xd3, [dest3 + pos] ;Get next dest vector
+ XLDR xd4, [dest4 + pos] ;Get next dest vector
+ XLDR xd5, [dest5 + pos] ;Get next dest vector
+%endif
+
+ GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3, \
+ xgft4, xret4, xd4, xgft5, xret5, xd5
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xd1 ;Write back only the mask-selected bytes
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xd2 ;Write back only the mask-selected bytes
+ vmovdqu8 [dest3 + pos]{%%KMASK}, xd3 ;Write back only the mask-selected bytes
+ vmovdqu8 [dest4 + pos]{%%KMASK}, xd4 ;Write back only the mask-selected bytes
+ vmovdqu8 [dest5 + pos]{%%KMASK}, xd5 ;Write back only the mask-selected bytes
+%else
+ XSTR [dest1 + pos], xd1 ;Store full 64-byte result
+ XSTR [dest2 + pos], xd2 ;Store full 64-byte result
+ XSTR [dest3 + pos], xd3 ;Store full 64-byte result
+ XSTR [dest4 + pos], xd4 ;Store full 64-byte result
+ XSTR [dest5 + pos], xd5 ;Store full 64-byte result
+%endif
+%endmacro
+align 16
+global gf_5vect_mad_avx512_gfni, function
+func(gf_5vect_mad_avx512_gfni)
+ FUNC_SAVE
+
+ xor pos, pos ;pos = 0: byte offset into src and dest buffers
+ shl vec_i, 3 ;Multiply by 8
+ shl vec, 3 ;Multiply by 8
+ lea tmp, [mul_array + vec_i] ;tmp = mul_array + vec_i*8: entry used for dest1
+ vbroadcastf32x2 xgft1, [tmp] ;Broadcast the 8-byte entry for dest1
+ vbroadcastf32x2 xgft2, [tmp + vec] ;Entry for dest2 (one vec*8 stride further)
+ vbroadcastf32x2 xgft3, [tmp + vec*2] ;Entry for dest3
+ vbroadcastf32x2 xgft5, [tmp + vec*4] ;Entry for dest5
+ add tmp, vec ;3*vec is not an encodable scale: step tmp so tmp+vec*2 reaches the dest4 entry
+ vbroadcastf32x2 xgft4, [tmp + vec*2] ;Entry for dest4
+ mov dest2, [dest1 + 8]
+ mov dest3, [dest1 + 2*8] ; reuse mul_array
+ mov dest4, [dest1 + 3*8] ; reuse vec
+ mov dest5, [dest1 + 4*8] ; reuse vec_i
+ mov dest1, [dest1] ;Loaded last: dest1 held the pointer array until here
+
+ cmp len, 64
+ jl .len_lt_64 ;Fewer than 64 bytes: skip the full-block loop
+.loop64:
+ ENCODE_64B_5
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64 ;len = bytes still to encode
+ cmp len, 64
+ jge .loop64 ;Another full 64-byte block remains
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit ;Nothing left to encode
+
+ xor tmp, tmp
+ bts tmp, len ;tmp = 1 << len (len is 1..63 here)
+ dec tmp ;tmp = (1 << len) - 1: low len bits set
+ kmovq k1, tmp ;k1 selects the remaining len bytes
+
+ ENCODE_64B_5 k1
+
+.exit:
+ vzeroupper
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm
index 15c96bf4dc..072c2b5632 100644
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.asm
@@ -107,7 +107,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -177,13 +177,8 @@ section .text
align 16
-global gf_5vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_5vect_mad_sse, function
func(gf_5vect_mad_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_5vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_5vect_mad_sse)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -373,6 +368,3 @@ mask0f:
dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
-
-;;; func core, ver, snum
-slversion gf_5vect_mad_sse, 00, 01, 020c
diff --git a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.patch
deleted file mode 100644
index 35d5094ffe..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_5vect_mad_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-182,186d181
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_5vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_5vect_mad_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm
index f12798edec..86082e75af 100644
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm
@@ -51,7 +51,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -89,16 +89,16 @@
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
- save_xmm128 xmm6, 0*16
- save_xmm128 xmm7, 1*16
- save_xmm128 xmm8, 2*16
- save_xmm128 xmm9, 3*16
- save_xmm128 xmm10, 4*16
- save_xmm128 xmm11, 5*16
- save_xmm128 xmm12, 6*16
- save_xmm128 xmm13, 7*16
- save_xmm128 xmm14, 8*16
- save_xmm128 xmm15, 9*16
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
@@ -182,13 +182,8 @@ section .text
%define xp6 xmm7
align 16
-global gf_6vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_6vect_dot_prod_avx, function
func(gf_6vect_dot_prod_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_6vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_6vect_dot_prod_avx)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -315,6 +310,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_6vect_dot_prod_avx, 02, 04, 0195
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.patch
deleted file mode 100644
index 42b790fcab..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-187,191d186
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_6vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_6vect_dot_prod_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm
index d5b2543225..ee2d92665e 100644
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm
@@ -53,7 +53,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -187,13 +187,8 @@ section .text
%define xp6 ymm7
align 16
-global gf_6vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_6vect_dot_prod_avx2, function
func(gf_6vect_dot_prod_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_6vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_6vect_dot_prod_avx2)
-%endif
-
FUNC_SAVE
sub len, 32
jl .return_fail
@@ -324,8 +319,3 @@ func(_gf_6vect_dot_prod_avx2)
ret
endproc_frame
-
-section .data
-
-;;; func core, ver, snum
-slversion gf_6vect_dot_prod_avx2, 04, 04, 019a
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.patch
deleted file mode 100644
index 531cd8cdda..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-192,196d191
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_6vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_6vect_dot_prod_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512.asm
new file mode 100644
index 0000000000..531dce90d1
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512.asm
@@ -0,0 +1,353 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r12 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbp ; must be saved and restored
+ %define tmp8 rbx ; must be saved and restored
+ %define return rax
+ %define PS 8
+ %define LOG_PS 3
+ %define stack_size 6*8
+
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size ;Reserve one qword slot per callee-saved register used
+ mov [rsp + 0*8], r12 ;r12 = tmp4
+ mov [rsp + 1*8], r13 ;r13 = tmp3
+ mov [rsp + 2*8], r14 ;r14 = tmp5
+ mov [rsp + 3*8], r15 ;r15 = tmp6
+ mov [rsp + 4*8], rbp ;rbp = tmp7
+ mov [rsp + 5*8], rbx ;rbx = tmp8
+ %endmacro
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8] ;Reload each register from the slot written by FUNC_SAVE
+ mov r13, [rsp + 1*8]
+ mov r14, [rsp + 2*8]
+ mov r15, [rsp + 3*8]
+ mov rbp, [rsp + 4*8]
+ mov rbx, [rsp + 5*8]
+ add rsp, stack_size ;Release the save area
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r14 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbp ; must be saved and restored
+ %define tmp8 rbx ; must be saved and restored
+ %define return rax
+ %define PS 8
+ %define LOG_PS 3
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size ;Room for xmm6-xmm15 and the saved GPRs
+ vmovdqa [rsp + 0*16], xmm6 ;xmm6-xmm15 are callee-saved in the Windows x64 ABI
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ save_reg r12, 10*16 + 0*8 ;r12 = arg4
+ save_reg r13, 10*16 + 1*8 ;r13 = tmp3
+ save_reg r14, 10*16 + 2*8 ;r14 = tmp4
+ save_reg r15, 10*16 + 3*8 ;r15 = arg5
+ save_reg rdi, 10*16 + 4*8 ;rdi = tmp5
+ save_reg rsi, 10*16 + 5*8 ;rsi = tmp6
+ save_reg rbp, 10*16 + 6*8 ;rbp = tmp7
+ save_reg rbx, 10*16 + 7*8 ;rbx = tmp8
+ end_prolog
+ mov arg4, arg(4) ;5th argument (dests) is passed on the stack; arg5 is never loaded, only used as scratch
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ;Reload xmm6-xmm15 from the slots written by FUNC_SAVE
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8] ;Restore the callee-saved GPRs
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbp, [rsp + 10*16 + 6*8]
+ mov rbx, [rsp + 10*16 + 7*8]
+ add rsp, stack_size ;Release the save area
+ %endmacro
+%endif
+
+
+%define len arg0
+%define vec arg1
+%define mul_array arg2
+%define src arg3
+%define dest1 arg4
+%define ptr arg5
+%define vec_i tmp2
+%define dest2 tmp3
+%define dest3 tmp4
+%define dest4 tmp5
+%define vskip3 tmp6
+%define dest5 tmp7
+%define vskip1 tmp8
+%define pos return
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xmask0f zmm20
+%define xgft1_lo zmm19
+%define xgft1_loy ymm19
+%define xgft1_hi zmm18
+%define xgft2_lo zmm17
+%define xgft2_loy ymm17
+%define xgft2_hi zmm16
+%define xgft3_lo zmm15
+%define xgft3_loy ymm15
+%define xgft3_hi zmm14
+%define xgft4_lo zmm13
+%define xgft4_loy ymm13
+%define xgft4_hi zmm12
+%define xgft5_lo zmm11
+%define xgft5_loy ymm11
+%define xgft5_hi zmm10
+%define xgft6_lo zmm9
+%define xgft6_loy ymm9
+%define xgft6_hi zmm8
+
+%define x0 zmm0
+%define xtmpa zmm1
+%define xp1 zmm2
+%define xp2 zmm3
+%define xp3 zmm4
+%define xp4 zmm5
+%define xp5 zmm6
+%define xp6 zmm7
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global gf_6vect_dot_prod_avx512, function
+func(gf_6vect_dot_prod_avx512)
+ FUNC_SAVE
+ sub len, 64 ;len = length - 64 from here on
+ jl .return_fail ;Lengths below 64 bytes are rejected
+
+ xor pos, pos ;pos = 0: byte offset into source and dest buffers
+ mov tmp, 0x0f
+ vpbroadcastb xmask0f, tmp ;Construct mask 0x0f0f0f...
+ mov vskip1, vec
+ imul vskip1, 32 ;vskip1 = vec*32: byte stride between consecutive dests' tables
+ mov vskip3, vec
+ imul vskip3, 96 ;vskip3 = vec*96 = 3*vskip1: offset of the 4th dest's tables
+ sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ mov dest2, [dest1+PS]
+ mov dest3, [dest1+2*PS]
+ mov dest4, [dest1+3*PS]
+ mov dest5, [dest1+4*PS] ;dest1 keeps pointing at the dests array; outputs 1 and 6 are fetched before the stores
+
+.loop64:
+ vpxorq xp1, xp1, xp1 ;Clear the six accumulators for this 64-byte block
+ vpxorq xp2, xp2, xp2
+ vpxorq xp3, xp3, xp3
+ vpxorq xp4, xp4, xp4
+ vpxorq xp5, xp5, xp5
+ vpxorq xp6, xp6, xp6
+ mov tmp, mul_array ;tmp walks the tables, 32 bytes per source
+ xor vec_i, vec_i ;vec_i = 0: byte offset into the src pointer array
+
+.next_vect:
+ mov ptr, [src+vec_i] ;ptr = pointer to the current source buffer
+ XLDR x0, [ptr+pos] ;Get next source vector
+ add vec_i, PS
+
+ vpandq xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpandq x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ vmovdqu8 xgft1_loy, [tmp] ;Load array Ax{00}..{0f}, Ax{00}..{f0}
+ vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)] ;Load array Bx{00}..{0f}, Bx{00}..{f0}
+ vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)] ;Load array Cx{00}..{0f}, Cx{00}..{f0}
+ vmovdqu8 xgft4_loy, [tmp+vskip3] ;Load array Dx{00}..{0f}, Dx{00}..{f0}
+ vmovdqu8 xgft5_loy, [tmp+vskip1*4] ;Load array Ex{00}..{0f}, Ex{00}..{f0}
+ lea ptr, [vskip1 + vskip1*4] ;ptr = vskip5
+ vmovdqu8 xgft6_loy, [tmp+ptr] ;Load array Fx{00}..{0f}, Fx{00}..{f0}
+ add tmp, 32 ;Advance to the next source's tables
+
+ vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55 ;Replicate 128-bit lane 1 (high-nibble table) to all lanes
+ vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ;Replicate 128-bit lane 0 (low-nibble table) to all lanes
+ vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+ vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+
+ vpshufb xgft1_hi, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft1_hi, xgft1_hi, xgft1_lo ;GF add high and low partials
+ vpxorq xp1, xp1, xgft1_hi ;xp1 += partial
+
+ vpshufb xgft2_hi, xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft2_hi, xgft2_hi, xgft2_lo ;GF add high and low partials
+ vpxorq xp2, xp2, xgft2_hi ;xp2 += partial
+
+ vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
+ vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
+ vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
+ vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00
+
+ vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft3_hi, xgft3_hi, xgft3_lo ;GF add high and low partials
+ vpxorq xp3, xp3, xgft3_hi ;xp3 += partial
+
+ vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
+ vpxorq xp4, xp4, xgft4_hi ;xp4 += partial
+
+ vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55
+ vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00
+
+ vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
+ vpxorq xp5, xp5, xgft5_hi ;xp5 += partial
+
+ vshufi64x2 xgft6_hi, xgft6_lo, xgft6_lo, 0x55
+ vshufi64x2 xgft6_lo, xgft6_lo, xgft6_lo, 0x00
+
+ vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
+ vpxorq xp6, xp6, xgft6_hi ;xp6 += partial
+
+ cmp vec_i, vec ;vec was scaled by PS, so this compares byte offsets
+ jl .next_vect
+
+ mov ptr, [dest1] ;reuse ptr
+ mov tmp, [dest1+5*PS] ;reuse tmp
+
+ XSTR [dest2+pos], xp2
+ XSTR [dest3+pos], xp3
+ XSTR [dest4+pos], xp4
+ XSTR [dest5+pos], xp5
+
+ XSTR [ptr+pos], xp1 ;ptr = 1st output buffer
+ XSTR [tmp+pos], xp6 ;tmp = 6th output buffer
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ cmp pos, len ;len holds length-64: another full block fits while pos <= len
+ jle .loop64
+
+ lea tmp, [len + 64] ;tmp = original length
+ cmp pos, tmp
+ je .return_pass ;Length was a multiple of 64 (or the tail pass just finished)
+
+ ;; Tail len
+ mov pos, len ;Overlapped offset length-64
+ jmp .loop64 ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0 ;Return 0 on success
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1 ;Return 1: length was below 64 bytes
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_6vect_dot_prod_avx512
+no_gf_6vect_dot_prod_avx512:
+%endif
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512_gfni.asm
new file mode 100644
index 0000000000..b4b8c654bb
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_avx512_gfni.asm
@@ -0,0 +1,292 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; System V AMD64: integer args arrive in rdi, rsi, rdx, rcx, r8, r9
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9 ; the function takes only five arguments, so r9 serves as scratch (ptr)
+ ; Scratch registers: r10/r11 are caller-saved; the others below are callee-saved.
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; callee-saved: must be saved and restored
+ %define tmp4 r12 ; callee-saved: must be saved and restored
+ %define tmp5 r14 ; callee-saved: must be saved and restored
+ %define tmp6 r15 ; callee-saved: must be saved and restored
+ %define tmp7 rbp ; callee-saved: must be saved and restored
+ %define tmp8 rbx ; callee-saved: must be saved and restored
+
+ %define func(x) x: endbranch ; entry label + endbranch (not defined in this file; presumably from reg_sizes.asm)
+ %macro FUNC_SAVE 0
+ push r12 ; preserve every callee-saved GPR used as tmp3..tmp8
+ push r13
+ push r14
+ push r15
+ push rbp
+ push rbx
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbx ; pops mirror FUNC_SAVE in exact reverse order
+ pop rbp
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx ; Microsoft x64: first four integer args arrive in rcx, rdx, r8, r9
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ ; The 5th argument is passed on the stack; FUNC_SAVE loads it into r12.
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+ %define tmp r11 ; r10/r11 are caller-saved, no save needed
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ %define tmp4 r14 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbp ; must be saved and restored
+ %define tmp8 rbx ; must be saved and restored
+ %define stack_size 7*16 + 9*8 ; must be an odd multiple of 8 so rsp ends up 16-byte aligned for vmovdqa (7 xmm slots + 8 GPR slots + 1 pad)
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; caller argument slot x after FUNC_SAVE: skips this frame and the 8-byte return address
+
+ %define func(x) proc_frame x ; proc_frame/alloc_stack/end_prolog are not defined in this file (presumably unwind helpers from reg_sizes.asm)
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6-xmm12 are callee-saved on Windows and alias zmm6-zmm12 used by this file
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ mov [rsp + 7*16 + 0*8], r12 ; GPR save area starts right after the seven xmm slots
+ mov [rsp + 7*16 + 1*8], r13
+ mov [rsp + 7*16 + 2*8], r14
+ mov [rsp + 7*16 + 3*8], r15
+ mov [rsp + 7*16 + 4*8], rdi
+ mov [rsp + 7*16 + 5*8], rsi
+ mov [rsp + 7*16 + 6*8], rbp
+ mov [rsp + 7*16 + 7*8], rbx
+ end_prolog
+ mov arg4, arg(4) ; fetch the 5th argument (dests) from the caller's stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; reload everything FUNC_SAVE stored, from the same slots
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ mov r12, [rsp + 7*16 + 0*8]
+ mov r13, [rsp + 7*16 + 1*8]
+ mov r14, [rsp + 7*16 + 2*8]
+ mov r15, [rsp + 7*16 + 3*8]
+ mov rdi, [rsp + 7*16 + 4*8]
+ mov rsi, [rsp + 7*16 + 5*8]
+ mov rbp, [rsp + 7*16 + 6*8]
+ mov rbx, [rsp + 7*16 + 7*8]
+ add rsp, stack_size ; release the frame; rsp points at the return address again
+ %endmacro
+%endif
+
+
+%define len arg0 ; bytes still to process; decremented by 64 per main-loop pass
+%define vec arg1 ; number of source buffers; multiplied by 8 at function entry
+%define mul_array arg2 ; base of the g_tbls argument
+%define src arg3 ; array of source buffer pointers (buffs)
+%define dest1 arg4 ; the dests pointer array itself, not destination 1
+%define ptr arg5 ; scratch: current source pointer, later destination 1
+%define vec_i tmp2 ; byte offset into src[], advances by 8 per source
+%define dest2 tmp3 ; destination pointers 2..5 are cached in registers
+%define dest3 tmp4
+%define dest4 tmp5
+%define vskip3 tmp6 ; vec * 3*8, computed at function entry
+%define dest5 tmp7
+%define vskip5 tmp8 ; vec * 5*8, computed at function entry
+%define pos rax ; byte offset applied to every source and destination buffer
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use Non-temporal load/store, or plain aligned moves when NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xgft1 zmm7 ; xgft1..6: 8-byte table entry for outputs 1..6, broadcast across the register
+%define xgft2 zmm8
+%define xgft3 zmm9
+%define xgft4 zmm10
+%define xgft5 zmm11
+%define xgft6 zmm12
+
+%define x0 zmm0 ; current block of source data (up to 64 bytes)
+%define xp1 zmm1 ; xp1..6: accumulators that are stored to destinations 1..6
+%define xp2 zmm2
+%define xp3 zmm3
+%define xp4 zmm4
+%define xp5 zmm5
+%define xp6 zmm6
+
+default rel
+[bits 64]
+
+section .text
+
+;; ENCODE_64B_6 [kmask]: one pass over bytes pos..pos+63 of every source buffer.
+;; Encodes 64 bytes of all "k" sources into 6x 64 bytes (parity disks)
+;; With a kmask argument the source load and all six stores are byte-masked. Clobbers tmp, vec_i, ptr, x0, xp1-6, xgft1-6.
+%macro ENCODE_64B_6 0-1
+%define %%KMASK %1
+
+ vpxorq xp1, xp1, xp1 ; clear the six accumulators
+ vpxorq xp2, xp2, xp2
+ vpxorq xp3, xp3, xp3
+ vpxorq xp4, xp4, xp4
+ vpxorq xp5, xp5, xp5
+ vpxorq xp6, xp6, xp6
+ mov tmp, mul_array ; tmp walks the table, advancing 8 bytes per source
+ xor vec_i, vec_i ; start with the first source pointer
+
+%%next_vect:
+ mov ptr, [src + vec_i] ; ptr = current source buffer
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes)
+%else
+ XLDR x0, [ptr + pos] ;Get next source vector (64 bytes)
+%endif
+ add vec_i, 8 ; step to the next 8-byte pointer in src[]
+
+ vbroadcastf32x2 xgft1, [tmp] ; replicate this source's 8-byte table entry for output 1
+ vbroadcastf32x2 xgft2, [tmp + vec] ; output 2: per-output rows are vec bytes apart (vec is pre-scaled by 8 by the caller)
+ vbroadcastf32x2 xgft3, [tmp + vec*2]
+ vbroadcastf32x2 xgft4, [tmp + vskip3] ; x3 is not an encodable index scale, hence the precomputed vskip3
+ vbroadcastf32x2 xgft5, [tmp + vec*4]
+ vbroadcastf32x2 xgft6, [tmp + vskip5] ; likewise x5 uses the precomputed vskip5
+ add tmp, 8
+ ; GF_MUL_XOR is not defined in this file (presumably gf_vect_gfni.inc): it combines x0 with each xgftN into xpN -- confirm there
+ GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \
+ xgft4, xgft4, xp4, xgft5, xgft5, xp5, xgft6, xgft6, xp6
+
+ cmp vec_i, vec ; loop until every source has been consumed
+ jl %%next_vect
+
+ mov ptr, [dest1] ;reuse ptr: now the first destination pointer
+ mov tmp, [dest1 + 5*8] ;reuse tmp: now the sixth destination pointer
+
+%if %0 == 1
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
+ vmovdqu8 [dest3 + pos]{%%KMASK}, xp3
+ vmovdqu8 [dest4 + pos]{%%KMASK}, xp4
+ vmovdqu8 [dest5 + pos]{%%KMASK}, xp5
+ vmovdqu8 [ptr + pos]{%%KMASK}, xp1 ; dest 1
+ vmovdqu8 [tmp + pos]{%%KMASK}, xp6 ; dest 6
+%else
+ XSTR [dest2 + pos], xp2
+ XSTR [dest3 + pos], xp3
+ XSTR [dest4 + pos], xp4
+ XSTR [dest5 + pos], xp5
+ XSTR [ptr + pos], xp1 ; dest 1
+ XSTR [tmp + pos], xp6 ; dest 6
+%endif
+%endmacro
+
+align 16
+global gf_6vect_dot_prod_avx512_gfni, function
+func(gf_6vect_dot_prod_avx512_gfni)
+ FUNC_SAVE
+ ; Setup: zero the byte offset, precompute table strides, cache destination pointers 2..5.
+ xor pos, pos
+ mov vskip3, vec
+ imul vskip3, 3*8 ; vskip3 = vec*24: table offset used for output 4
+ mov vskip5, vec
+ imul vskip5, 5*8 ; vskip5 = vec*40: table offset used for output 6
+ shl vec, 3 ;vec *= 8. Make vec_i count by 8
+ mov dest2, [dest1 + 8]
+ mov dest3, [dest1 + 2*8]
+ mov dest4, [dest1 + 3*8]
+ mov dest5, [dest1 + 4*8] ;dest1 and dest6 are calculated later
+
+ cmp len, 64
+ jl .len_lt_64 ; fewer than 64 bytes: go straight to the masked tail
+
+.loop64:
+
+ ENCODE_64B_6
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64 ; len now holds the bytes still remaining
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit ; nothing left (or len was non-positive): done
+
+ xor tmp, tmp ; build tmp = (1 << len) - 1: one mask bit per remaining byte
+ bts tmp, len
+ dec tmp
+ kmovq k1, tmp
+ ; Masked pass over the final len (1..63) bytes.
+ ENCODE_64B_6 k1
+
+.exit:
+ vzeroupper ; clear upper vector state before returning to the caller
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm
index 5dea0be18e..b877411da8 100644
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm
@@ -51,7 +51,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -182,13 +182,8 @@ section .text
%define xp6 xmm13
align 16
-global gf_6vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_6vect_dot_prod_sse, function
func(gf_6vect_dot_prod_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_6vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_6vect_dot_prod_sse)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -315,6 +310,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_6vect_dot_prod_sse, 00, 05, 0066
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.patch
deleted file mode 100644
index 1255245edf..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-187,191d186
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_6vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_6vect_dot_prod_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.c b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.c
deleted file mode 100644
index f0075a00e8..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.c
+++ /dev/null
@@ -1,911 +0,0 @@
-/**********************************************************************
- Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-**********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> // for memset, memcmp
-#include "erasure_code.h"
-// #include "types.h"
-
-#ifndef FUNCTION_UNDER_TEST
-# define FUNCTION_UNDER_TEST gf_6vect_dot_prod_sse
-#endif
-#ifndef TEST_MIN_SIZE
-# define TEST_MIN_SIZE 16
-#endif
-
-#define str(s) #s
-#define xstr(s) str(s)
-
-#define TEST_LEN 2048
-#define TEST_SIZE (TEST_LEN/2)
-#define TEST_MEM TEST_SIZE
-#define TEST_LOOPS 1000
-#define TEST_TYPE_STR ""
-
-#ifndef TEST_SOURCES
-# define TEST_SOURCES 16
-#endif
-#ifndef RANDOMS
-# define RANDOMS 20
-#endif
-
-#ifdef EC_ALIGNED_ADDR
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 0
-# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
-#else
-// Define power of 2 range to check ptr, len alignment
-# define PTR_ALIGN_CHK_B 32
-# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
-#endif
-
-typedef unsigned char u8;
-
-void dump(unsigned char *buf, int len)
-{
- int i;
- for (i = 0; i < len;) {
- printf(" %2x", 0xff & buf[i++]);
- if (i % 32 == 0)
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_matrix(unsigned char **s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", s[i][j]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-void dump_u8xu8(unsigned char *s, int k, int m)
-{
- int i, j;
- for (i = 0; i < k; i++) {
- for (j = 0; j < m; j++) {
- printf(" %2x", 0xff & s[j + (i * m)]);
- }
- printf("\n");
- }
- printf("\n");
-}
-
-int main(int argc, char *argv[])
-{
- int i, j, rtest, srcs;
- void *buf;
- u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
- u8 g4[TEST_SOURCES], g5[TEST_SOURCES], g6[TEST_SOURCES], *g_tbls;
- u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest6, *dest_ref1;
- u8 *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5, *dest_ref6;
- u8 *dest_ptrs[6], *buffs[TEST_SOURCES];
-
- int align, size;
- unsigned char *efence_buffs[TEST_SOURCES];
- unsigned int offset;
- u8 *ubuffs[TEST_SOURCES];
- u8 *udest_ptrs[6];
- printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
-
- // Allocate the arrays
- for (i = 0; i < TEST_SOURCES; i++) {
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- buffs[i] = buf;
- }
-
- if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) {
- printf("alloc error: Fail");
- return -1;
- }
- g_tbls = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest3 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest4 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest5 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest6 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref1 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref2 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref3 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref4 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref5 = buf;
-
- if (posix_memalign(&buf, 64, TEST_LEN)) {
- printf("alloc error: Fail");
- return -1;
- }
- dest_ref6 = buf;
-
- dest_ptrs[0] = dest1;
- dest_ptrs[1] = dest2;
- dest_ptrs[2] = dest3;
- dest_ptrs[3] = dest4;
- dest_ptrs[4] = dest5;
- dest_ptrs[5] = dest6;
-
- // Test of all zeros
- for (i = 0; i < TEST_SOURCES; i++)
- memset(buffs[i], 0, TEST_LEN);
-
- memset(dest1, 0, TEST_LEN);
- memset(dest2, 0, TEST_LEN);
- memset(dest3, 0, TEST_LEN);
- memset(dest4, 0, TEST_LEN);
- memset(dest5, 0, TEST_LEN);
- memset(dest6, 0, TEST_LEN);
- memset(dest_ref1, 0, TEST_LEN);
- memset(dest_ref2, 0, TEST_LEN);
- memset(dest_ref3, 0, TEST_LEN);
- memset(dest_ref4, 0, TEST_LEN);
- memset(dest_ref5, 0, TEST_LEN);
- memset(dest_ref6, 0, TEST_LEN);
- memset(g1, 2, TEST_SOURCES);
- memset(g2, 1, TEST_SOURCES);
- memset(g3, 7, TEST_SOURCES);
- memset(g4, 9, TEST_SOURCES);
- memset(g5, 4, TEST_SOURCES);
- memset(g6, 0xe6, TEST_SOURCES);
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]);
- gf_vect_mul_init(g6[i], &g_tbls[160 * TEST_SOURCES + i * 32]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
- dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
- dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
- dest_ref4);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
- dest_ref5);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], buffs,
- dest_ref6);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(dest5, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
- printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test6\n");
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref6, 25);
- printf("dprod_dut:");
- dump(dest6, 25);
- return -1;
- }
- putchar('.');
-
- // Rand data test
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- g6[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- buffs, dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
- buffs, dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
- buffs, dest_ref4);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
- buffs, dest_ref5);
- gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
- buffs, dest_ref6);
-
- FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(dest5, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref6, 25);
- printf("dprod_dut:");
- dump(dest6, 25);
- return -1;
- }
-
- putchar('.');
- }
-
- // Rand data test with varied parameters
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
- for (i = 0; i < srcs; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- g6[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
- gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
- dest_ref2);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
- dest_ref3);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
- dest_ref4);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs,
- dest_ref5);
- gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[160 * srcs], buffs,
- dest_ref6);
-
- FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test1 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest1, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test2 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest2, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test3 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest3, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test4 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest4, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test5 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(dest5, 25);
- return -1;
- }
- if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
- " test6 srcs=%d\n", srcs);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref6, 25);
- printf("dprod_dut:");
- dump(dest6, 25);
- return -1;
- }
-
- putchar('.');
- }
- }
-
- // Run tests at end of buffer for Electric Fence
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
- for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
- for (i = 0; i < TEST_SOURCES; i++)
- for (j = 0; j < TEST_LEN; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
- efence_buffs[i] = buffs[i] + TEST_LEN - size;
-
- for (i = 0; i < TEST_SOURCES; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- g6[i] = rand();
- }
-
- for (i = 0; i < TEST_SOURCES; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
- gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
- efence_buffs, dest_ref2);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
- efence_buffs, dest_ref3);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
- efence_buffs, dest_ref4);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
- efence_buffs, dest_ref5);
- gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
- efence_buffs, dest_ref6);
-
- FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
-
- if (0 != memcmp(dest_ref1, dest1, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, align);
- printf("dprod_dut:");
- dump(dest1, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref2, dest2, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, align);
- printf("dprod_dut:");
- dump(dest2, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref3, dest3, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, align);
- printf("dprod_dut:");
- dump(dest3, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref4, dest4, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, align);
- printf("dprod_dut:");
- dump(dest4, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref5, dest5, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, align);
- printf("dprod_dut:");
- dump(dest5, align);
- return -1;
- }
-
- if (0 != memcmp(dest_ref6, dest6, size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest);
- dump_matrix(efence_buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref6, align);
- printf("dprod_dut:");
- dump(dest6, align);
- return -1;
- }
-
- putchar('.');
- }
-
- // Test rand ptr alignment if available
-
- for (rtest = 0; rtest < RANDOMS; rtest++) {
- size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
- srcs = rand() % TEST_SOURCES;
- if (srcs == 0)
- continue;
-
- offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
- // Add random offsets
- for (i = 0; i < srcs; i++)
- ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset));
- udest_ptrs[5] = dest6 + (rand() & (PTR_ALIGN_CHK_B - offset));
-
- memset(dest1, 0, TEST_LEN); // zero pad to check write-over
- memset(dest2, 0, TEST_LEN);
- memset(dest3, 0, TEST_LEN);
- memset(dest4, 0, TEST_LEN);
- memset(dest5, 0, TEST_LEN);
- memset(dest6, 0, TEST_LEN);
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- ubuffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- g6[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
- gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], ubuffs, dest_ref6);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
-
- if (memcmp(dest_ref1, udest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, udest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[1], 25);
- return -1;
- }
- if (memcmp(dest_ref3, udest_ptrs[2], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[2], 25);
- return -1;
- }
- if (memcmp(dest_ref4, udest_ptrs[3], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[3], 25);
- return -1;
- }
- if (memcmp(dest_ref5, udest_ptrs[4], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[4], 25);
- return -1;
- }
- if (memcmp(dest_ref6, udest_ptrs[5], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
- srcs);
- dump_matrix(ubuffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref6, 25);
- printf("dprod_dut:");
- dump(udest_ptrs[5], 25);
- return -1;
- }
- // Confirm that padding around dests is unchanged
- memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
- offset = udest_ptrs[0] - dest1;
-
- if (memcmp(dest1, dest_ref1, offset)) {
- printf("Fail rand ualign pad1 start\n");
- return -1;
- }
- if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad1 end\n");
- return -1;
- }
-
- offset = udest_ptrs[1] - dest2;
- if (memcmp(dest2, dest_ref1, offset)) {
- printf("Fail rand ualign pad2 start\n");
- return -1;
- }
- if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad2 end\n");
- return -1;
- }
-
- offset = udest_ptrs[2] - dest3;
- if (memcmp(dest3, dest_ref1, offset)) {
- printf("Fail rand ualign pad3 start\n");
- return -1;
- }
- if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad3 end\n");
- return -1;
- }
-
- offset = udest_ptrs[3] - dest4;
- if (memcmp(dest4, dest_ref1, offset)) {
- printf("Fail rand ualign pad4 start\n");
- return -1;
- }
- if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad4 end\n");
- return -1;
- }
-
- offset = udest_ptrs[4] - dest5;
- if (memcmp(dest5, dest_ref1, offset)) {
- printf("Fail rand ualign pad5 start\n");
- return -1;
- }
- if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad5 end\n");
- return -1;
- }
-
- offset = udest_ptrs[5] - dest6;
- if (memcmp(dest6, dest_ref1, offset)) {
- printf("Fail rand ualign pad6 start\n");
- return -1;
- }
- if (memcmp(dest6 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
- printf("Fail rand ualign pad6 end\n");
- return -1;
- }
-
- putchar('.');
- }
-
- // Test all size alignment
- align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
-
- for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
- srcs = TEST_SOURCES;
-
- for (i = 0; i < srcs; i++)
- for (j = 0; j < size; j++)
- buffs[i][j] = rand();
-
- for (i = 0; i < srcs; i++) {
- g1[i] = rand();
- g2[i] = rand();
- g3[i] = rand();
- g4[i] = rand();
- g5[i] = rand();
- g6[i] = rand();
- }
-
- for (i = 0; i < srcs; i++) {
- gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
- gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
- gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
- gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
- gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
- gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
- }
-
- gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5);
- gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], buffs, dest_ref6);
-
- FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
-
- if (memcmp(dest_ref1, dest_ptrs[0], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref1, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[0], 25);
- return -1;
- }
- if (memcmp(dest_ref2, dest_ptrs[1], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref2, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[1], 25);
- return -1;
- }
- if (memcmp(dest_ref3, dest_ptrs[2], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref3, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[2], 25);
- return -1;
- }
- if (memcmp(dest_ref4, dest_ptrs[3], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref4, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[3], 25);
- return -1;
- }
- if (memcmp(dest_ref5, dest_ptrs[4], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref5, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[4], 25);
- return -1;
- }
- if (memcmp(dest_ref6, dest_ptrs[5], size)) {
- printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
- size);
- dump_matrix(buffs, 5, TEST_SOURCES);
- printf("dprod_base:");
- dump(dest_ref6, 25);
- printf("dprod_dut:");
- dump(dest_ptrs[5], 25);
- return -1;
- }
- }
-
- printf("Pass\n");
- return 0;
-
-}
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.patch
deleted file mode 100644
index 21bbfaa667..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_dot_prod_sse_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm
index c9d7e57472..13a9af79c2 100644
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.asm
@@ -111,7 +111,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -184,13 +184,8 @@ section .text
align 16
-global gf_6vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_6vect_mad_avx, function
func(gf_6vect_mad_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_6vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_6vect_mad_avx)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -394,6 +389,3 @@ align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
-
-;;; func core, ver, snum
-slversion gf_6vect_mad_avx, 02, 01, 0210
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.patch
deleted file mode 100644
index d5afa0f167..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-189,193d188
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_6vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_6vect_mad_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm
index 2b6babcba5..5f0b3477bb 100644
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.asm
@@ -107,7 +107,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
%endmacro
@@ -177,13 +177,8 @@ section .text
%define xd6 xd1
align 16
-global gf_6vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_6vect_mad_avx2, function
func(gf_6vect_mad_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_6vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_6vect_mad_avx2)
-%endif
-
FUNC_SAVE
sub len, 32
jl .return_fail
@@ -400,6 +395,3 @@ align 32
constip32:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
dq 0xe8e9eaebecedeeef, 0xe0e1e2e3e4e5e6e7
-
-;;; func core, ver, snum
-slversion gf_6vect_mad_avx2, 04, 01, 0211
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.patch
deleted file mode 100644
index add3a8e3a2..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-182,186d181
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_6vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_6vect_mad_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512.asm
new file mode 100644
index 0000000000..2d18c529be
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512.asm
@@ -0,0 +1,321 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_mad_avx512(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12 ;must be saved and restored
+ %define return rax
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ push r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13
+ %define return rax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ vmovdqa [rsp+16*0],xmm6
+ vmovdqa [rsp+16*1],xmm7
+ vmovdqa [rsp+16*2],xmm8
+ vmovdqa [rsp+16*3],xmm9
+ vmovdqa [rsp+16*4],xmm10
+ vmovdqa [rsp+16*5],xmm11
+ vmovdqa [rsp+16*6],xmm12
+ vmovdqa [rsp+16*7],xmm13
+ vmovdqa [rsp+16*8],xmm14
+ vmovdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r15, 10*16 + 1*8
+ save_reg r13, 10*16 + 2*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp+16*0]
+ vmovdqa xmm7, [rsp+16*1]
+ vmovdqa xmm8, [rsp+16*2]
+ vmovdqa xmm9, [rsp+16*3]
+ vmovdqa xmm10, [rsp+16*4]
+ vmovdqa xmm11, [rsp+16*5]
+ vmovdqa xmm12, [rsp+16*6]
+ vmovdqa xmm13, [rsp+16*7]
+ vmovdqa xmm14, [rsp+16*8]
+ vmovdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r15, [rsp + 10*16 + 1*8]
+ mov r13, [rsp + 10*16 + 2*8]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define PS 8
+%define len arg0
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define dest2 tmp3
+%define dest3 tmp2
+%define dest4 mul_array
+%define dest5 vec
+%define dest6 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 zmm0
+%define xtmpa zmm1
+%define xtmpl1 zmm2
+%define xtmph1 zmm3
+%define xgft1_hi zmm4
+%define xgft1_lo zmm5
+%define xgft1_loy ymm5
+%define xgft2_hi zmm6
+%define xgft2_lo zmm7
+%define xgft2_loy ymm7
+%define xgft3_hi zmm8
+%define xgft3_lo zmm9
+%define xgft3_loy ymm9
+%define xgft4_hi zmm10
+%define xgft4_lo zmm11
+%define xgft4_loy ymm11
+%define xgft5_hi zmm12
+%define xgft5_lo zmm13
+%define xgft5_loy ymm13
+%define xgft6_hi zmm14
+%define xgft6_lo zmm15
+%define xgft6_loy ymm15
+%define xd1 zmm16
+%define xd2 zmm17
+%define xd3 zmm18
+%define xd4 zmm19
+%define xd5 zmm20
+%define xd6 zmm21
+%define xmask0f zmm22
+%define xtmpl2 zmm23
+%define xtmpl3 zmm24
+%define xtmpl4 zmm25
+%define xtmpl5 zmm26
+%define xtmph2 zmm27
+%define xtmph3 zmm28
+%define xtmph4 zmm29
+%define xtmph5 zmm30
+%define xtmph6 zmm31
+
+align 16
+global gf_6vect_mad_avx512, function
+func(gf_6vect_mad_avx512)
+ FUNC_SAVE
+ sub len, 64
+ jl .return_fail
+ xor pos, pos
+ mov tmp, 0x0f
+ vpbroadcastb xmask0f, tmp ;Construct mask 0x0f0f0f...
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5 ;Multiply by 32
+ lea tmp, [mul_array + vec_i]
+ mov vec_i, vec
+ mov mul_array, vec
+ sal vec_i, 1 ;vec_i=vec*64
+ sal mul_array, 1 ;mul_array=vec*64
+ add vec_i, vec ;vec_i=vec*96
+ add mul_array, vec_i ;mul_array=vec*160
+
+ vmovdqu xgft1_loy, [tmp] ;Load array Ax{00}..{0f}, Ax{00}..{f0}
+ vmovdqu xgft2_loy, [tmp+vec] ;Load array Bx{00}..{0f}, Bx{00}..{f0}
+ vmovdqu xgft3_loy, [tmp+2*vec] ;Load array Cx{00}..{0f}, Cx{00}..{f0}
+ vmovdqu xgft4_loy, [tmp+vec_i] ;Load array Dx{00}..{0f}, Dx{00}..{f0}
+ vmovdqu xgft5_loy, [tmp+4*vec] ;Load array Ex{00}..{0f}, Ex{00}..{f0}
+ vmovdqu xgft6_loy, [tmp+mul_array] ;Load array Fx{00}..{0f}, Fx{00}..{f0}
+
+ vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55
+ vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00
+ vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+ vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+ vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
+ vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
+ vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
+ vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00
+ vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55
+ vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00
+ vshufi64x2 xgft6_hi, xgft6_lo, xgft6_lo, 0x55
+ vshufi64x2 xgft6_lo, xgft6_lo, xgft6_lo, 0x00
+
+ mov dest2, [dest1+PS]
+ mov dest3, [dest1+2*PS]
+ mov dest4, [dest1+3*PS] ; reuse mul_array
+ mov dest5, [dest1+4*PS] ; reuse vec
+ mov dest6, [dest1+5*PS] ; reuse vec_i
+ mov dest1, [dest1]
+ mov tmp, -1
+ kmovq k1, tmp
+
+.loop64:
+ XLDR x0, [src+pos] ;Get next source vector
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+ XLDR xd3, [dest3+pos] ;Get next dest vector
+ XLDR xd4, [dest4+pos] ;Get next dest vector
+ XLDR xd5, [dest5+pos] ;Get next dest vector
+ XLDR xd6, [dest6+pos] ;Get next dest vector
+
+ vpandq xtmpa, x0, xmask0f ;Mask low src nibble in bits 3-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 3-0
+ vpandq x0, x0, xmask0f ;Mask high src nibble in bits 3-0
+
+ ; dest1
+ vpshufb xtmph1 {k1}{z}, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1 {k1}{z}, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxorq xd1, xd1, xtmph1 ;xd1 += partial
+
+ ; dest2
+ vpshufb xtmph2 {k1}{z}, xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2 {k1}{z}, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpxorq xd2, xd2, xtmph2 ;xd2 += partial
+
+ ; dest3
+ vpshufb xtmph3 {k1}{z}, xgft3_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3 {k1}{z}, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
+ vpxorq xd3, xd3, xtmph3 ;xd3 += partial
+
+ ; dest4
+ vpshufb xtmph4 {k1}{z}, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl4 {k1}{z}, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph4, xtmph4, xtmpl4 ;GF add high and low partials
+ vpxorq xd4, xd4, xtmph4 ;xd4 += partial
+
+ ; dest5
+ vpshufb xtmph5 {k1}{z}, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl5 {k1}{z}, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxorq xtmph5, xtmph5, xtmpl5 ;GF add high and low partials
+ vpxorq xd5, xd5, xtmph5 ;xd5 += partial
+
+ ; dest6
+ vpshufb xtmph6 {k1}{z}, xgft6_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl5 {k1}{z}, xgft6_lo, xtmpa ;Lookup mul table of low nibble. Reuse xtmpl5
+ vpxorq xtmph6, xtmph6, xtmpl5 ;GF add high and low partials.
+ vpxorq xd6, xd6, xtmph6 ;xd6 += partial
+
+ XSTR [dest1+pos], xd1
+ XSTR [dest2+pos], xd2
+ XSTR [dest3+pos], xd3
+ XSTR [dest4+pos], xd4
+ XSTR [dest5+pos], xd5
+ XSTR [dest6+pos], xd6
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ cmp pos, len
+ jle .loop64
+
+ lea tmp, [len + 64]
+ cmp pos, tmp
+ je .return_pass
+
+ ;; Tail len
+ mov pos, (1 << 63)
+ lea tmp, [len + 64 - 1]
+ and tmp, 63
+ sarx pos, pos, tmp
+ kmovq k1, pos
+ mov pos, len ;Overlapped offset length-64
+ jmp .loop64 ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_6vect_mad_avx512
+no_gf_6vect_mad_avx512:
+%endif
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512_gfni.asm
new file mode 100644
index 0000000000..b1853b65fd
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_avx512_gfni.asm
@@ -0,0 +1,259 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12 ;must be saved and restored
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ push r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r14
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + 8 + 8*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+ mov [rsp + 10*16 + 0*8], r12
+ mov [rsp + 10*16 + 1*8], r13
+ mov [rsp + 10*16 + 2*8], r14
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define len arg0
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos rax
+%define dest2 tmp3
+%define dest3 tmp2
+%define dest4 mul_array
+%define dest5 vec
+%define dest6 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned (NO_NT_LDST) or non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0 zmm0
+%define xd1 zmm1
+%define xd2 zmm2
+%define xd3 zmm3
+%define xd4 zmm4
+%define xd5 zmm5
+%define xd6 zmm6
+
+%define xgft1 zmm7
+%define xgft2 zmm8
+%define xgft3 zmm9
+%define xgft4 zmm10
+%define xgft5 zmm11
+%define xgft6 zmm12
+
+%define xret1 zmm13
+%define xret2 zmm14
+%define xret3 zmm15
+%define xret4 zmm16
+%define xret5 zmm17
+%define xret6 zmm18
+
+;;
+;; Encodes 64 bytes of a single source into 6x 64 bytes (parity disks)
+;;
+%macro ENCODE_64B_6 0-1
+%define %%KMASK %1
+
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector
+ vmovdqu8 xd1{%%KMASK}, [dest1 + pos] ;Get next dest vector
+ vmovdqu8 xd2{%%KMASK}, [dest2 + pos] ;Get next dest vector
+ vmovdqu8 xd3{%%KMASK}, [dest3 + pos] ;Get next dest vector
+ vmovdqu8 xd4{%%KMASK}, [dest4 + pos] ;Get next dest vector
+ vmovdqu8 xd5{%%KMASK}, [dest5 + pos] ;Get next dest vector
+ vmovdqu8 xd6{%%KMASK}, [dest6 + pos] ;Get next dest vector
+%else
+ XLDR x0, [src + pos] ;Get next source vector
+ XLDR xd1, [dest1 + pos] ;Get next dest vector
+ XLDR xd2, [dest2 + pos] ;Get next dest vector
+ XLDR xd3, [dest3 + pos] ;Get next dest vector
+ XLDR xd4, [dest4 + pos] ;Get next dest vector
+ XLDR xd5, [dest5 + pos] ;Get next dest vector
+ XLDR xd6, [dest6 + pos] ;Get next dest vector
+%endif
+
+ GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3, \
+ xgft4, xret4, xd4, xgft5, xret5, xd5, xgft6, xret6, xd6
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xd1
+ vmovdqu8 [dest2 + pos]{%%KMASK}, xd2
+ vmovdqu8 [dest3 + pos]{%%KMASK}, xd3
+ vmovdqu8 [dest4 + pos]{%%KMASK}, xd4
+ vmovdqu8 [dest5 + pos]{%%KMASK}, xd5
+ vmovdqu8 [dest6 + pos]{%%KMASK}, xd6
+%else
+ XSTR [dest1 + pos], xd1
+ XSTR [dest2 + pos], xd2
+ XSTR [dest3 + pos], xd3
+ XSTR [dest4 + pos], xd4
+ XSTR [dest5 + pos], xd5
+ XSTR [dest6 + pos], xd6
+%endif
+%endmacro
+
+align 16
+global gf_6vect_mad_avx512_gfni, function
+func(gf_6vect_mad_avx512_gfni)
+ FUNC_SAVE
+
+ xor pos, pos
+ shl vec_i, 3 ;Multiply by 8
+ shl vec, 3 ;Multiply by 8
+ lea tmp, [mul_array + vec_i]
+ vbroadcastf32x2 xgft1, [tmp]
+ vbroadcastf32x2 xgft2, [tmp + vec]
+ vbroadcastf32x2 xgft3, [tmp + vec*2]
+ vbroadcastf32x2 xgft5, [tmp + vec*4]
+ add tmp, vec
+ vbroadcastf32x2 xgft4, [tmp + vec*2]
+ vbroadcastf32x2 xgft6, [tmp + vec*4]
+ mov dest2, [dest1 + 8]
+ mov dest3, [dest1 + 2*8]
+ mov dest4, [dest1 + 3*8] ; reuse mul_array
+ mov dest5, [dest1 + 4*8] ; reuse vec
+ mov dest6, [dest1 + 5*8] ; reuse vec_i
+ mov dest1, [dest1]
+
+ cmp len, 64
+ jl .len_lt_64
+.loop64:
+ ENCODE_64B_6
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64:
+ cmp len, 0
+ jle .exit
+
+ xor tmp, tmp
+ bts tmp, len
+ dec tmp
+ kmovq k1, tmp
+
+ ENCODE_64B_6 k1
+
+.exit:
+ vzeroupper
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm
index 8e0fc0e0a4..a816f8bbf4 100644
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.asm
@@ -113,7 +113,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%macro FUNC_SAVE 0
push r12
push r13
@@ -185,13 +185,8 @@ section .text
align 16
-global gf_6vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_6vect_mad_sse, function
func(gf_6vect_mad_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_6vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_6vect_mad_sse)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -406,6 +401,3 @@ align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
constip16:
dq 0xf8f9fafbfcfdfeff, 0xf0f1f2f3f4f5f6f7
-
-;;; func core, ver, snum
-slversion gf_6vect_mad_sse, 00, 01, 020f
diff --git a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.patch
deleted file mode 100644
index 619072af53..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_6vect_mad_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-190,194d189
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_6vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_6vect_mad_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_inverse_test.c b/contrib/libs/isa-l/erasure_code/gf_inverse_test.c
index fe2006eeb7..54e70bb46e 100644
--- a/contrib/libs/isa-l/erasure_code/gf_inverse_test.c
+++ b/contrib/libs/isa-l/erasure_code/gf_inverse_test.c
@@ -111,7 +111,9 @@ int inv_test(u8 * in, u8 * inv, u8 * sav, int n)
print_matrix(in, n);
return -1;
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
return 0;
}
@@ -119,7 +121,8 @@ int inv_test(u8 * in, u8 * inv, u8 * sav, int n)
int main(int argc, char *argv[])
{
int i, k, t;
- u8 *test_mat, *save_mat, *invr_mat;
+ u8 *test_mat = NULL, *save_mat = NULL, *invr_mat = NULL;
+ int ret = -1;
u8 test1[] = { 1, 1, 6,
1, 1, 1,
@@ -149,25 +152,25 @@ int main(int argc, char *argv[])
invr_mat = malloc(KMAX * KMAX);
if (NULL == test_mat || NULL == save_mat || NULL == invr_mat)
- return -1;
+ goto exit;
// Test with lots of leading 1's
k = 3;
memcpy(test_mat, test1, k * k);
if (inv_test(test_mat, invr_mat, save_mat, k))
- return -1;
+ goto exit;
// Test with leading zeros
k = 3;
memcpy(test_mat, test2, k * k);
if (inv_test(test_mat, invr_mat, save_mat, k))
- return -1;
+ goto exit;
// Test 3
k = 3;
memcpy(test_mat, test3, k * k);
if (inv_test(test_mat, invr_mat, save_mat, k))
- return -1;
+ goto exit;
// Test 4 - try a singular matrix
k = 4;
@@ -175,7 +178,7 @@ int main(int argc, char *argv[])
if (!gf_invert_matrix(test_mat, invr_mat, k)) {
printf("Fail: didn't catch singular matrix\n");
print_matrix(test4, 4);
- return -1;
+ goto exit;
}
// Do random test of size KMAX
k = KMAX;
@@ -185,7 +188,7 @@ int main(int argc, char *argv[])
if (gf_invert_matrix(test_mat, invr_mat, k)) {
printf("rand picked a singular matrix, try again\n");
- return -1;
+ goto exit;
}
matrix_mult(invr_mat, save_mat, test_mat, k);
@@ -195,7 +198,7 @@ int main(int argc, char *argv[])
print_matrix(save_mat, k);
print_matrix(invr_mat, k);
print_matrix(test_mat, k);
- return -1;
+ goto exit;
}
// Do Randoms. Random size and coefficients
for (t = 0; t < RANDOMS; t++) {
@@ -214,12 +217,22 @@ int main(int argc, char *argv[])
print_matrix(save_mat, k);
print_matrix(invr_mat, k);
print_matrix(test_mat, k);
- return -1;
+ goto exit;
}
+#ifdef TEST_VERBOSE
if (0 == (t % 8))
putchar('.');
+#endif
}
printf(" Pass\n");
- return 0;
+
+ ret = 0;
+
+ exit:
+ free(test_mat);
+ free(save_mat);
+ free(invr_mat);
+
+ return ret;
}
diff --git a/contrib/libs/isa-l/erasure_code/gf_inverse_test.patch b/contrib/libs/isa-l/erasure_code/gf_inverse_test.patch
deleted file mode 100644
index b91cf7f348..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_inverse_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-59c59
-< d ^= gf_mul_erasure(a[n * i + k], b[n * k + j]);
----
-> d ^= gf_mul(a[n * i + k], b[n * k + j]);
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c
index d2959c3c51..bd202fdcf1 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.c
@@ -33,22 +33,22 @@
#include "test.h"
#include "erasure_code.h"
-//#define CACHED_TEST
-#ifdef CACHED_TEST
+#ifndef GT_L3_CACHE
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+#endif
+
+#if !defined(COLD_TEST) && !defined(TEST_CUSTOM)
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_TYPE_STR "_warm"
-#else
-# ifndef TEST_CUSTOM
+#elif defined (COLD_TEST)
// Uncached test. Pull from large mem base.
-# define TEST_SOURCES 10
-# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
-# define TEST_LEN GT_L3_CACHE / TEST_SOURCES
-# define TEST_TYPE_STR "_cold"
-# else
-# define TEST_TYPE_STR "_cus"
-# endif
+# define TEST_SOURCES 10
+# define TEST_LEN (GT_L3_CACHE / TEST_SOURCES)
+# define TEST_TYPE_STR "_cold"
+#elif defined (TEST_CUSTOM)
+# define TEST_TYPE_STR "_cus"
#endif
typedef unsigned char u8;
@@ -111,10 +111,20 @@ void gf_vect_dot_prod_mult(int len, int vlen, u8 * v, u8 ** src, u8 * dest)
int main(void)
{
int i, j;
- u8 vec[TEST_SOURCES], dest1[TEST_LEN], dest2[TEST_LEN];
+ u8 vec[TEST_SOURCES], *dest1, *dest2;
u8 *matrix[TEST_SOURCES];
struct perf start;
+ dest1 = (u8 *) malloc(TEST_LEN);
+ dest2 = (u8 *) malloc(TEST_LEN);
+
+ if (NULL == dest1 || NULL == dest2) {
+ printf("buffer alloc error\n");
+ return -1;
+ }
+ memset(dest1, 0xfe, TEST_LEN);
+ memset(dest2, 0xfe, TEST_LEN);
+
mk_gf_field();
mk_gf_mul_table(gf_mul_table);
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.patch
deleted file mode 100644
index 5ea8eab7d0..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_1tbl.patch
+++ /dev/null
@@ -1,8 +0,0 @@
-81c81
-< table[i * 256 + j] = gf_mul_erasure(i, j);
----
-> table[i * 256 + j] = gf_mul(i, j);
-91c91
-< s ^= gf_mul_erasure(src[j][i], v[j]);
----
-> s ^= gf_mul(src[j][i], v[j]);
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm
index dc1eebb972..37915c873b 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.asm
@@ -48,7 +48,7 @@
%endmacro
%define SSTR SLDR
%define PS 8
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -106,7 +106,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define trans ecx ;trans is for the variables in stack
@@ -194,13 +194,8 @@ section .text
%define xp xmm2
align 16
-global gf_vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_dot_prod_avx, function
func(gf_vect_dot_prod_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_dot_prod_avx)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 16
@@ -271,6 +266,3 @@ align 16
mask0f:
dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_vect_dot_prod_avx, 02, 05, 0061
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.patch
deleted file mode 100644
index 30bdc75785..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-199,203d198
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_dot_prod_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm
index 986fd28a4e..fb9b022975 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm
@@ -51,7 +51,7 @@
%endmacro
%define SSTR SLDR
%define PS 8
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -111,7 +111,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define trans ecx ;trans is for the variables in stack
@@ -202,13 +202,8 @@ section .text
%define xp ymm2
align 16
-global gf_vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_dot_prod_avx2, function
func(gf_vect_dot_prod_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_dot_prod_avx2)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 32
@@ -278,8 +273,3 @@ func(_gf_vect_dot_prod_avx2)
ret
endproc_frame
-
-section .data
-
-;;; func core, ver, snum
-slversion gf_vect_dot_prod_avx2, 04, 05, 0190
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.patch
deleted file mode 100644
index c2890dbc39..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-207,211d206
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_dot_prod_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_dot_prod_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2_gfni.asm
new file mode 100644
index 0000000000..c084894c7b
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx2_gfni.asm
@@ -0,0 +1,318 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_dot_prod_avx2_gfni(len, vec, *g_tbls, **buffs, *dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+%include "memcpy.asm"
+
+%if AS_FEATURE_LEVEL >= 10
+
+;; ELF64 build. Argument registers follow the System V AMD64 order
+;; (rdi, rsi, rdx, rcx, r8, r9). The routine is documented above as taking
+;; five arguments, so arg5 (r9) carries no input and is reused as `ptr`.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12 ; must be saved and restored
+
+ ;; frame = a single 8-byte slot that holds the caller's r12
+ %define stack_size 1*8
+ ;; entry label followed by `endbranch`, which is defined outside this file
+ ;; (presumably the CET/IBT landing pad -- confirm in reg_sizes.asm)
+ %define func(x) x: endbranch
+ ;; prologue: reserve the slot, then spill r12 (used as tmp3)
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ mov [rsp + 0*8], r12
+ %endmacro
+ ;; epilogue: reload r12 and release the slot
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+;; WIN64 build. The first four arguments arrive in rcx, rdx, r8, r9; the
+;; fifth (dest) lives on the caller's stack and is fetched into r12 by
+;; FUNC_SAVE. r12, r13, r15 and xmm6-xmm9 are callee-saved and are spilled.
+;;
+;; Frame layout after FUNC_SAVE (offsets from rsp):
+;;   0 .. 63   xmm6, xmm7, xmm8, xmm9   (16 bytes each)
+;;  64 .. 87   r12, r13, r15            (8 bytes each)
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13 ; must be saved and restored
+ ;; rsp is 8 mod 16 on entry, so an odd multiple of 8 leaves rsp 16-byte
+ ;; aligned, which the vmovdqa spills below require
+ %define stack_size 4*16 + 3*8 ; must be an odd multiple of 8
+ ;; arg(x): x-th argument as addressed after FUNC_SAVE; skips this frame
+ ;; (stack_size) and the 8-byte return address
+ %define arg(x) [rsp + stack_size + 8 + 8*x]
+
+ %define func(x) proc_frame x
+ ;; prologue: allocate the frame, spill the callee-saved registers, close
+ ;; the prolog, then load the fifth argument
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ mov [rsp + 4*16 + 0*8], r12
+ mov [rsp + 4*16 + 1*8], r13
+ mov [rsp + 4*16 + 2*8], r15
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ ;; epilogue: reload everything FUNC_SAVE spilled and release the frame
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ mov r12, [rsp + 4*16 + 0*8]
+ mov r13, [rsp + 4*16 + 1*8]
+ mov r15, [rsp + 4*16 + 2*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+
+;; Symbolic names for the arguments and scratch registers
+%define len arg0
+%define vec arg1
+%define mul_array arg2
+%define src arg3
+%define dest1 arg4
+;; ptr: current source buffer (or store address), taken from the src array
+%define ptr arg5
+;; vec_i: byte offset into the src pointer array (advances by 8)
+%define vec_i tmp2
+;; pos: byte offset into every buffer
+%define pos rax
+
+;; XLDR / XSTR: instructions used for the full 32-byte loads and stores
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Buffers are declared aligned: use non-temporal load/store,
+;;; or plain aligned load/store when NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+;; Vector register map.
+;; The l/h/x suffixes name the first, second and third 32-byte lane of a
+;; 96-byte step. ymm6-ymm9 overlap xmm6-xmm9, which the win64 FUNC_SAVE
+;; spills and FUNC_RESTORE reloads.
+
+;; source data
+%define x0l     ymm0
+%define x0h     ymm1
+%define x0x     ymm2
+
+;; parity accumulators (single output)
+%define xp1l    ymm3
+%define xp1h    ymm4
+%define xp1x    ymm5
+
+;; broadcast GF table entries
+%define xgft1   ymm6
+%define xgft2   ymm7
+%define xgft3   ymm8
+
+;; scratch for GF_MUL_XOR
+%define xtmp1   ymm9
+
+;; single-lane aliases used by the 32-byte and tail paths.
+;; The former xp2/xp3 aliases pointed at xp2l/xp3l, which are never
+;; defined, and nothing referenced them; they have been dropped.
+%define x0      x0l
+%define xp1     xp1l
+
+default rel
+[bits 64]
+
+section .text
+
+;;
+;; ENCODE_96B: produce 96 parity bytes at dest1 + pos from the 96 bytes at
+;; offset pos of every source buffer (single parity output).
+;; Overwrites tmp, vec_i, ptr, x0l/x0h/x0x, xgft1, xtmp1, xp1l/xp1h/xp1x.
+;;
+%macro ENCODE_96B 0
+        xor     vec_i, vec_i            ; byte offset into the src pointer array
+        mov     tmp, mul_array          ; tmp -> table entry of the first source
+        vpxor   xp1x, xp1x, xp1x        ; clear the three 32-byte accumulators
+        vpxor   xp1h, xp1h, xp1h
+        vpxor   xp1l, xp1l, xp1l
+
+%%each_source:
+        mov     ptr, [src + vec_i]      ; ptr = current source buffer
+        vbroadcastsd xgft1, [tmp]       ; replicate this source's 8-byte table entry
+        XLDR    x0l, [ptr + pos]
+        XLDR    x0h, [ptr + pos + 32]
+        XLDR    x0x, [ptr + pos + 64]
+
+        GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l
+        GF_MUL_XOR VEX, x0h, xgft1, xtmp1, xp1h
+        GF_MUL_XOR VEX, x0x, xgft1, xtmp1, xp1x
+
+        add     tmp, 8                  ; next table entry
+        add     vec_i, 8                ; next source pointer
+        cmp     vec_i, vec              ; vec was pre-scaled to bytes by the caller
+        jl      %%each_source
+
+        ;; all sources folded in: write the parity out
+        XSTR    [dest1 + pos], xp1l
+        XSTR    [dest1 + pos + 32], xp1h
+        XSTR    [dest1 + pos + 64], xp1x
+%endmacro
+
+;;
+;; ENCODE_64B: produce 64 parity bytes at dest1 + pos from the 64 bytes at
+;; offset pos of every source buffer (single parity output).
+;; Overwrites tmp, vec_i, ptr, x0l/x0h, xgft1, xtmp1, xp1l/xp1h.
+;;
+%macro ENCODE_64B 0
+        xor     vec_i, vec_i            ; byte offset into the src pointer array
+        mov     tmp, mul_array          ; tmp -> table entry of the first source
+        vpxor   xp1h, xp1h, xp1h        ; clear both 32-byte accumulators
+        vpxor   xp1l, xp1l, xp1l
+
+%%each_source:
+        mov     ptr, [src + vec_i]      ; ptr = current source buffer
+        vbroadcastsd xgft1, [tmp]       ; replicate this source's 8-byte table entry
+        XLDR    x0l, [ptr + pos]
+        XLDR    x0h, [ptr + pos + 32]
+
+        GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l
+        ;; last use of xgft1 in this pass, so it doubles as the scratch
+        GF_MUL_XOR VEX, x0h, xgft1, xgft1, xp1h
+
+        add     tmp, 8                  ; next table entry
+        add     vec_i, 8                ; next source pointer
+        cmp     vec_i, vec              ; vec was pre-scaled to bytes by the caller
+        jl      %%each_source
+
+        ;; all sources folded in: write the parity out
+        XSTR    [dest1 + pos], xp1l
+        XSTR    [dest1 + pos + 32], xp1h
+%endmacro
+
+;;
+;; ENCODE_32B: produce 32 parity bytes at dest1 + pos from the 32 bytes at
+;; offset pos of every source buffer (single parity output).
+;; Overwrites tmp, vec_i, ptr, x0, xgft1, xp1.
+;;
+%macro ENCODE_32B 0
+        xor     vec_i, vec_i            ; byte offset into the src pointer array
+        mov     tmp, mul_array          ; tmp -> table entry of the first source
+        vpxor   xp1, xp1, xp1           ; clear the accumulator
+
+%%each_source:
+        mov     ptr, [src + vec_i]      ; ptr = current source buffer
+        vbroadcastsd xgft1, [tmp]       ; replicate this source's 8-byte table entry
+        XLDR    x0, [ptr + pos]
+
+        ;; xgft1 is not needed afterwards, so it doubles as the scratch
+        GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1
+
+        add     tmp, 8                  ; next table entry
+        add     vec_i, 8                ; next source pointer
+        cmp     vec_i, vec              ; vec was pre-scaled to bytes by the caller
+        jl      %%each_source
+
+        XSTR    [dest1 + pos], xp1
+%endmacro
+
+;;
+;; Encodes less than 32 bytes of all "k" sources into a single parity disk
+;;
+;; %1 (LEN): register holding the number of bytes to process.
+;; Unlike the full-width macros this one advances mul_array itself and does
+;; not restore it, and it reuses vec_i as scratch for the final store, so
+;; no other ENCODE_* macro may run after it in the same call.
+;; simd_load_avx2 / simd_store_avx2 are defined outside this file
+;; (presumably memcpy.asm; their last two operands look like scratch
+;; registers -- confirm there).
+;;
+%macro ENCODE_LT_32B 1
+%define %%LEN %1
+
+ vpxor xp1, xp1, xp1
+ xor vec_i, vec_i
+
+%%next_vect:
+ ; get next source vector
+ mov ptr, [src + vec_i]
+ simd_load_avx2 x0, ptr + pos, %%LEN, tmp, tmp3
+ add vec_i, 8
+
+ ; tmp is handed to the load helper above, so the table pointer is walked
+ ; through mul_array directly
+ vbroadcastsd xgft1, [mul_array]
+ add mul_array, 8
+
+ ; xgft1 doubles as the scratch register
+ GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1
+
+ cmp vec_i, vec
+ jl %%next_vect
+
+ ;; Store updated encoded data
+ lea ptr, [dest1 + pos]
+ simd_store_avx2 ptr, xp1, %%LEN, tmp, vec_i
+%endmacro
+
+;;
+;; gf_vect_dot_prod_avx2_gfni(len, vec, *g_tbls, **buffs, *dest)
+;;
+;; Processes len bytes in 96-byte steps, then at most one 64-byte step,
+;; one 32-byte step and one partial (< 32 byte) step.
+;; pos (rax) is the running byte offset; len counts the bytes still to do.
+;;
+align 16
+global gf_vect_dot_prod_avx2_gfni, function
+func(gf_vect_dot_prod_avx2_gfni)
+        FUNC_SAVE
+
+        shl     vec, 3                  ; vec *= 8 so it compares against vec_i's byte steps
+        xor     pos, pos
+
+        cmp     len, 96
+        jl      .tail_lt_96
+
+.step_96:
+        ENCODE_96B
+
+        sub     len, 96
+        add     pos, 96
+        cmp     len, 96                 ; another full 96-byte step left?
+        jge     .step_96
+
+.tail_lt_96:
+        cmp     len, 64
+        jl      .tail_lt_64
+
+        ENCODE_64B
+
+        sub     len, 64
+        add     pos, 64
+
+.tail_lt_64:
+        cmp     len, 32
+        jl      .tail_lt_32
+
+        ENCODE_32B
+
+        sub     len, 32
+        add     pos, 32
+
+.tail_lt_32:
+        cmp     len, 0
+        jle     .finished
+
+        ENCODE_LT_32B len               ; whatever is left (fewer than 32 bytes)
+
+.finished:
+        vzeroupper
+
+        FUNC_RESTORE
+        ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm
index 405c1e48e2..b5fbf42498 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.asm
@@ -49,7 +49,7 @@
%define PS 8
%define LOG_PS 3
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -73,15 +73,15 @@
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
- save_reg r12, 9*16 + 0*8
- save_reg r15, 9*16 + 3*8
+ save_reg r12, 0*8
+ save_reg r15, 1*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
- mov r12, [rsp + 9*16 + 0*8]
- mov r15, [rsp + 9*16 + 3*8]
+ mov r12, [rsp + 0*8]
+ mov r15, [rsp + 1*8]
add rsp, stack_size
%endmacro
%endif
@@ -104,8 +104,8 @@
%else
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
- %define XLDR vmovdqa
- %define XSTR vmovdqa
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
@@ -128,13 +128,8 @@ default rel
section .text
align 16
-global gf_vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_dot_prod_avx512, function
func(gf_vect_dot_prod_avx512)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_dot_prod_avx512)
-%endif
-
FUNC_SAVE
xor pos, pos
mov tmp, 0x0f
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.patch
deleted file mode 100644
index 61be77efd3..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-133,137d132
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_dot_prod_avx512:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_dot_prod_avx512)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512_gfni.asm
new file mode 100644
index 0000000000..b8fc778a88
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_avx512_gfni.asm
@@ -0,0 +1,190 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, *dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+;; ELF64 build. Argument registers follow the System V AMD64 order.
+;; Only caller-saved registers are used, so no frame is needed and
+;; FUNC_SAVE / FUNC_RESTORE expand to nothing.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp2 r10
+
+ ;; entry label followed by `endbranch`, which is defined outside this file
+ ;; (presumably the CET/IBT landing pad -- confirm in reg_sizes.asm)
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;; WIN64 build. The first four arguments arrive in rcx, rdx, r8, r9; the
+;; fifth (dest) lives on the caller's stack and is fetched into r12 by
+;; FUNC_SAVE. r12 and r13 are callee-saved and are spilled. Only zmm0-zmm2
+;; are used by this file, so no vector registers need saving.
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r13 ; must be saved and restored
+ %define tmp r11
+ %define tmp2 r10
+ ;; two slots hold r12 and r13; the third slot is unused padding that
+ ;; keeps the size an odd multiple of 8
+ %define stack_size 0*16 + 3*8 ; must be an odd multiple of 8
+ ;; arg(x): x-th argument as addressed after FUNC_SAVE; skips this frame
+ ;; (stack_size) and the 8-byte return address
+ %define arg(x) [rsp + stack_size + 8 + 8*x]
+
+ %define func(x) proc_frame x
+ ;; prologue: allocate the frame, spill r12/r13, close the prolog, then
+ ;; load the fifth argument
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ mov [rsp + 0*8], r12
+ mov [rsp + 1*8], r13
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ ;; epilogue: reload r12/r13 and release the frame
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8]
+ mov r13, [rsp + 1*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+
+;; Symbolic names for the arguments and scratch registers
+%define len arg0
+%define vec arg1
+%define mul_array arg2
+%define src arg3
+%define dest1 arg4
+;; ptr: current source buffer, taken from the src pointer array
+%define ptr arg5
+;; vec_i: byte offset into the src pointer array (advances by 8)
+%define vec_i tmp2
+;; pos: byte offset into every buffer
+%define pos rax
+
+
+;; XLDR / XSTR: instructions used for the full 64-byte loads and stores
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Buffers are declared aligned: use non-temporal load/store,
+;;; or plain aligned load/store when NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+;; xgft1: broadcast GF table entry of the current source
+%define xgft1 zmm2
+
+;; x0: source data, xp1: parity accumulator
+%define x0 zmm0
+%define xp1 zmm1
+
+default rel
+[bits 64]
+section .text
+
+;;
+;; Encodes 64 bytes of all "k" sources into 64 bytes (single parity disk)
+;;
+;; Optional argument %1: an opmask register. When it is given, the source
+;; load and the parity store are byte-masked with it (the caller uses this
+;; for a tail shorter than 64 bytes); without it the full-width XLDR/XSTR
+;; instructions are used.
+;; Overwrites tmp, vec_i, ptr, x0, xgft1 and xp1.
+;;
+%macro ENCODE_64B 0-1
+%define %%KMASK %1
+
+ vpxorq xp1, xp1, xp1
+ mov tmp, mul_array
+ xor vec_i, vec_i
+
+%%next_vect:
+ mov ptr, [src + vec_i]
+%if %0 == 1
+ ; merge-masked load: bytes outside the mask keep whatever x0 held before;
+ ; they never reach memory because the store below uses the same mask
+ vmovdqu8 x0{%%KMASK}, [ptr + pos] ;Get next source vector (less than 64 bytes)
+%else
+ XLDR x0, [ptr + pos] ;Get next source vector (64 bytes)
+%endif
+ add vec_i, 8
+
+ ; replicate this source's 8-byte table entry across the register
+ vbroadcastf32x2 xgft1, [tmp]
+ add tmp, 8
+
+ ; xgft1 is not needed afterwards, so it doubles as the scratch register
+ GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1
+
+ ; vec was pre-scaled to bytes by the caller
+ cmp vec_i, vec
+ jl %%next_vect
+
+%if %0 == 1
+ vmovdqu8 [dest1 + pos]{%%KMASK}, xp1
+%else
+ XSTR [dest1 + pos], xp1
+%endif
+%endmacro
+
+;;
+;; gf_vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, *dest)
+;;
+;; Processes len bytes in 64-byte steps; a remainder of 1..63 bytes is
+;; handled by one extra masked step.
+;; pos (rax) is the running byte offset; len counts the bytes still to do.
+;;
+align 16
+global gf_vect_dot_prod_avx512_gfni, function
+func(gf_vect_dot_prod_avx512_gfni)
+        FUNC_SAVE
+        shl     vec, 3                  ; vec *= 8 so it compares against vec_i's byte steps
+        xor     pos, pos
+
+        cmp     len, 64
+        jl      .tail_lt_64
+
+.step_64:
+
+        ENCODE_64B
+
+        sub     len, 64
+        add     pos, 64
+        cmp     len, 64                 ; another full 64-byte step left?
+        jge     .step_64
+
+.tail_lt_64:
+        cmp     len, 0
+        jle     .finished
+
+        ;; k1 = (1 << len) - 1: one mask bit per remaining byte
+        xor     tmp, tmp
+        bts     tmp, len
+        dec     tmp
+        kmovq   k1, tmp
+
+        ENCODE_64B k1
+
+.finished:
+        vzeroupper
+
+        FUNC_RESTORE
+        ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c
index b2601226e9..0cfd444413 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.c
@@ -30,10 +30,11 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
+#include <assert.h>
#include "erasure_code.h"
-// #include "types.h"
+#include "test.h"
-#define TEST_LEN 2048
+#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
@@ -134,8 +135,7 @@ int main(int argc, char *argv[])
// Pick a first test
m = 9;
k = 5;
- if (m > MMAX || k > KMAX)
- return -1;
+ assert(!(m > MMAX || k > KMAX));
gf_gen_cauchy1_matrix(a, m, k);
@@ -282,7 +282,9 @@ int main(int argc, char *argv[])
return -1;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
printf("done all: Pass\n");
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.patch
deleted file mode 100644
index 21bbfaa667..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_base_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c
index bd2b555b0a..3b051c67a4 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_perf.c
@@ -40,22 +40,22 @@
#define str(s) #s
#define xstr(s) str(s)
-//#define CACHED_TEST
-#ifdef CACHED_TEST
+#ifndef GT_L3_CACHE
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+#endif
+
+#if !defined(COLD_TEST) && !defined(TEST_CUSTOM)
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_TYPE_STR "_warm"
-#else
-# ifndef TEST_CUSTOM
+#elif defined (COLD_TEST)
// Uncached test. Pull from large mem base.
-# define TEST_SOURCES 10
-# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
-# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
-# define TEST_TYPE_STR "_cold"
-# else
-# define TEST_TYPE_STR "_cus"
-# endif
+# define TEST_SOURCES 10
+# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
+# define TEST_TYPE_STR "_cold"
+#elif defined (TEST_CUSTOM)
+# define TEST_TYPE_STR "_cus"
#endif
typedef unsigned char u8;
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm
index 67f4a1a329..ef245b4961 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.asm
@@ -48,7 +48,7 @@
%endmacro
%define SSTR SLDR
%define PS 8
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -106,7 +106,7 @@
%define PS 4
%define LOG_PS 2
- %define func(x) x:
+ %define func(x) x: endbranch
%define arg(x) [ebp + PS*2 + PS*x]
%define trans ecx ;trans is for the variables in stack
@@ -194,13 +194,8 @@ section .text
%define xp xmm2
align 16
-global gf_vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_dot_prod_sse, function
func(gf_vect_dot_prod_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_dot_prod_sse)
-%endif
-
FUNC_SAVE
SLDR len, len_m
sub len, 16
@@ -271,6 +266,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_vect_dot_prod_sse, 00, 05, 0060
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.patch
deleted file mode 100644
index ab47fc7a53..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-199,203d198
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_dot_prod_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_dot_prod_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c
index dbfc2da045..8300fbd70d 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.c
@@ -31,7 +31,7 @@
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
-// #include "types.h"
+#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_vect_dot_prod
@@ -43,7 +43,7 @@
#define str(s) #s
#define xstr(s) str(s)
-#define TEST_LEN 2048
+#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
@@ -171,8 +171,11 @@ int main(int argc, char *argv[])
printf("dprod:");
dump(dest, 25);
return -1;
- } else
+ }
+#ifdef TEST_VERBOSE
+ else
putchar('.');
+#endif
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
@@ -199,7 +202,9 @@ int main(int argc, char *argv[])
return -1;
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Rand data test with varied parameters
@@ -228,7 +233,9 @@ int main(int argc, char *argv[])
return -1;
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
}
@@ -396,7 +403,9 @@ int main(int argc, char *argv[])
return -1;
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Run tests at end of buffer for Electric Fence
@@ -428,7 +437,9 @@ int main(int argc, char *argv[])
return -1;
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Test rand ptr alignment if available
@@ -485,7 +496,9 @@ int main(int argc, char *argv[])
return -1;
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Test all size alignment
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.patch
deleted file mode 100644
index 21bbfaa667..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_dot_prod_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_gfni.inc b/contrib/libs/isa-l/erasure_code/gf_vect_gfni.inc
new file mode 100644
index 0000000000..83d362bdae
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_gfni.inc
@@ -0,0 +1,72 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Multiply 1 source register to up to 6 different GF table registers
+; and XOR the results to partial registers
+;
+; Arguments:
+;   %1       ENCODING: VEX selects vpxor for the XOR step, any other
+;            value selects vpxorq
+;   %2       SRC: source data register
+;   %3..%20  one to six triples (GFTABLEn, TMPn, PARTIALn)
+; Each TMPn is overwritten with the product; each PARTIALn is updated in
+; place (PARTIALn ^= TMPn). All products are computed before any XOR.
+;
+%macro GF_MUL_XOR 5-20
+%define %%ENCODING %1
+%define %%SRC %2
+%define %%GFTABLE1 %3
+%define %%TMP1 %4
+%define %%PARTIAL1 %5
+%define %%GFTABLE2 %6
+%define %%TMP2 %7
+%define %%PARTIAL2 %8
+%define %%GFTABLE3 %9
+%define %%TMP3 %10
+%define %%PARTIAL3 %11
+%define %%GFTABLE4 %12
+%define %%TMP4 %13
+%define %%PARTIAL4 %14
+%define %%GFTABLE5 %15
+%define %%TMP5 %16
+%define %%PARTIAL5 %17
+%define %%GFTABLE6 %18
+%define %%TMP6 %19
+%define %%PARTIAL6 %20
+
+; number of (table, tmp, partial) triples actually passed:
+; 5 arguments -> 1, 8 -> 2, ... 20 -> 6
+%define %%N_BLOCKS ((%0 - 2) / 3)
+
+; pass 1: TMPn = SRC transformed by GFTABLEn (affine transform, constant 0)
+%assign %%I 1
+%rep %%N_BLOCKS
+ vgf2p8affineqb %%TMP %+ %%I, %%SRC, %%GFTABLE %+ %%I, 0x00
+%assign %%I (%%I + 1)
+%endrep
+; pass 2: PARTIALn ^= TMPn (two-operand form: destination is also a source)
+%assign %%I 1
+%rep %%N_BLOCKS
+%ifidn %%ENCODING, VEX
+ vpxor %%PARTIAL %+ %%I, %%TMP %+ %%I
+%else
+ vpxorq %%PARTIAL %+ %%I, %%TMP %+ %%I
+%endif
+%assign %%I (%%I + 1)
+%endrep
+%endmacro
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm
index 1a252c474f..20a44d7aa3 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.asm
@@ -82,7 +82,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -131,13 +131,8 @@ section .text
%define xtmpd xmm5
align 16
-global gf_vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_mad_avx, function
func(gf_vect_mad_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_mad_avx)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -196,6 +191,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_vect_mad_avx, 02, 01, 0201
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.patch
deleted file mode 100644
index e3932a80b5..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-136,140d135
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_mad_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_mad_avx)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm
index 9b24c6e62a..c833f5e98c 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.asm
@@ -88,7 +88,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -139,13 +139,8 @@ section .text
%define xtmpd ymm5
align 16
-global gf_vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_mad_avx2, function
func(gf_vect_mad_avx2)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_mad_avx2)
-%endif
-
FUNC_SAVE
sub len, 32
jl .return_fail
@@ -201,8 +196,3 @@ func(_gf_vect_mad_avx2)
ret
endproc_frame
-
-section .data
-
-;;; func core, ver, snum
-slversion gf_vect_mad_avx2, 04, 01, 0202
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.patch
deleted file mode 100644
index 9d37d75b8d..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-144,148d143
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_mad_avx2:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_mad_avx2)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2_gfni.asm
new file mode 100644
index 0000000000..29af12a0fc
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx2_gfni.asm
@@ -0,0 +1,255 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+%include "memcpy.asm"
+
+%if AS_FEATURE_LEVEL >= 10
+
+;; ELF64 build. All six arguments arrive in registers in the System V
+;; AMD64 order. Only caller-saved registers are used, so no frame is
+;; needed and FUNC_SAVE / FUNC_RESTORE expand to nothing.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ ;; entry label followed by `endbranch`, which is defined outside this file
+ ;; (presumably the CET/IBT landing pad -- confirm in reg_sizes.asm)
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;; WIN64 build. The first four arguments arrive in rcx, rdx, r8, r9; the
+;; fifth (src) and sixth (dest) live on the caller's stack and are fetched
+;; into r12 / r13 by FUNC_SAVE. r12, r13 and xmm6-xmm8 are callee-saved
+;; and are spilled.
+;;
+;; Frame layout after FUNC_SAVE (offsets from rsp):
+;;   0 .. 47   xmm6, xmm7, xmm8   (16 bytes each)
+;;  48 .. 63   r12, r13           (8 bytes each)
+;;  64 .. 71   unused padding (keeps the size an odd multiple of 8)
+%ifidn __OUTPUT_FORMAT__, win64
+    %define arg0        rcx
+    %define arg1        rdx
+    %define arg2        r8
+    %define arg3        r9
+    %define arg4        r12     ; must be saved, loaded and restored
+    %define arg5        r13     ; must be saved, loaded and restored
+    %define tmp         r11
+    %define tmp2        r10
+    ;; rsp is 8 mod 16 on entry, so an odd multiple of 8 leaves rsp 16-byte
+    ;; aligned, which the vmovdqa spills below require
+    %define stack_size  16*3 + 3*8      ; must be an odd multiple of 8
+    ;; arg(x): x-th argument as addressed after FUNC_SAVE; skips this frame
+    ;; (stack_size) and the 8-byte return address
+    %define arg(x)      [rsp + stack_size + 8 + 8*x]
+    %define func(x)     proc_frame x
+
+    ;; prologue: allocate the frame through alloc_stack (as the sibling
+    ;; dot-product kernels do) so the allocation is part of the frame's
+    ;; prolog description, spill the callee-saved registers, close the
+    ;; prolog, then load the two stack arguments
+    %macro FUNC_SAVE 0
+        alloc_stack stack_size
+        vmovdqa [rsp + 0*16], xmm6
+        vmovdqa [rsp + 1*16], xmm7
+        vmovdqa [rsp + 2*16], xmm8
+        mov     [rsp + 3*16 + 0*8], r12
+        mov     [rsp + 3*16 + 1*8], r13
+        end_prolog
+        mov     arg4, arg(4)
+        mov     arg5, arg(5)
+    %endmacro
+
+    ;; epilogue: reload everything FUNC_SAVE spilled and release the frame
+    %macro FUNC_RESTORE 0
+        vmovdqa xmm6, [rsp + 0*16]
+        vmovdqa xmm7, [rsp + 1*16]
+        vmovdqa xmm8, [rsp + 2*16]
+        mov     r12, [rsp + 3*16 + 0*8]
+        mov     r13, [rsp + 3*16 + 1*8]
+        add     rsp, stack_size
+    %endmacro
+%endif
+
+;; Symbolic names for the arguments
+%define len arg0
+%define vec arg1
+;; vec_i: index of the source whose table entry is applied
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+;; pos: byte offset into src and dest1
+%define pos rax
+
+;; XLDR / XSTR: instructions used for the full 32-byte loads and stores
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Buffers are declared aligned: use non-temporal load/store,
+;;; or plain aligned load/store when NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+;; Vector register map. The l/h/x suffixes name the first, second and
+;; third 32-byte lane of a 96-byte step. ymm6-ymm8 overlap xmm6-xmm8,
+;; which the win64 FUNC_SAVE spills.
+
+;; source data
+%define x0l ymm0
+%define x0h ymm1
+%define x0x ymm2
+
+;; broadcast GF table entry of the selected source
+%define xgft1 ymm3
+
+;; current parity data, updated in place
+%define xd1l ymm4
+%define xd1h ymm5
+%define xd1x ymm6
+
+;; scratch for GF_MUL_XOR
+%define xret1l ymm7
+%define xret1h ymm8
+
+;; single-lane aliases used by the 32-byte and tail paths
+%define x0 x0l
+%define xd1 xd1l
+%define xret1 xret1l
+
+;;
+;; ENCODE_96B: fold 96 source bytes at src + pos, multiplied by the table
+;; entry held in xgft1, into the 96 parity bytes at dest1 + pos.
+;; Every load is issued before the first store.
+;; Overwrites x0l/x0h/x0x, xd1l/xd1h/xd1x, xret1l and xret1h.
+;;
+%macro ENCODE_96B 0
+        ;; fetch source and current parity, one 32-byte lane at a time
+        XLDR    x0l,  [src + pos]
+        XLDR    xd1l, [dest1 + pos]
+        XLDR    x0h,  [src + pos + 32]
+        XLDR    xd1h, [dest1 + pos + 32]
+        XLDR    x0x,  [src + pos + 64]
+        XLDR    xd1x, [dest1 + pos + 64]
+
+        ;; xret1l is the scratch for both the first and the third lane
+        GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l
+        GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h
+        GF_MUL_XOR VEX, x0x, xgft1, xret1l, xd1x
+
+        ;; write the updated parity back
+        XSTR    [dest1 + pos], xd1l
+        XSTR    [dest1 + pos + 32], xd1h
+        XSTR    [dest1 + pos + 64], xd1x
+%endmacro
+
+;;
+;; ENCODE_64B: fold 64 source bytes at src + pos, multiplied by the table
+;; entry held in xgft1, into the 64 parity bytes at dest1 + pos.
+;; Every load is issued before the first store.
+;; Overwrites x0l/x0h, xd1l/xd1h, xret1l and xret1h.
+;;
+%macro ENCODE_64B 0
+        ;; fetch source and current parity, one 32-byte lane at a time
+        XLDR    x0l,  [src + pos]
+        XLDR    xd1l, [dest1 + pos]
+        XLDR    x0h,  [src + pos + 32]
+        XLDR    xd1h, [dest1 + pos + 32]
+
+        GF_MUL_XOR VEX, x0l, xgft1, xret1l, xd1l
+        GF_MUL_XOR VEX, x0h, xgft1, xret1h, xd1h
+
+        ;; write the updated parity back
+        XSTR    [dest1 + pos], xd1l
+        XSTR    [dest1 + pos + 32], xd1h
+%endmacro
+
+;;
+;; ENCODE_32B: fold 32 source bytes at src + pos, multiplied by the table
+;; entry held in xgft1, into the 32 parity bytes at dest1 + pos.
+;; Overwrites x0, xd1 and xret1.
+;;
+%macro ENCODE_32B 0
+        XLDR    xd1, [dest1 + pos]      ; current parity
+        XLDR    x0,  [src + pos]        ; source data
+
+        GF_MUL_XOR VEX, x0, xgft1, xret1, xd1
+
+        XSTR    [dest1 + pos], xd1      ; updated parity
+%endmacro
+
+;;
+;; Encodes less than 32 bytes of a single source and updates parity disk
+;;
+;; %1 (LEN): register holding the number of bytes to process.
+;; dest1 is overwritten with dest1 + pos before the store and is not
+;; restored, so no other ENCODE_* macro may run after this one in the
+;; same call.
+;; simd_load_avx2 / simd_store_avx2 are defined outside this file
+;; (presumably memcpy.asm; their last two operands look like scratch
+;; registers -- confirm there).
+;;
+%macro ENCODE_LT_32B 1
+%define %%LEN %1
+ ;; get next source vector
+ simd_load_avx2 x0, src + pos, %%LEN, tmp, tmp2
+ ;; get next dest vector
+ simd_load_avx2 xd1, dest1 + pos, %%LEN, tmp, tmp2
+
+ GF_MUL_XOR VEX, x0, xgft1, xret1, xd1
+
+ ;; the store helper is given a plain register as its address
+ lea dest1, [dest1 + pos]
+ simd_store_avx2 dest1, xd1, %%LEN, tmp, tmp2
+%endmacro
+
+;;
+;; gf_vect_mad_avx2_gfni(len, vec, vec_i, mul_array, src, dest)
+;;
+;; Multiplies len bytes of src by table entry number vec_i of mul_array and
+;; XORs the product into dest. Works in 96-byte steps, then at most one
+;; 64-byte step, one 32-byte step and one partial (< 32 byte) step.
+;; pos (rax) is the running byte offset; len counts the bytes still to do.
+;; vec is accepted for interface compatibility but is not needed here:
+;; there is a single destination, so only one table entry is ever read.
+;;
+align 16
+global gf_vect_mad_avx2_gfni, function
+func(gf_vect_mad_avx2_gfni)
+        FUNC_SAVE
+
+        xor     pos, pos
+        ;; table entries are 8 bytes each: broadcast entry number vec_i.
+        ;; (Replaces the former shl/lea/load sequence and the unused
+        ;; `shl vec, 3`; vec_i and tmp are not read again after this point
+        ;; except as scratch in the tail macro.)
+        vbroadcastsd xgft1, [mul_array + vec_i*8]
+
+        cmp     len, 96
+        jl      .len_lt_96
+
+.loop96:
+        ENCODE_96B                      ;; loop on 96 bytes at a time
+        add     pos, 96
+        sub     len, 96
+        cmp     len, 96
+        jge     .loop96
+
+.len_lt_96:
+        cmp     len, 64
+        jl      .len_lt_64
+
+        ENCODE_64B                      ;; encode next 64 bytes
+
+        add     pos, 64
+        sub     len, 64
+
+.len_lt_64:
+        cmp     len, 32
+        jl      .len_lt_32
+
+        ENCODE_32B                      ;; encode next 32 bytes
+
+        add     pos, 32
+        sub     len, 32
+
+.len_lt_32:
+        cmp     len, 0
+        jle     .exit
+
+        ENCODE_LT_32B len               ;; encode final bytes
+
+.exit:
+        vzeroupper
+
+        FUNC_RESTORE
+        ret
+
+endproc_frame
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm
index adc2acf3e8..6f1bf35197 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.asm
@@ -44,7 +44,7 @@
%define arg5 r9
%define tmp r11
%define return rax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -101,8 +101,8 @@
%else
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
- %define XLDR vmovdqa
- %define XSTR vmovdqa
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
@@ -127,13 +127,8 @@ section .text
%define xmask0f zmm8
align 16
-global gf_vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_mad_avx512, function
func(gf_vect_mad_avx512)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_mad_avx512)
-%endif
-
FUNC_SAVE
sub len, 64
jl .return_fail
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.patch
deleted file mode 100644
index 9a20fa281a..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-132,136d131
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_mad_avx512:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_mad_avx512)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512_gfni.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512_gfni.asm
new file mode 100644
index 0000000000..09073ce157
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_avx512_gfni.asm
@@ -0,0 +1,175 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2023 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+%include "gf_vect_gfni.inc"
+
+%if AS_FEATURE_LEVEL >= 10
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi ; the six arguments arrive in registers on this target
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11 ; scratch used to build the tail mask
+ %define func(x) x: endbranch ; emits the label followed by endbranch (defined outside this file)
+ %define FUNC_SAVE ; expands to nothing: no registers are saved on this target
+ %define FUNC_RESTORE ; expands to nothing, matching FUNC_SAVE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved and loaded
+ %define arg5 r13 ; likewise saved by FUNC_SAVE, then loaded from the stack
+ %define tmp r11
+ %define stack_size 3*8 ; 24 bytes of locals; only the first two 8-byte slots are written below
+ %define arg(x) [rsp + stack_size + 8 + 8*x] ; stack slot of argument x, valid after FUNC_SAVE's sub (skips locals + 8 bytes, presumably the return address)
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ mov [rsp + 0*8], r12 ; r12/r13 are callee-saved on Win64, so keep the caller's values
+ mov [rsp + 1*8], r13
+ end_prolog
+ mov arg4, arg(4) ; fifth argument (src) comes from the caller's stack
+ mov arg5, arg(5) ; sixth argument (dest) likewise
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ mov r12, [rsp + 0*8] ; undo FUNC_SAVE in reverse
+ mov r13, [rsp + 1*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+;;; gf_vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define vec arg1 ; not referenced by the code in this file
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest arg5
+%define pos rax ; running byte offset into src/dest
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store: plain aligned moves if NO_NT_LDST is defined, otherwise non-temporal ones
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa64
+ %define XSTR vmovdqa64
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define x0 zmm0
+%define xd zmm1
+%define xgft1 zmm2
+%define xret1 zmm3
+
+;; ENCODE_64B [kmask]
+;; Encodes 64 bytes of a single source into 64 bytes (single parity disk)
+;; Optional %1 = opmask register; when given, both loads and the store are byte-masked with it
+%macro ENCODE_64B 0-1
+%define %%KMASK %1
+
+%if %0 == 1
+ vmovdqu8 x0{%%KMASK}, [src + pos] ;Get next source vector (merge-masked: unselected bytes keep the old register contents)
+ vmovdqu8 xd{%%KMASK}, [dest + pos] ;Get next dest vector (same mask)
+%else
+ XLDR x0, [src + pos] ;Get next source vector
+ XLDR xd, [dest + pos] ;Get next dest vector
+%endif
+
+ GF_MUL_XOR EVEX, x0, xgft1, xret1, xd ;presumably xd ^= x0 * xgft1 with xret1 as scratch -- confirm in gf_vect_gfni.inc
+
+%if %0 == 1
+ vmovdqu8 [dest + pos]{%%KMASK}, xd ;only the bytes selected by the mask are written
+%else
+ XSTR [dest + pos], xd
+%endif
+%endmacro
+;; Entry point: updates dest from src 64 bytes at a time, then a masked tail, using the 8-byte entry at mul_array + 8*vec_i
+align 16
+global gf_vect_mad_avx512_gfni, function
+func(gf_vect_mad_avx512_gfni)
+ FUNC_SAVE
+ xor pos, pos ;pos = byte offset into src/dest, starts at 0
+ shl vec_i, 3 ;Multiply by 8
+
+ vbroadcastf32x2 xgft1, [vec_i + mul_array] ;replicate the 8 bytes at mul_array + 8*vec_i across xgft1
+
+ cmp len, 64
+ jl .len_lt_64 ;signed compare: a negative len ends up at .exit
+.loop64:
+ ENCODE_64B
+
+ add pos, 64 ;Loop on 64 bytes at a time
+ sub len, 64 ;len always holds the bytes still to be processed
+ cmp len, 64
+ jge .loop64
+
+.len_lt_64: ;; len < 64 here
+ cmp len, 0
+ jle .exit ;nothing left (or len was never positive)
+
+ xor tmp, tmp
+ bts tmp, len ;tmp = 1 << len, with len in 1..63 at this point
+ dec tmp ;tmp = low len bits set
+ kmovq k1, tmp ;one mask bit per remaining byte
+
+ ENCODE_64B k1
+
+.exit:
+ vzeroupper
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame ;; pairs with the proc_frame that func() expands to in the win64 build
+%endif ; if AS_FEATURE_LEVEL >= 10
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm
index ea48612324..c3afe72041 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.asm
@@ -82,7 +82,7 @@
%define return rax
%define return.w eax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
@@ -131,13 +131,8 @@ section .text
align 16
-global gf_vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_mad_sse, function
func(gf_vect_mad_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_mad_sse)
-%endif
-
FUNC_SAVE
sub len, 16
jl .return_fail
@@ -197,6 +192,3 @@ section .data
align 16
mask0f: dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_vect_mad_sse, 00, 01, 0200
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.patch
deleted file mode 100644
index 9af95a1e02..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-136,140d135
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_mad_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c
index e2cbc1063d..3a552b2c08 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.c
@@ -31,7 +31,7 @@
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
-// #include "types.h"
+#include "test.h"
#ifndef ALIGN_SIZE
# define ALIGN_SIZE 32
@@ -51,7 +51,7 @@
#define str(s) #s
#define xstr(s) str(s)
-#define TEST_LEN 2048
+#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 20000
@@ -249,7 +249,9 @@ int main(int argc, char *argv[])
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
// Rand data test
@@ -294,7 +296,9 @@ int main(int argc, char *argv[])
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Rand data test with varied parameters
@@ -340,7 +344,9 @@ int main(int argc, char *argv[])
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
}
@@ -390,7 +396,9 @@ int main(int argc, char *argv[])
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Test rand ptr alignment if available
@@ -462,7 +470,9 @@ int main(int argc, char *argv[])
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Test all size alignment
@@ -509,7 +519,9 @@ int main(int argc, char *argv[])
}
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.patch
deleted file mode 100644
index 21bbfaa667..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mad_test.patch
+++ /dev/null
@@ -1,4 +0,0 @@
-34c34
-< // #include "types.h"
----
-> #include "types.h"
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm
index 86121b298a..d8d8c4c050 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.asm
@@ -38,11 +38,8 @@
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
- %define arg4 r8
- %define arg5 r9
- %define tmp r11
%define return rax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
@@ -56,11 +53,11 @@
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
- save_xmm128 xmm6, 0*16
- save_xmm128 xmm7, 1*16
- save_xmm128 xmm13, 2*16
- save_xmm128 xmm14, 3*16
- save_xmm128 xmm15, 4*16
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm13
+ vmovdqa [rsp + 3*16], xmm14
+ vmovdqa [rsp + 4*16], xmm15
end_prolog
%endmacro
@@ -81,6 +78,7 @@
%define src arg2
%define dest arg3
%define pos return
+%define tmp r11
;;; Use Non-temporal load/stor
@@ -111,13 +109,16 @@ section .text
%define xtmp2c xmm7
align 16
-global gf_vect_mul_avx:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_mul_avx, function
func(gf_vect_mul_avx)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_mul_avx:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_mul_avx)
-%endif
+
+ ; Check if length is multiple of 32 bytes
+ mov tmp, len
+ and tmp, 0x1f
+ jnz return_fail
+
FUNC_SAVE
+
mov pos, 0
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
vmovdqu xgft_lo, [mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
@@ -144,14 +145,13 @@ loop32:
XSTR [dest+pos-16], xtmp2b ;Store +16B result
jl loop32
+ FUNC_RESTORE
return_pass:
- FUNC_RESTORE
- sub pos, len
+ xor return, return
ret
return_fail:
- FUNC_RESTORE
mov return, 1
ret
@@ -163,6 +163,3 @@ align 16
mask0f:
dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_vect_mul_avx, 01, 03, 0036
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.patch
deleted file mode 100644
index 99d4bd2d35..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_avx.patch
+++ /dev/null
@@ -1,5 +0,0 @@
-116,119d115
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_mul_avx:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_mul_avx)
-< %endif
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c
index 81f04c4443..5ac40cd079 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.c
@@ -63,7 +63,10 @@ int main(int argc, char *argv[])
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
- gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2);
+ if (gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2) != 0) {
+ printf("fail fill with rand data\n");
+ return 1;
+ }
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
@@ -72,8 +75,10 @@ int main(int argc, char *argv[])
return 1;
}
- gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
-
+ if (gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3) != 0) {
+ printf("fail fill with rand data for buff1\n");
+ return -1;
+ }
// Check reference function
for (i = 0; i < TEST_SIZE; i++)
if (buff2[i] != buff3[i]) {
@@ -89,7 +94,10 @@ int main(int argc, char *argv[])
printf("Random tests ");
for (a = 0; a != 255; a++) {
gf_vect_mul_init(a, gf_const_tbl);
- gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2);
+ if (gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2) != 0) {
+ printf("fail random tests\n");
+ return 1;
+ }
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
@@ -97,7 +105,9 @@ int main(int argc, char *argv[])
i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i]));
return 1;
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Run tests at end of buffer for Electric Fence
@@ -110,7 +120,11 @@ int main(int argc, char *argv[])
efence_buff1 = buff1 + size;
efence_buff2 = buff2 + size;
- gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
+ if (gf_vect_mul_base
+ (TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2) != 0) {
+ printf("fail tests at end of buffer\n");
+ return -1;
+ }
for (i = 0; i < TEST_SIZE - size; i++)
if (gf_mul_erasure(a, efence_buff1[i]) != efence_buff2[i]) {
@@ -121,7 +135,9 @@ int main(int argc, char *argv[])
return 1;
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
printf(" done: Pass\n");
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.patch
deleted file mode 100644
index 84c965985f..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_base_test.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-69c69
-< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
----
-> if (gf_mul(a, buff1[i]) != buff2[i]) {
-71c71
-< gf_mul_erasure(2, buff1[i]));
----
-> gf_mul(2, buff1[i]));
-81c81
-< i, a, buff1[i], buff2[i], gf_mul_erasure(a, buff1[i]));
----
-> i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
-95c95
-< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
----
-> if (gf_mul(a, buff1[i]) != buff2[i]) {
-97c97
-< i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i]));
----
-> i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
-116c116
-< if (gf_mul_erasure(a, efence_buff1[i]) != efence_buff2[i]) {
----
-> if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
-118c118
-< i, efence_buff1[i], efence_buff2[i], gf_mul_erasure(2,
----
-> i, efence_buff1[i], efence_buff2[i], gf_mul(2,
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c b/contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c
index 58194ccebc..ae41cee43e 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_perf.c
@@ -33,21 +33,22 @@
#include "erasure_code.h"
#include "test.h"
-//#define CACHED_TEST
-#ifdef CACHED_TEST
+#ifndef GT_L3_CACHE
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+#endif
+
+#if !defined(COLD_TEST) && !defined(TEST_CUSTOM)
// Cached test, loop many times over small dataset
+# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_TYPE_STR "_warm"
-#else
-# ifndef TEST_CUSTOM
+#elif defined (COLD_TEST)
// Uncached test. Pull from large mem base.
-# define TEST_SOURCES 10
-# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
-# define TEST_LEN GT_L3_CACHE / 2
-# define TEST_TYPE_STR "_cold"
-# else
-# define TEST_TYPE_STR "_cus"
-# endif
+# define TEST_SOURCES 10
+# define TEST_LEN (GT_L3_CACHE / 2)
+# define TEST_TYPE_STR "_cold"
+#elif defined (TEST_CUSTOM)
+# define TEST_TYPE_STR "_cus"
#endif
#define TEST_MEM (2 * TEST_LEN)
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm b/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm
index 01a3269d65..ddccb129e1 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.asm
@@ -38,11 +38,8 @@
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
- %define arg4 r8
- %define arg5 r9
- %define tmp r11
%define return rax
- %define func(x) x:
+ %define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
@@ -81,6 +78,7 @@
%define src arg2
%define dest arg3
%define pos return
+%define tmp r11
;;; Use Non-temporal load/stor
@@ -112,14 +110,15 @@ section .text
align 16
-global gf_vect_mul_sse:ISAL_SYM_TYPE_FUNCTION
+global gf_vect_mul_sse, function
func(gf_vect_mul_sse)
-%ifidn __OUTPUT_FORMAT__, macho64
-global _gf_vect_mul_sse:ISAL_SYM_TYPE_FUNCTION
-func(_gf_vect_mul_sse)
-%endif
+ ; Check if length is multiple of 32 bytes
+ mov tmp, len
+ and tmp, 0x1f
+ jnz return_fail
FUNC_SAVE
+
mov pos, 0
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
movdqu xgft_lo, [mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
@@ -152,15 +151,14 @@ loop32:
cmp pos, len
jl loop32
+ FUNC_RESTORE
return_pass:
- sub pos, len
- FUNC_RESTORE
+ xor return, return
ret
return_fail:
mov return, 1
- FUNC_RESTORE
ret
endproc_frame
@@ -170,6 +168,3 @@ section .data
align 16
mask0f:
dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-
-;;; func core, ver, snum
-slversion gf_vect_mul_sse, 00, 03, 0034
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.patch
deleted file mode 100644
index 93027e3cf7..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_sse.patch
+++ /dev/null
@@ -1,6 +0,0 @@
-117,121d116
-< %ifidn __OUTPUT_FORMAT__, macho64
-< global _gf_vect_mul_sse:ISAL_SYM_TYPE_FUNCTION
-< func(_gf_vect_mul_sse)
-< %endif
-<
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c b/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c
index c34e88b889..7e6457879c 100644
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c
+++ b/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.c
@@ -31,14 +31,14 @@
#include <stdlib.h>
#include "erasure_code.h"
-#define TEST_SIZE (64*1024)
+#define TEST_SIZE (128*1024)
typedef unsigned char u8;
int main(int argc, char *argv[])
{
- int i;
- u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
+ int i, ret = -1;
+ u8 *buff1 = NULL, *buff2 = NULL, *buff3 = NULL, gf_const_tbl[64], a = 2;
int tsize;
int align, size;
unsigned char *efence_buff1;
@@ -55,30 +55,35 @@ int main(int argc, char *argv[])
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
printf("buffer alloc error\n");
- return -1;
+ goto exit;
}
// Fill with rand data
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
- gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);
+ if (gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2) != 0) {
+ printf("fail creating buff2\n");
+ goto exit;
+ }
for (i = 0; i < TEST_SIZE; i++) {
if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i,
buff1[i], buff2[i], gf_mul_erasure(2, buff1[i]));
- return -1;
+ goto exit;
}
}
- gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
-
+ if (gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3) != 0) {
+ printf("fail fill with rand data\n");
+ goto exit;
+ }
// Check reference function
for (i = 0; i < TEST_SIZE; i++) {
if (buff2[i] != buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul_erasure(a, buff1[i]));
- return -1;
+ goto exit;
}
}
@@ -88,33 +93,43 @@ int main(int argc, char *argv[])
// Check each possible constant
for (a = 0; a != 255; a++) {
gf_vect_mul_init(a, gf_const_tbl);
- gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);
+ if (gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2) != 0) {
+ printf("fail creating buff2\n");
+ goto exit;
+ }
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i]));
- return -1;
+ goto exit;
}
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
}
// Check buffer len
for (tsize = TEST_SIZE; tsize > 0; tsize -= 32) {
a = rand();
gf_vect_mul_init(a, gf_const_tbl);
- gf_vect_mul(tsize, gf_const_tbl, buff1, buff2);
+ if (gf_vect_mul(tsize, gf_const_tbl, buff1, buff2) != 0) {
+ printf("fail creating buff2 (len %d)\n", tsize);
+ goto exit;
+ }
for (i = 0; i < tsize; i++)
if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i]));
- return -1;
+ goto exit;
}
+#ifdef TEST_VERBOSE
if (0 == tsize % (32 * 8)) {
putchar('.');
fflush(0);
}
+#endif
}
// Run tests at end of buffer for Electric Fence
@@ -135,24 +150,46 @@ int main(int argc, char *argv[])
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
i, efence_buff1[i], efence_buff2[i],
gf_mul_erasure(2, efence_buff1[i]));
- return 1;
+ goto exit;
}
- gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
-
+ if (gf_vect_mul_base
+ (TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3) != 0) {
+ printf("fail line up TEST_SIZE from end\n");
+ goto exit;
+ }
// Check reference function
for (i = 0; i < TEST_SIZE - size; i++)
if (efence_buff2[i] != efence_buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, efence_buff2[i], efence_buff3[i],
gf_mul_erasure(2, efence_buff1[i]));
- return 1;
+ goto exit;
}
-
+#ifdef TEST_VERBOSE
putchar('.');
+#endif
+ }
+
+ // Test all unsupported sizes up to TEST_SIZE
+ for (size = 0; size < TEST_SIZE; size++) {
+ if (size % align != 0 && gf_vect_mul(size, gf_const_tbl, buff1, buff2) == 0) {
+ printf
+ ("fail expecting nonzero return code for unaligned size param (%d)\n",
+ size);
+ goto exit;
+ }
}
printf(" done: Pass\n");
fflush(0);
- return 0;
+
+ ret = 0;
+ exit:
+
+ free(buff1);
+ free(buff2);
+ free(buff3);
+
+ return ret;
}
diff --git a/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.patch b/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.patch
deleted file mode 100644
index 4f5b5b575d..0000000000
--- a/contrib/libs/isa-l/erasure_code/gf_vect_mul_test.patch
+++ /dev/null
@@ -1,40 +0,0 @@
-67c67
-< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
----
-> if (gf_mul(a, buff1[i]) != buff2[i]) {
-69c69
-< buff1[i], buff2[i], gf_mul_erasure(2, buff1[i]));
----
-> buff1[i], buff2[i], gf_mul(2, buff1[i]));
-80c80
-< i, a, buff1[i], buff2[i], gf_mul_erasure(a, buff1[i]));
----
-> i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
-94c94
-< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
----
-> if (gf_mul(a, buff1[i]) != buff2[i]) {
-96c96
-< i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i]));
----
-> i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
-109c109
-< if (gf_mul_erasure(a, buff1[i]) != buff2[i]) {
----
-> if (gf_mul(a, buff1[i]) != buff2[i]) {
-111c111
-< i, a, buff1[i], buff2[i], gf_mul_erasure(2, buff1[i]));
----
-> i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
-134c134
-< if (gf_mul_erasure(a, efence_buff1[i]) != efence_buff2[i]) {
----
-> if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
-137c137
-< gf_mul_erasure(2, efence_buff1[i]));
----
-> gf_mul(2, efence_buff1[i]));
-148c148
-< gf_mul_erasure(2, efence_buff1[i]));
----
-> gf_mul(2, efence_buff1[i]));
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/Makefile.am b/contrib/libs/isa-l/erasure_code/ppc64le/Makefile.am
new file mode 100644
index 0000000000..9d263ac22f
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/Makefile.am
@@ -0,0 +1,15 @@
+lsrc_ppc64le += erasure_code/ppc64le/ec_base_vsx.c \
+ erasure_code/ppc64le/gf_vect_mul_vsx.c \
+ erasure_code/ppc64le/gf_vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_2vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_3vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_4vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_5vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_6vect_mad_vsx.c
+
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.c
new file mode 100644
index 0000000000..c3163a58ff
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.c
@@ -0,0 +1,106 @@
+#include "erasure_code.h"
+#include "ec_base_vsx.h"
+/* gf_vect_dot_prod: forwards every argument unchanged to gf_vect_dot_prod_vsx. */
+void gf_vect_dot_prod(int len, int vlen, unsigned char *v,
+ unsigned char **src, unsigned char *dest)
+{
+ gf_vect_dot_prod_vsx(len, vlen, v, src, dest);
+}
+/* gf_vect_mad: forwards every argument unchanged to gf_vect_mad_vsx. */
+void gf_vect_mad(int len, int vec, int vec_i, unsigned char *v,
+ unsigned char *src, unsigned char *dest)
+{
+ gf_vect_mad_vsx(len, vec, vec_i, v, src, dest);
+
+}
+/* ec_encode_data: produces the dests outputs in groups of up to six via the gf_Nvect_dot_prod_vsx kernels. */
+void ec_encode_data(int len, int srcs, int dests, unsigned char *v,
+ unsigned char **src, unsigned char **dest)
+{
+ if (len < 64) { /* short buffers are handed to the base implementation */
+ ec_encode_data_base(len, srcs, dests, v, src, dest);
+ return;
+ }
+
+ while (dests >= 6) { /* full groups of six outputs */
+ gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
+ v += 6 * srcs * 32; /* step past the 6 * srcs table entries (32 bytes each) just used */
+ dest += 6;
+ dests -= 6;
+ }
+ switch (dests) { /* remaining outputs; no default, so other values do nothing */
+ case 6: /* not reachable: the loop above only exits once dests < 6 */
+ gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 5:
+ gf_5vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 4:
+ gf_4vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 3:
+ gf_3vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 2:
+ gf_2vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 1:
+ gf_vect_dot_prod_vsx(len, srcs, v, src, *dest); /* single-output kernel takes the buffer itself, not the pointer array */
+ break;
+ case 0:
+ break;
+ }
+}
+/* ec_encode_data_update: applies one source buffer to the rows outputs in groups of up to six via the gf_Nvect_mad_vsx kernels. */
+void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *v,
+ unsigned char *data, unsigned char **dest)
+{
+ if (len < 64) { /* short buffers are handed to the base implementation */
+ ec_encode_data_update_base(len, k, rows, vec_i, v, data, dest);
+ return;
+ }
+
+ while (rows >= 6) { /* full groups of six outputs */
+ gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
+ v += 6 * k * 32; /* step past the 6 * k table entries (32 bytes each) just used */
+ dest += 6;
+ rows -= 6;
+ }
+ switch (rows) { /* remaining outputs; no default, so other values do nothing */
+ case 6: /* not reachable: the loop above only exits once rows < 6 */
+ gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 5:
+ gf_5vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 4:
+ gf_4vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 3:
+ gf_3vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 2:
+ gf_2vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 1:
+ gf_vect_mad_vsx(len, k, vec_i, v, data, *dest); /* single-output kernel takes the buffer itself, not the pointer array */
+ break;
+ case 0:
+ break;
+ }
+}
+/* gf_vect_mul: returns -1 without touching dest when len is not a multiple of 32, otherwise runs gf_vect_mul_vsx and returns 0. */
+int gf_vect_mul(int len, unsigned char *a, void *src, void *dest)
+{
+ /* Size must be aligned to 32 bytes */
+ if ((len % 32) != 0)
+ return -1;
+
+ gf_vect_mul_vsx(len, a, (unsigned char *)src, (unsigned char *)dest); /* void pointers are cast to the byte pointers the kernel takes */
+ return 0;
+}
+/* ec_init_tables: forwards every argument unchanged to ec_init_tables_base. */
+void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
+{
+ ec_init_tables_base(k, rows, a, g_tbls); /* plain call: ISO C forbids `return <expr>;` in a void function */
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.h b/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.h
new file mode 100644
index 0000000000..c808629a95
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/ec_base_vsx.h
@@ -0,0 +1,338 @@
+#ifndef _ERASURE_CODE_PPC64LE_H_
+#define _ERASURE_CODE_PPC64LE_H_
+
+#include "erasure_code.h"
+#include <altivec.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Compiler-specific helpers used by the VSX kernels:
+ *   EC_vec_xl(off, ptr)        - vector load from ptr + off
+ *   EC_vec_permxor(va, vb, vc) - wrapper around the vpermxor builtin
+ *
+ * On the GCC 7 and fallback branches the third permxor operand is passed
+ * complemented (vec_nor(vc, vc)).
+ */
+#if defined(__ibmxl__)
+#define EC_vec_xl(a, b) vec_xl_be(a, b)
+#define EC_vec_permxor(va, vb, vc) __vpermxor(va, vb, vc)
+#elif defined __GNUC__ && __GNUC__ >= 8
+#define EC_vec_xl(a, b) vec_xl_be(a, b)
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vc)
+#elif defined __GNUC__ && __GNUC__ >= 7
+#if defined _ARCH_PWR9
+#define EC_vec_xl(a, b) vec_vsx_ld(a, b)
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#else
+/*
+ * static: a plain C99 `inline` definition in a header provides no external
+ * definition, so any call the compiler does not inline would fail to link.
+ */
+static inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
+        vector unsigned char vc;
+        /* "b" keeps off out of r0, which reads as literal 0 in the RA slot of lxvd2x. */
+        __asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "b" (off), "r" (ptr));
+        return vc;
+}
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#endif
+#else
+#if defined _ARCH_PWR8
+/*
+ * static: a plain C99 `inline` definition in a header provides no external
+ * definition, so any call the compiler does not inline would fail to link.
+ */
+static inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
+        vector unsigned char vc;
+        /* "b" keeps off out of r0, which reads as literal 0 in the RA slot of lxvd2x. */
+        __asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "b" (off), "r" (ptr));
+        return vc;
+}
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#else
+#error "This code is only supported on ppc64le."
+#endif
+#endif
+
+/**
+ * @brief GF(2^8) vector multiply. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and save to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32 byte constant array based on the input
+ * coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32.
+ * @param src Pointer to source input array.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mul_vsx(int len, unsigned char *gftbls, unsigned char *src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product. VSX version.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ * on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and add to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32*vec byte constant array based on the input
+ * coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 2*32*vec byte constant array
+ * (one 32*vec byte table per destination) based on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 2*32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 3*32*vec byte constant array
+ * (one 32*vec byte table per destination) based on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 3*32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 4*32*vec byte constant array
+ * (one 32*vec byte table per destination) based on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 4*32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ERASURE_CODE_PPC64LE_H_
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c
new file mode 100644
index 0000000000..3cb269ccef
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c
@@ -0,0 +1,83 @@
+#include "ec_base_vsx.h"
+
+/*
+ * GF(2^8) dot product of vlen sources into two destinations (VSX).
+ *
+ * With fewer than 128 sources the result is built from gf_vect_mul_vsx() on
+ * source 0 followed by one gf_2vect_mad_vsx() pass per remaining source.
+ * Otherwise the leading len % 64 bytes go through gf_vect_dot_prod_base()
+ * and the rest is processed 64 bytes per iteration.
+ *
+ * len    - length of each buffer in bytes
+ * vlen   - number of source buffers
+ * gftbls - two consecutive tables of 32 * vlen bytes, one per destination
+ * src    - array of vlen source pointers
+ * dest   - array of two destination pointers
+ */
+void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+                           unsigned char **src, unsigned char **dest)
+{
+        unsigned char *s, *t0, *t1;
+        vector unsigned char vX1, vX2, vX3, vX4;
+        vector unsigned char vY1, vY2, vY3, vY4;
+        vector unsigned char vYD, vYE, vYF, vYG;
+        vector unsigned char vhi0, vlo0, vhi1, vlo1;
+        /* Explicit zero: `v = v ^ v` on an uninitialized vector reads an
+         * indeterminate value. */
+        const vector unsigned char vzero = vec_splats((unsigned char)0);
+        int i, j, head;
+
+        if (vlen < 128) {
+                gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+                gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+
+                for (j = 1; j < vlen; j++) {
+                        gf_2vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+                }
+                return;
+        }
+
+        t0 = (unsigned char *)dest[0];
+        t1 = (unsigned char *)dest[1];
+
+        /* Bytes that do not fill a 64-byte block are handled by the base code. */
+        head = len % 64;
+        if (head != 0) {
+                gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+                gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+        }
+
+        for (i = head; i < len - 63; i += 64) {
+                /* vY1/vY2/vYD/vYE accumulate dest 0, vY3/vY4/vYF/vYG dest 1. */
+                vY1 = vzero;
+                vY2 = vzero;
+                vY3 = vzero;
+                vY4 = vzero;
+
+                vYD = vzero;
+                vYE = vzero;
+                vYF = vzero;
+                vYG = vzero;
+
+                unsigned char *g0 = &gftbls[0 * 32 * vlen];
+                unsigned char *g1 = &gftbls[1 * 32 * vlen];
+
+                for (j = 0; j < vlen; j++) {
+                        s = (unsigned char *)src[j];
+                        vX1 = vec_xl(0, s + i);
+                        vX2 = vec_xl(16, s + i);
+                        vX3 = vec_xl(32, s + i);
+                        vX4 = vec_xl(48, s + i);
+
+                        vlo0 = EC_vec_xl(0, g0);
+                        vhi0 = EC_vec_xl(16, g0);
+                        vlo1 = EC_vec_xl(0, g1);
+                        vhi1 = EC_vec_xl(16, g1);
+
+                        vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+                        vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+                        vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+                        vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+                        vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+                        vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+                        vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+                        vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+                        /* Advance to the 32-byte table of the next source. */
+                        g0 += 32;
+                        g1 += 32;
+                }
+
+                vec_xst(vY1, 0, t0 + i);
+                vec_xst(vY2, 16, t0 + i);
+                vec_xst(vY3, 0, t1 + i);
+                vec_xst(vY4, 16, t1 + i);
+
+                vec_xst(vYD, 32, t0 + i);
+                vec_xst(vYE, 48, t0 + i);
+                vec_xst(vYF, 32, t1 + i);
+                vec_xst(vYG, 48, t1 + i);
+        }
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c
new file mode 100644
index 0000000000..621684a5fb
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c
@@ -0,0 +1,65 @@
+#include "ec_base_vsx.h"
+
+/*
+ * GF(2^8) multiply-accumulate of one source buffer into two destinations
+ * (VSX). The leading len % 64 bytes are passed to gf_vect_mad_base(); the
+ * remainder is processed in 64-byte blocks, XOR-ing the permxor result of
+ * the source with the current destination contents.
+ */
+void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+                      unsigned char *src, unsigned char **dest)
+{
+        unsigned char *out0 = (unsigned char *)dest[0];
+        unsigned char *out1 = (unsigned char *)dest[1];
+        unsigned char *tbl0, *tbl1;
+        vector unsigned char vS1, vS2, vS3, vS4;
+        vector unsigned char vA1, vA2, vA3, vA4;
+        vector unsigned char vB1, vB2, vB3, vB4;
+        vector unsigned char lo0, hi0, lo1, hi1;
+        int pos;
+        const int head = len % 64;
+
+        if (head != 0) {
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, out0);
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, out1);
+        }
+
+        /* 32-byte table of source vec_i for each destination row. */
+        tbl0 = gftbls + 32 * vec_i;
+        tbl1 = tbl0 + 32 * vec;
+
+        lo0 = EC_vec_xl(0, tbl0);
+        hi0 = EC_vec_xl(16, tbl0);
+        lo1 = EC_vec_xl(0, tbl1);
+        hi1 = EC_vec_xl(16, tbl1);
+
+        for (pos = head; pos < len - 63; pos += 64) {
+                unsigned char *in = src + pos;
+                unsigned char *o0 = out0 + pos;
+                unsigned char *o1 = out1 + pos;
+
+                vS1 = vec_xl(0, in);
+                vS2 = vec_xl(16, in);
+                vS3 = vec_xl(32, in);
+                vS4 = vec_xl(48, in);
+
+                vA1 = vec_xl(0, o0);
+                vA2 = vec_xl(16, o0);
+                vA3 = vec_xl(32, o0);
+                vA4 = vec_xl(48, o0);
+
+                vA1 ^= EC_vec_permxor(hi0, lo0, vS1);
+                vA2 ^= EC_vec_permxor(hi0, lo0, vS2);
+                vA3 ^= EC_vec_permxor(hi0, lo0, vS3);
+                vA4 ^= EC_vec_permxor(hi0, lo0, vS4);
+
+                /* The next destination is loaded before this one is stored. */
+                vB1 = vec_xl(0, o1);
+                vB2 = vec_xl(16, o1);
+                vB3 = vec_xl(32, o1);
+                vB4 = vec_xl(48, o1);
+
+                vec_xst(vA1, 0, o0);
+                vec_xst(vA2, 16, o0);
+                vec_xst(vA3, 32, o0);
+                vec_xst(vA4, 48, o0);
+
+                vB1 ^= EC_vec_permxor(hi1, lo1, vS1);
+                vB2 ^= EC_vec_permxor(hi1, lo1, vS2);
+                vB3 ^= EC_vec_permxor(hi1, lo1, vS3);
+                vB4 ^= EC_vec_permxor(hi1, lo1, vS4);
+
+                vec_xst(vB1, 0, o1);
+                vec_xst(vB2, 16, o1);
+                vec_xst(vB3, 32, o1);
+                vec_xst(vB4, 48, o1);
+        }
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c
new file mode 100644
index 0000000000..23b72dc4ba
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c
@@ -0,0 +1,104 @@
+#include "ec_base_vsx.h"
+
+/*
+ * GF(2^8) dot product of vlen sources into three destinations (VSX).
+ *
+ * With fewer than 128 sources the result is built from gf_vect_mul_vsx() on
+ * source 0 followed by one gf_3vect_mad_vsx() pass per remaining source.
+ * Otherwise the leading len % 64 bytes go through gf_vect_dot_prod_base()
+ * and the rest is processed 64 bytes per iteration.
+ *
+ * len    - length of each buffer in bytes
+ * vlen   - number of source buffers
+ * gftbls - three consecutive tables of 32 * vlen bytes, one per destination
+ * src    - array of vlen source pointers
+ * dest   - array of three destination pointers
+ */
+void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+                           unsigned char **src, unsigned char **dest)
+{
+        unsigned char *s, *t0, *t1, *t2;
+        vector unsigned char vX1, vX2, vX3, vX4;
+        vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
+        vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
+        vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+        /* Explicit zero: `v = v ^ v` on an uninitialized vector reads an
+         * indeterminate value. */
+        const vector unsigned char vzero = vec_splats((unsigned char)0);
+        int i, j, head;
+
+        if (vlen < 128) {
+                gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+                gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+                gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+
+                for (j = 1; j < vlen; j++) {
+                        gf_3vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+                }
+                return;
+        }
+
+        t0 = (unsigned char *)dest[0];
+        t1 = (unsigned char *)dest[1];
+        t2 = (unsigned char *)dest[2];
+
+        /* Bytes that do not fill a 64-byte block are handled by the base code. */
+        head = len % 64;
+        if (head != 0) {
+                gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+                gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+                gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+        }
+
+        for (i = head; i < len - 63; i += 64) {
+                /* Two accumulator pairs per destination (bytes 0-31 and 32-63). */
+                vY1 = vzero;
+                vY2 = vzero;
+                vY3 = vzero;
+                vY4 = vzero;
+                vY5 = vzero;
+                vY6 = vzero;
+
+                vYD = vzero;
+                vYE = vzero;
+                vYF = vzero;
+                vYG = vzero;
+                vYH = vzero;
+                vYI = vzero;
+
+                unsigned char *g0 = &gftbls[0 * 32 * vlen];
+                unsigned char *g1 = &gftbls[1 * 32 * vlen];
+                unsigned char *g2 = &gftbls[2 * 32 * vlen];
+
+                for (j = 0; j < vlen; j++) {
+                        s = (unsigned char *)src[j];
+                        vX1 = vec_xl(0, s + i);
+                        vX2 = vec_xl(16, s + i);
+                        vX3 = vec_xl(32, s + i);
+                        vX4 = vec_xl(48, s + i);
+
+                        vlo0 = EC_vec_xl(0, g0);
+                        vhi0 = EC_vec_xl(16, g0);
+                        vlo1 = EC_vec_xl(0, g1);
+                        vhi1 = EC_vec_xl(16, g1);
+
+                        vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+                        vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+                        vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+                        vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+                        /* Same table loader as g0/g1, so all three tables
+                         * reach EC_vec_permxor in the same form. */
+                        vlo2 = EC_vec_xl(0, g2);
+                        vhi2 = EC_vec_xl(16, g2);
+
+                        vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+                        vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+                        vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+                        vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+                        vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+                        vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+                        vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+                        vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+                        /* Advance to the 32-byte table of the next source. */
+                        g0 += 32;
+                        g1 += 32;
+                        g2 += 32;
+                }
+
+                vec_xst(vY1, 0, t0 + i);
+                vec_xst(vY2, 16, t0 + i);
+                vec_xst(vY3, 0, t1 + i);
+                vec_xst(vY4, 16, t1 + i);
+                vec_xst(vY5, 0, t2 + i);
+                vec_xst(vY6, 16, t2 + i);
+
+                vec_xst(vYD, 32, t0 + i);
+                vec_xst(vYE, 48, t0 + i);
+                vec_xst(vYF, 32, t1 + i);
+                vec_xst(vYG, 48, t1 + i);
+                vec_xst(vYH, 32, t2 + i);
+                vec_xst(vYI, 48, t2 + i);
+        }
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c
new file mode 100644
index 0000000000..ba90c1fdbf
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c
@@ -0,0 +1,84 @@
+#include "ec_base_vsx.h"
+
+/*
+ * GF(2^8) multiply-accumulate of one source buffer into three destinations
+ * (VSX). The leading len % 64 bytes are passed to gf_vect_mad_base(); the
+ * remainder is processed in 64-byte blocks, XOR-ing the permxor result of
+ * the source with the current destination contents.
+ */
+void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+                      unsigned char *src, unsigned char **dest)
+{
+        unsigned char *out0 = (unsigned char *)dest[0];
+        unsigned char *out1 = (unsigned char *)dest[1];
+        unsigned char *out2 = (unsigned char *)dest[2];
+        unsigned char *tbl0, *tbl1, *tbl2;
+        vector unsigned char vS1, vS2, vS3, vS4;
+        vector unsigned char vA1, vA2, vA3, vA4;
+        vector unsigned char vB1, vB2, vB3, vB4;
+        vector unsigned char vC1, vC2, vC3, vC4;
+        vector unsigned char lo0, hi0, lo1, hi1, lo2, hi2;
+        int pos;
+        const int head = len % 64;
+
+        if (head != 0) {
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, out0);
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, out1);
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, out2);
+        }
+
+        /* 32-byte table of source vec_i for each destination row. */
+        tbl0 = gftbls + 32 * vec_i;
+        tbl1 = tbl0 + 32 * vec;
+        tbl2 = tbl1 + 32 * vec;
+
+        lo0 = EC_vec_xl(0, tbl0);
+        hi0 = EC_vec_xl(16, tbl0);
+        lo1 = EC_vec_xl(0, tbl1);
+        hi1 = EC_vec_xl(16, tbl1);
+        lo2 = EC_vec_xl(0, tbl2);
+        hi2 = EC_vec_xl(16, tbl2);
+
+        for (pos = head; pos < len - 63; pos += 64) {
+                unsigned char *in = src + pos;
+                unsigned char *o0 = out0 + pos;
+                unsigned char *o1 = out1 + pos;
+                unsigned char *o2 = out2 + pos;
+
+                vS1 = vec_xl(0, in);
+                vS2 = vec_xl(16, in);
+                vS3 = vec_xl(32, in);
+                vS4 = vec_xl(48, in);
+
+                vA1 = vec_xl(0, o0);
+                vA2 = vec_xl(16, o0);
+                vA3 = vec_xl(32, o0);
+                vA4 = vec_xl(48, o0);
+
+                vA1 ^= EC_vec_permxor(hi0, lo0, vS1);
+                vA2 ^= EC_vec_permxor(hi0, lo0, vS2);
+                vA3 ^= EC_vec_permxor(hi0, lo0, vS3);
+                vA4 ^= EC_vec_permxor(hi0, lo0, vS4);
+
+                /* Each following destination is loaded before the previous
+                 * one is stored. */
+                vB1 = vec_xl(0, o1);
+                vB2 = vec_xl(16, o1);
+                vB3 = vec_xl(32, o1);
+                vB4 = vec_xl(48, o1);
+
+                vec_xst(vA1, 0, o0);
+                vec_xst(vA2, 16, o0);
+                vec_xst(vA3, 32, o0);
+                vec_xst(vA4, 48, o0);
+
+                vB1 ^= EC_vec_permxor(hi1, lo1, vS1);
+                vB2 ^= EC_vec_permxor(hi1, lo1, vS2);
+                vB3 ^= EC_vec_permxor(hi1, lo1, vS3);
+                vB4 ^= EC_vec_permxor(hi1, lo1, vS4);
+
+                vC1 = vec_xl(0, o2);
+                vC2 = vec_xl(16, o2);
+                vC3 = vec_xl(32, o2);
+                vC4 = vec_xl(48, o2);
+
+                vec_xst(vB1, 0, o1);
+                vec_xst(vB2, 16, o1);
+                vec_xst(vB3, 32, o1);
+                vec_xst(vB4, 48, o1);
+
+                vC1 ^= EC_vec_permxor(hi2, lo2, vS1);
+                vC2 ^= EC_vec_permxor(hi2, lo2, vS2);
+                vC3 ^= EC_vec_permxor(hi2, lo2, vS3);
+                vC4 ^= EC_vec_permxor(hi2, lo2, vS4);
+
+                vec_xst(vC1, 0, o2);
+                vec_xst(vC2, 16, o2);
+                vec_xst(vC3, 32, o2);
+                vec_xst(vC4, 48, o2);
+        }
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c
new file mode 100644
index 0000000000..e656544530
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c
@@ -0,0 +1,124 @@
+#include "ec_base_vsx.h"
+
+/*
+ * GF(2^8) dot product of vlen sources into four destinations (VSX).
+ *
+ * With fewer than 128 sources the result is built from gf_vect_mul_vsx() on
+ * source 0 followed by one gf_4vect_mad_vsx() pass per remaining source.
+ * Otherwise the leading len % 64 bytes go through gf_vect_dot_prod_base()
+ * and the rest is processed 64 bytes per iteration.
+ *
+ * len    - length of each buffer in bytes
+ * vlen   - number of source buffers
+ * gftbls - four consecutive tables of 32 * vlen bytes, one per destination
+ * src    - array of vlen source pointers
+ * dest   - array of four destination pointers
+ */
+void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+                           unsigned char **src, unsigned char **dest)
+{
+        unsigned char *s, *t0, *t1, *t2, *t3;
+        vector unsigned char vX1, vX2, vX3, vX4;
+        vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
+        vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
+        vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
+        /* Explicit zero: `v = v ^ v` on an uninitialized vector reads an
+         * indeterminate value. */
+        const vector unsigned char vzero = vec_splats((unsigned char)0);
+        int i, j, head;
+
+        if (vlen < 128) {
+                gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+                gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+                gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+                gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+
+                for (j = 1; j < vlen; j++) {
+                        gf_4vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+                }
+                return;
+        }
+
+        t0 = (unsigned char *)dest[0];
+        t1 = (unsigned char *)dest[1];
+        t2 = (unsigned char *)dest[2];
+        t3 = (unsigned char *)dest[3];
+
+        /* Bytes that do not fill a 64-byte block are handled by the base code. */
+        head = len % 64;
+        if (head != 0) {
+                gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+                gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+                gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+                gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+        }
+
+        for (i = head; i < len - 63; i += 64) {
+                /* Two accumulator pairs per destination (bytes 0-31 and 32-63). */
+                vY1 = vzero;
+                vY2 = vzero;
+                vY3 = vzero;
+                vY4 = vzero;
+                vY5 = vzero;
+                vY6 = vzero;
+                vY7 = vzero;
+                vY8 = vzero;
+
+                vYD = vzero;
+                vYE = vzero;
+                vYF = vzero;
+                vYG = vzero;
+                vYH = vzero;
+                vYI = vzero;
+                vYJ = vzero;
+                vYK = vzero;
+
+                unsigned char *g0 = &gftbls[0 * 32 * vlen];
+                unsigned char *g1 = &gftbls[1 * 32 * vlen];
+                unsigned char *g2 = &gftbls[2 * 32 * vlen];
+                unsigned char *g3 = &gftbls[3 * 32 * vlen];
+
+                for (j = 0; j < vlen; j++) {
+                        s = (unsigned char *)src[j];
+                        vX1 = vec_xl(0, s + i);
+                        vX2 = vec_xl(16, s + i);
+                        vX3 = vec_xl(32, s + i);
+                        vX4 = vec_xl(48, s + i);
+
+                        vlo0 = EC_vec_xl(0, g0);
+                        vhi0 = EC_vec_xl(16, g0);
+                        vlo1 = EC_vec_xl(0, g1);
+                        vhi1 = EC_vec_xl(16, g1);
+
+                        vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+                        vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+                        vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+                        vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+                        /* Same table loader as g0/g1, so all four tables
+                         * reach EC_vec_permxor in the same form. */
+                        vlo2 = EC_vec_xl(0, g2);
+                        vhi2 = EC_vec_xl(16, g2);
+                        vlo3 = EC_vec_xl(0, g3);
+                        vhi3 = EC_vec_xl(16, g3);
+
+                        vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+                        vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+                        vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+                        vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+                        vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+                        vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+                        vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+                        vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+                        vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+                        vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+                        vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+                        vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+                        /* Advance to the 32-byte table of the next source. */
+                        g0 += 32;
+                        g1 += 32;
+                        g2 += 32;
+                        g3 += 32;
+                }
+
+                vec_xst(vY1, 0, t0 + i);
+                vec_xst(vY2, 16, t0 + i);
+                vec_xst(vY3, 0, t1 + i);
+                vec_xst(vY4, 16, t1 + i);
+                vec_xst(vY5, 0, t2 + i);
+                vec_xst(vY6, 16, t2 + i);
+                vec_xst(vY7, 0, t3 + i);
+                vec_xst(vY8, 16, t3 + i);
+
+                vec_xst(vYD, 32, t0 + i);
+                vec_xst(vYE, 48, t0 + i);
+                vec_xst(vYF, 32, t1 + i);
+                vec_xst(vYG, 48, t1 + i);
+                vec_xst(vYH, 32, t2 + i);
+                vec_xst(vYI, 48, t2 + i);
+                vec_xst(vYJ, 32, t3 + i);
+                vec_xst(vYK, 48, t3 + i);
+        }
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c
new file mode 100644
index 0000000000..7b236b6f81
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c
@@ -0,0 +1,103 @@
+#include "ec_base_vsx.h"
+
+/*
+ * GF(2^8) multiply-accumulate of one source buffer into four destinations
+ * (VSX). The leading len % 64 bytes are passed to gf_vect_mad_base(); the
+ * remainder is processed in 64-byte blocks, XOR-ing the permxor result of
+ * the source with the current destination contents.
+ */
+void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+                      unsigned char *src, unsigned char **dest)
+{
+        unsigned char *out0 = (unsigned char *)dest[0];
+        unsigned char *out1 = (unsigned char *)dest[1];
+        unsigned char *out2 = (unsigned char *)dest[2];
+        unsigned char *out3 = (unsigned char *)dest[3];
+        unsigned char *tbl0, *tbl1, *tbl2, *tbl3;
+        vector unsigned char vS1, vS2, vS3, vS4;
+        vector unsigned char vA1, vA2, vA3, vA4;
+        vector unsigned char vB1, vB2, vB3, vB4;
+        vector unsigned char vC1, vC2, vC3, vC4;
+        vector unsigned char vD1, vD2, vD3, vD4;
+        vector unsigned char lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3;
+        int pos;
+        const int head = len % 64;
+
+        if (head != 0) {
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, out0);
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, out1);
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, out2);
+                gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, out3);
+        }
+
+        /* 32-byte table of source vec_i for each destination row. */
+        tbl0 = gftbls + 32 * vec_i;
+        tbl1 = tbl0 + 32 * vec;
+        tbl2 = tbl1 + 32 * vec;
+        tbl3 = tbl2 + 32 * vec;
+
+        lo0 = EC_vec_xl(0, tbl0);
+        hi0 = EC_vec_xl(16, tbl0);
+        lo1 = EC_vec_xl(0, tbl1);
+        hi1 = EC_vec_xl(16, tbl1);
+        lo2 = EC_vec_xl(0, tbl2);
+        hi2 = EC_vec_xl(16, tbl2);
+        lo3 = EC_vec_xl(0, tbl3);
+        hi3 = EC_vec_xl(16, tbl3);
+
+        for (pos = head; pos < len - 63; pos += 64) {
+                unsigned char *in = src + pos;
+                unsigned char *o0 = out0 + pos;
+                unsigned char *o1 = out1 + pos;
+                unsigned char *o2 = out2 + pos;
+                unsigned char *o3 = out3 + pos;
+
+                vS1 = vec_xl(0, in);
+                vS2 = vec_xl(16, in);
+                vS3 = vec_xl(32, in);
+                vS4 = vec_xl(48, in);
+
+                vA1 = vec_xl(0, o0);
+                vA2 = vec_xl(16, o0);
+                vA3 = vec_xl(32, o0);
+                vA4 = vec_xl(48, o0);
+
+                vA1 ^= EC_vec_permxor(hi0, lo0, vS1);
+                vA2 ^= EC_vec_permxor(hi0, lo0, vS2);
+                vA3 ^= EC_vec_permxor(hi0, lo0, vS3);
+                vA4 ^= EC_vec_permxor(hi0, lo0, vS4);
+
+                /* Each following destination is loaded before the previous
+                 * one is stored. */
+                vB1 = vec_xl(0, o1);
+                vB2 = vec_xl(16, o1);
+                vB3 = vec_xl(32, o1);
+                vB4 = vec_xl(48, o1);
+
+                vec_xst(vA1, 0, o0);
+                vec_xst(vA2, 16, o0);
+                vec_xst(vA3, 32, o0);
+                vec_xst(vA4, 48, o0);
+
+                vB1 ^= EC_vec_permxor(hi1, lo1, vS1);
+                vB2 ^= EC_vec_permxor(hi1, lo1, vS2);
+                vB3 ^= EC_vec_permxor(hi1, lo1, vS3);
+                vB4 ^= EC_vec_permxor(hi1, lo1, vS4);
+
+                vC1 = vec_xl(0, o2);
+                vC2 = vec_xl(16, o2);
+                vC3 = vec_xl(32, o2);
+                vC4 = vec_xl(48, o2);
+
+                vec_xst(vB1, 0, o1);
+                vec_xst(vB2, 16, o1);
+                vec_xst(vB3, 32, o1);
+                vec_xst(vB4, 48, o1);
+
+                vC1 ^= EC_vec_permxor(hi2, lo2, vS1);
+                vC2 ^= EC_vec_permxor(hi2, lo2, vS2);
+                vC3 ^= EC_vec_permxor(hi2, lo2, vS3);
+                vC4 ^= EC_vec_permxor(hi2, lo2, vS4);
+
+                vD1 = vec_xl(0, o3);
+                vD2 = vec_xl(16, o3);
+                vD3 = vec_xl(32, o3);
+                vD4 = vec_xl(48, o3);
+
+                vec_xst(vC1, 0, o2);
+                vec_xst(vC2, 16, o2);
+                vec_xst(vC3, 32, o2);
+                vec_xst(vC4, 48, o2);
+
+                vD1 ^= EC_vec_permxor(hi3, lo3, vS1);
+                vD2 ^= EC_vec_permxor(hi3, lo3, vS2);
+                vD3 ^= EC_vec_permxor(hi3, lo3, vS3);
+                vD4 ^= EC_vec_permxor(hi3, lo3, vS4);
+
+                vec_xst(vD1, 0, o3);
+                vec_xst(vD2, 16, o3);
+                vec_xst(vD3, 32, o3);
+                vec_xst(vD4, 48, o3);
+        }
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c
new file mode 100644
index 0000000000..e9eef0e638
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c
@@ -0,0 +1,145 @@
+#include "ec_base_vsx.h"
+
+/*
+ * Five-destination dot product using VSX vectors.
+ *
+ * len    - number of bytes produced in each destination buffer
+ * vlen   - number of source buffers in src[]
+ * gftbls - coefficient tables; the row for destination k starts at
+ *          gftbls + k * 32 * vlen and holds 32 bytes per source, read as a
+ *          low half (offset 0) and a high half (offset 16)
+ * src    - array of vlen source buffer pointers
+ * dest   - array of five destination buffer pointers
+ *
+ * When vlen < 128 the result is built from gf_vect_mul_vsx() on src[0]
+ * followed by gf_5vect_mad_vsx() for every remaining source.  Otherwise the
+ * leading len % 64 bytes are handed to gf_vect_dot_prod_base() and the rest
+ * is processed here in 64-byte chunks.
+ */
+void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+                           unsigned char **src, unsigned char **dest)
+{
+    unsigned char *s, *t0, *t1, *t2, *t3, *t4;
+    vector unsigned char vX1, vX2, vX3, vX4;
+    vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
+    vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
+    vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
+    /* Explicit zero; avoids reading uninitialized accumulators (x ^ x). */
+    const vector unsigned char vzero = vec_splat_u8(0);
+    int i, j, head;
+
+    if (vlen < 128) {
+        gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+        gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+        gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+        gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+        gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
+
+        for (j = 1; j < vlen; j++) {
+            gf_5vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+        }
+        return;
+    }
+
+    t0 = (unsigned char *)dest[0];
+    t1 = (unsigned char *)dest[1];
+    t2 = (unsigned char *)dest[2];
+    t3 = (unsigned char *)dest[3];
+    t4 = (unsigned char *)dest[4];
+
+    /* The vector loop starts at offset head; the base routine gets the rest. */
+    head = len % 64;
+    if (head != 0) {
+        gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
+    }
+
+    for (i = head; i < len - 63; i += 64) {
+        /* Accumulators for bytes 0..31 of the chunk, two per destination. */
+        vY1 = vzero;
+        vY2 = vzero;
+        vY3 = vzero;
+        vY4 = vzero;
+        vY5 = vzero;
+        vY6 = vzero;
+        vY7 = vzero;
+        vY8 = vzero;
+        vY9 = vzero;
+        vYA = vzero;
+
+        /* Accumulators for bytes 32..63 of the chunk, two per destination. */
+        vYD = vzero;
+        vYE = vzero;
+        vYF = vzero;
+        vYG = vzero;
+        vYH = vzero;
+        vYI = vzero;
+        vYJ = vzero;
+        vYK = vzero;
+        vYL = vzero;
+        vYM = vzero;
+
+        unsigned char *g0 = &gftbls[0 * 32 * vlen];
+        unsigned char *g1 = &gftbls[1 * 32 * vlen];
+        unsigned char *g2 = &gftbls[2 * 32 * vlen];
+        unsigned char *g3 = &gftbls[3 * 32 * vlen];
+        unsigned char *g4 = &gftbls[4 * 32 * vlen];
+
+        for (j = 0; j < vlen; j++) {
+            s = (unsigned char *)src[j];
+            vX1 = vec_xl(0, s + i);
+            vX2 = vec_xl(16, s + i);
+            vX3 = vec_xl(32, s + i);
+            vX4 = vec_xl(48, s + i);
+
+            vlo0 = EC_vec_xl(0, g0);
+            vhi0 = EC_vec_xl(16, g0);
+            vlo1 = EC_vec_xl(0, g1);
+            vhi1 = EC_vec_xl(16, g1);
+
+            vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+            vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+            vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+            vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+            /*
+             * Every table half goes through EC_vec_xl, matching g0/g1 above
+             * and the other VSX kernels, so all operands handed to
+             * EC_vec_permxor are loaded the same way.
+             */
+            vlo2 = EC_vec_xl(0, g2);
+            vhi2 = EC_vec_xl(16, g2);
+            vlo3 = EC_vec_xl(0, g3);
+            vhi3 = EC_vec_xl(16, g3);
+
+            vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+            vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+            vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+            vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+            vlo4 = EC_vec_xl(0, g4);
+            vhi4 = EC_vec_xl(16, g4);
+
+            vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+            vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+            vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+            vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+            vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+            vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+            vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+            vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+            vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+            vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+            vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+            vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+            /* Advance every row to the 32-byte table of the next source. */
+            g0 += 32;
+            g1 += 32;
+            g2 += 32;
+            g3 += 32;
+            g4 += 32;
+        }
+
+        vec_xst(vY1, 0, t0 + i);
+        vec_xst(vY2, 16, t0 + i);
+        vec_xst(vY3, 0, t1 + i);
+        vec_xst(vY4, 16, t1 + i);
+        vec_xst(vY5, 0, t2 + i);
+        vec_xst(vY6, 16, t2 + i);
+        vec_xst(vY7, 0, t3 + i);
+        vec_xst(vY8, 16, t3 + i);
+        vec_xst(vY9, 0, t4 + i);
+        vec_xst(vYA, 16, t4 + i);
+
+        vec_xst(vYD, 32, t0 + i);
+        vec_xst(vYE, 48, t0 + i);
+        vec_xst(vYF, 32, t1 + i);
+        vec_xst(vYG, 48, t1 + i);
+        vec_xst(vYH, 32, t2 + i);
+        vec_xst(vYI, 48, t2 + i);
+        vec_xst(vYJ, 32, t3 + i);
+        vec_xst(vYK, 48, t3 + i);
+        vec_xst(vYL, 32, t4 + i);
+        vec_xst(vYM, 48, t4 + i);
+    }
+    return;
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c
new file mode 100644
index 0000000000..7bb7bb2115
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c
@@ -0,0 +1,122 @@
+#include "ec_base_vsx.h"
+
+/*
+ * Five-destination multiply-and-add using VSX vectors: for each of the five
+ * buffers in dest[], XORs into it the EC_vec_permxor() result of one source
+ * buffer with that destination's table entry.
+ *
+ * len    - number of bytes processed in src and in each dest buffer
+ * vec    - row stride of gftbls in 32-byte entries (row k is at k * 32 * vec)
+ * vec_i  - index of the 32-byte entry used within each row
+ * gftbls - coefficient tables, read as a low half (offset 0) and a high
+ *          half (offset 16) per entry
+ * src    - single source buffer
+ * dest   - array of five destination buffer pointers, updated in place
+ *
+ * NOTE(review): loads and stores for different destinations are interleaved
+ * by hand; keep the statement order unless the buffers are known not to
+ * alias.
+ */
+void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2, *t3, *t4;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+ t3 = (unsigned char *)dest[3];
+ t4 = (unsigned char *)dest[4];
+
+ /*
+  * The vector loop below starts at offset head, so the leading len % 64
+  * bytes are handed to gf_vect_mad_base() (defined elsewhere; presumably
+  * the scalar reference routine).
+  */
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
+ }
+
+ /* Table halves for destination k live at gftbls + 32 * (k * vec + vec_i). */
+ vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+ vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+
+ /* Main loop: one 64-byte chunk (four 16-byte vectors) per iteration. */
+ for (i = head; i < len - 63; i += 64) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ /* dest[0]: load current contents, then fold in the product. */
+ vY1 = vec_xl(0, t0 + i);
+ vY2 = vec_xl(16, t0 + i);
+ vYD = vec_xl(32, t0 + i);
+ vYE = vec_xl(48, t0 + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ /* From here on, the next destination is loaded before the previous one is stored. */
+ vY3 = vec_xl(0, t1 + i);
+ vY4 = vec_xl(16, t1 + i);
+ vYF = vec_xl(32, t1 + i);
+ vYG = vec_xl(48, t1 + i);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vY5 = vec_xl(0, t2 + i);
+ vY6 = vec_xl(16, t2 + i);
+ vYH = vec_xl(32, t2 + i);
+ vYI = vec_xl(48, t2 + i);
+
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vY7 = vec_xl(0, t3 + i);
+ vY8 = vec_xl(16, t3 + i);
+ vYJ = vec_xl(32, t3 + i);
+ vYK = vec_xl(48, t3 + i);
+
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+
+ vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+ vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+ vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+ vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+ vY9 = vec_xl(0, t4 + i);
+ vYA = vec_xl(16, t4 + i);
+ vYL = vec_xl(32, t4 + i);
+ vYM = vec_xl(48, t4 + i);
+
+ vec_xst(vY7, 0, t3 + i);
+ vec_xst(vY8, 16, t3 + i);
+ vec_xst(vYJ, 32, t3 + i);
+ vec_xst(vYK, 48, t3 + i);
+
+ vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+ vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+ vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+ vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+ vec_xst(vY9, 0, t4 + i);
+ vec_xst(vYA, 16, t4 + i);
+ vec_xst(vYL, 32, t4 + i);
+ vec_xst(vYM, 48, t4 + i);
+ }
+ return;
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c
new file mode 100644
index 0000000000..ac918bd493
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c
@@ -0,0 +1,166 @@
+#include "ec_base_vsx.h"
+
+/*
+ * Six-destination dot product using VSX vectors.
+ *
+ * len    - number of bytes produced in each destination buffer
+ * vlen   - number of source buffers in src[]
+ * gftbls - coefficient tables; the row for destination k starts at
+ *          gftbls + k * 32 * vlen and holds 32 bytes per source, read as a
+ *          low half (offset 0) and a high half (offset 16)
+ * src    - array of vlen source buffer pointers
+ * dest   - array of six destination buffer pointers
+ *
+ * When vlen < 128 the result is built from gf_vect_mul_vsx() on src[0]
+ * followed by gf_6vect_mad_vsx() for every remaining source.  Otherwise the
+ * leading len % 64 bytes are handed to gf_vect_dot_prod_base() and the rest
+ * is processed here in 64-byte chunks.
+ */
+void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+                           unsigned char **src, unsigned char **dest)
+{
+    unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
+    vector unsigned char vX1, vX2, vX3, vX4;
+    vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
+    vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
+    vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+    vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
+    /* Explicit zero; avoids reading uninitialized accumulators (x ^ x). */
+    const vector unsigned char vzero = vec_splat_u8(0);
+    int i, j, head;
+
+    if (vlen < 128) {
+        gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+        gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+        gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+        gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+        gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
+        gf_vect_mul_vsx(len, &gftbls[5 * 32 * vlen], src[0], (unsigned char *)dest[5]);
+
+        for (j = 1; j < vlen; j++) {
+            gf_6vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+        }
+        return;
+    }
+
+    t0 = (unsigned char *)dest[0];
+    t1 = (unsigned char *)dest[1];
+    t2 = (unsigned char *)dest[2];
+    t3 = (unsigned char *)dest[3];
+    t4 = (unsigned char *)dest[4];
+    t5 = (unsigned char *)dest[5];
+
+    /* The vector loop starts at offset head; the base routine gets the rest. */
+    head = len % 64;
+    if (head != 0) {
+        gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
+        gf_vect_dot_prod_base(head, vlen, &gftbls[5 * 32 * vlen], src, t5);
+    }
+
+    for (i = head; i < len - 63; i += 64) {
+        /* Accumulators for bytes 0..31 of the chunk, two per destination. */
+        vY1 = vzero;
+        vY2 = vzero;
+        vY3 = vzero;
+        vY4 = vzero;
+        vY5 = vzero;
+        vY6 = vzero;
+        vY7 = vzero;
+        vY8 = vzero;
+        vY9 = vzero;
+        vYA = vzero;
+        vYB = vzero;
+        vYC = vzero;
+
+        /* Accumulators for bytes 32..63 of the chunk, two per destination. */
+        vYD = vzero;
+        vYE = vzero;
+        vYF = vzero;
+        vYG = vzero;
+        vYH = vzero;
+        vYI = vzero;
+        vYJ = vzero;
+        vYK = vzero;
+        vYL = vzero;
+        vYM = vzero;
+        vYN = vzero;
+        vYO = vzero;
+
+        unsigned char *g0 = &gftbls[0 * 32 * vlen];
+        unsigned char *g1 = &gftbls[1 * 32 * vlen];
+        unsigned char *g2 = &gftbls[2 * 32 * vlen];
+        unsigned char *g3 = &gftbls[3 * 32 * vlen];
+        unsigned char *g4 = &gftbls[4 * 32 * vlen];
+        unsigned char *g5 = &gftbls[5 * 32 * vlen];
+
+        for (j = 0; j < vlen; j++) {
+            s = (unsigned char *)src[j];
+            vX1 = vec_xl(0, s + i);
+            vX2 = vec_xl(16, s + i);
+            vX3 = vec_xl(32, s + i);
+            vX4 = vec_xl(48, s + i);
+
+            vlo0 = EC_vec_xl(0, g0);
+            vhi0 = EC_vec_xl(16, g0);
+            vlo1 = EC_vec_xl(0, g1);
+            vhi1 = EC_vec_xl(16, g1);
+
+            vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+            vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+            vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+            vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+            vlo2 = EC_vec_xl(0, g2);
+            vhi2 = EC_vec_xl(16, g2);
+            vlo3 = EC_vec_xl(0, g3);
+            vhi3 = EC_vec_xl(16, g3);
+
+            vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+            vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+            vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+            vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+            vlo4 = EC_vec_xl(0, g4);
+            vhi4 = EC_vec_xl(16, g4);
+            vlo5 = EC_vec_xl(0, g5);
+            vhi5 = EC_vec_xl(16, g5);
+
+            vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+            vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+            vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+            vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+            vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+            vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+            vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+            vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+            vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+            vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+            vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+            vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+            vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
+            vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
+            vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
+            vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
+
+            /* Advance every row to the 32-byte table of the next source. */
+            g0 += 32;
+            g1 += 32;
+            g2 += 32;
+            g3 += 32;
+            g4 += 32;
+            g5 += 32;
+        }
+
+        vec_xst(vY1, 0, t0 + i);
+        vec_xst(vY2, 16, t0 + i);
+        vec_xst(vY3, 0, t1 + i);
+        vec_xst(vY4, 16, t1 + i);
+        vec_xst(vY5, 0, t2 + i);
+        vec_xst(vY6, 16, t2 + i);
+        vec_xst(vY7, 0, t3 + i);
+        vec_xst(vY8, 16, t3 + i);
+        vec_xst(vY9, 0, t4 + i);
+        vec_xst(vYA, 16, t4 + i);
+        vec_xst(vYB, 0, t5 + i);
+        vec_xst(vYC, 16, t5 + i);
+
+        vec_xst(vYD, 32, t0 + i);
+        vec_xst(vYE, 48, t0 + i);
+        vec_xst(vYF, 32, t1 + i);
+        vec_xst(vYG, 48, t1 + i);
+        vec_xst(vYH, 32, t2 + i);
+        vec_xst(vYI, 48, t2 + i);
+        vec_xst(vYJ, 32, t3 + i);
+        vec_xst(vYK, 48, t3 + i);
+        vec_xst(vYL, 32, t4 + i);
+        vec_xst(vYM, 48, t4 + i);
+        vec_xst(vYN, 32, t5 + i);
+        vec_xst(vYO, 48, t5 + i);
+    }
+    return;
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c
new file mode 100644
index 0000000000..43ea6c6966
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c
@@ -0,0 +1,142 @@
+#include "ec_base_vsx.h"
+
+/*
+ * Six-destination multiply-and-add using VSX vectors: for each of the six
+ * buffers in dest[], XORs into it the EC_vec_permxor() result of one source
+ * buffer with that destination's table entry.
+ *
+ * len    - number of bytes processed in src and in each dest buffer
+ * vec    - row stride of gftbls in 32-byte entries (row k is at k * 32 * vec)
+ * vec_i  - index of the 32-byte entry used within each row
+ * gftbls - coefficient tables, read as a low half (offset 0) and a high
+ *          half (offset 16) per entry
+ * src    - single source buffer
+ * dest   - array of six destination buffer pointers, updated in place
+ *
+ * NOTE(review): loads and stores for different destinations are partly
+ * interleaved by hand; keep the statement order unless the buffers are known
+ * not to alias.
+ */
+void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+ vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+ t3 = (unsigned char *)dest[3];
+ t4 = (unsigned char *)dest[4];
+ t5 = (unsigned char *)dest[5];
+
+ /*
+  * The vector loop below starts at offset head, so the leading len % 64
+  * bytes are handed to gf_vect_mad_base() (defined elsewhere; presumably
+  * the scalar reference routine).
+  */
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[5 * 32 * vec], src, t5);
+ }
+
+ /* Table halves for destination k live at gftbls + 32 * (k * vec + vec_i). */
+ vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+ vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+ vlo5 = EC_vec_xl(0, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
+ vhi5 = EC_vec_xl(16, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
+
+ /* Main loop: one 64-byte chunk (four 16-byte vectors) per iteration. */
+ for (i = head; i < len - 63; i += 64) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ /* dest[0] and dest[1]: load, fold in the product, store, one after the other. */
+ vY1 = vec_xl(0, t0 + i);
+ vY2 = vec_xl(16, t0 + i);
+ vYD = vec_xl(32, t0 + i);
+ vYE = vec_xl(48, t0 + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+
+ vY3 = vec_xl(0, t1 + i);
+ vY4 = vec_xl(16, t1 + i);
+ vYF = vec_xl(32, t1 + i);
+ vYG = vec_xl(48, t1 + i);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+
+ /* dest[2]..dest[5]: the next destination is loaded before the previous one is stored. */
+ vY5 = vec_xl(0, t2 + i);
+ vY6 = vec_xl(16, t2 + i);
+ vYH = vec_xl(32, t2 + i);
+ vYI = vec_xl(48, t2 + i);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vY7 = vec_xl(0, t3 + i);
+ vY8 = vec_xl(16, t3 + i);
+ vYJ = vec_xl(32, t3 + i);
+ vYK = vec_xl(48, t3 + i);
+
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+
+ vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+ vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+ vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+ vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+ vY9 = vec_xl(0, t4 + i);
+ vYA = vec_xl(16, t4 + i);
+ vYL = vec_xl(32, t4 + i);
+ vYM = vec_xl(48, t4 + i);
+
+ vec_xst(vY7, 0, t3 + i);
+ vec_xst(vY8, 16, t3 + i);
+ vec_xst(vYJ, 32, t3 + i);
+ vec_xst(vYK, 48, t3 + i);
+
+ vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+ vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+ vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+ vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+ vYB = vec_xl(0, t5 + i);
+ vYC = vec_xl(16, t5 + i);
+ vYN = vec_xl(32, t5 + i);
+ vYO = vec_xl(48, t5 + i);
+
+ vec_xst(vY9, 0, t4 + i);
+ vec_xst(vYA, 16, t4 + i);
+ vec_xst(vYL, 32, t4 + i);
+ vec_xst(vYM, 48, t4 + i);
+
+ vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
+ vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
+ vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
+ vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
+
+ vec_xst(vYB, 0, t5 + i);
+ vec_xst(vYC, 16, t5 + i);
+ vec_xst(vYN, 32, t5 + i);
+ vec_xst(vYO, 48, t5 + i);
+ }
+ return;
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c
new file mode 100644
index 0000000000..2f97e3421f
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c
@@ -0,0 +1,85 @@
+#include "ec_base_vsx.h"
+
+/*
+ * Single-destination dot product using VSX vectors.
+ *
+ * len    - number of bytes produced in dest
+ * vlen   - number of source buffers in src[]
+ * gftbls - coefficient table holding 32 bytes per source, read as a low
+ *          half (offset 0) and a high half (offset 16)
+ * src    - array of vlen source buffer pointers
+ * dest   - destination buffer
+ *
+ * When vlen < 128 the result is built from gf_vect_mul_vsx() on src[0]
+ * followed by gf_vect_mad_vsx() for every remaining source.  Otherwise the
+ * leading len % 128 bytes are handed to gf_vect_dot_prod_base() and the rest
+ * is processed here in 128-byte chunks.
+ */
+void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+                          unsigned char **src, unsigned char *dest)
+{
+    unsigned char *s, *t0;
+    vector unsigned char vX1, vY1;
+    vector unsigned char vX2, vY2;
+    vector unsigned char vX3, vY3;
+    vector unsigned char vX4, vY4;
+    vector unsigned char vX5, vY5;
+    vector unsigned char vX6, vY6;
+    vector unsigned char vX7, vY7;
+    vector unsigned char vX8, vY8;
+    vector unsigned char vhi0, vlo0;
+    /* Explicit zero; avoids reading uninitialized accumulators (x ^ x). */
+    const vector unsigned char vzero = vec_splat_u8(0);
+    int i, j, head;
+
+    if (vlen < 128) {
+        gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest);
+
+        for (j = 1; j < vlen; j++) {
+            gf_vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+        }
+        return;
+    }
+
+    t0 = (unsigned char *)dest;
+
+    /* The vector loop starts at offset head; the base routine gets the rest. */
+    head = len % 128;
+    if (head != 0) {
+        gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+    }
+
+    for (i = head; i < len - 127; i += 128) {
+        /* Eight 16-byte accumulators cover the 128-byte chunk. */
+        vY1 = vzero;
+        vY2 = vzero;
+        vY3 = vzero;
+        vY4 = vzero;
+
+        vY5 = vzero;
+        vY6 = vzero;
+        vY7 = vzero;
+        vY8 = vzero;
+
+        unsigned char *g0 = &gftbls[0 * 32 * vlen];
+
+        for (j = 0; j < vlen; j++) {
+            s = (unsigned char *)src[j];
+            vX1 = vec_xl(0, s + i);
+            vX2 = vec_xl(16, s + i);
+            vX3 = vec_xl(32, s + i);
+            vX4 = vec_xl(48, s + i);
+
+            vlo0 = EC_vec_xl(0, g0);
+            vhi0 = EC_vec_xl(16, g0);
+
+            vX5 = vec_xl(64, s + i);
+            vX6 = vec_xl(80, s + i);
+            vX7 = vec_xl(96, s + i);
+            vX8 = vec_xl(112, s + i);
+
+            vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+            vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+            vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3);
+            vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+            vY5 = vY5 ^ EC_vec_permxor(vhi0, vlo0, vX5);
+            vY6 = vY6 ^ EC_vec_permxor(vhi0, vlo0, vX6);
+            vY7 = vY7 ^ EC_vec_permxor(vhi0, vlo0, vX7);
+            vY8 = vY8 ^ EC_vec_permxor(vhi0, vlo0, vX8);
+
+            /* Advance to the 32-byte table of the next source. */
+            g0 += 32;
+        }
+        vec_xst(vY1, 0, t0 + i);
+        vec_xst(vY2, 16, t0 + i);
+        vec_xst(vY3, 32, t0 + i);
+        vec_xst(vY4, 48, t0 + i);
+
+        vec_xst(vY5, 64, t0 + i);
+        vec_xst(vY6, 80, t0 + i);
+        vec_xst(vY7, 96, t0 + i);
+        vec_xst(vY8, 112, t0 + i);
+    }
+    return;
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c
new file mode 100644
index 0000000000..a4810b96db
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c
@@ -0,0 +1,48 @@
+#include "ec_base_vsx.h"
+
+/*
+ * Single-destination multiply-and-add using VSX vectors: XORs into dest the
+ * EC_vec_permxor() result of src with the table entry selected by vec_i.
+ *
+ * len    - number of bytes processed in src and dest
+ * vec    - forwarded to gf_vect_mad_base(); the vector path's table offset
+ *          depends on vec_i only
+ * vec_i  - index of the 32-byte table entry inside gftbls
+ * gftbls - coefficient table, read as a low half (offset 0) and a high half
+ *          (offset 16)
+ * src    - source buffer
+ * dest   - destination buffer, updated in place
+ */
+void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+                     unsigned char *src, unsigned char *dest)
+{
+    const int lead = len % 64;    /* bytes left to the base routine */
+    unsigned char *entry = gftbls + (vec_i << 5);
+    vector unsigned char tbl_lo, tbl_hi;
+    vector unsigned char in0, in1, in2, in3;
+    vector unsigned char acc0, acc1, acc2, acc3;
+    int pos;
+
+    if (lead)
+        gf_vect_mad_base(lead, vec, vec_i, gftbls, src, dest);
+
+    tbl_lo = EC_vec_xl(0, entry);
+    tbl_hi = EC_vec_xl(16, entry);
+
+    /* One 64-byte chunk (four 16-byte vectors) per iteration. */
+    for (pos = lead; pos < len - 63; pos += 64) {
+        unsigned char *in = src + pos;
+        unsigned char *out = dest + pos;
+
+        in0 = vec_xl(0, in);
+        in1 = vec_xl(16, in);
+        in2 = vec_xl(32, in);
+        in3 = vec_xl(48, in);
+
+        acc0 = vec_xl(0, out);
+        acc1 = vec_xl(16, out);
+        acc2 = vec_xl(32, out);
+        acc3 = vec_xl(48, out);
+
+        acc0 = acc0 ^ EC_vec_permxor(tbl_hi, tbl_lo, in0);
+        acc1 = acc1 ^ EC_vec_permxor(tbl_hi, tbl_lo, in1);
+        acc2 = acc2 ^ EC_vec_permxor(tbl_hi, tbl_lo, in2);
+        acc3 = acc3 ^ EC_vec_permxor(tbl_hi, tbl_lo, in3);
+
+        vec_xst(acc0, 0, out);
+        vec_xst(acc1, 16, out);
+        vec_xst(acc2, 32, out);
+        vec_xst(acc3, 48, out);
+    }
+}
diff --git a/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c
new file mode 100644
index 0000000000..812eb83d82
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c
@@ -0,0 +1,75 @@
+#include "ec_base_vsx.h"
+
+/*
+ * Same as gf_vect_mul_base in "ec_base.h" but without the size restriction.
+ *
+ * Writes gf_mul_erasure(a[1], src[k]) to dest[k] for k = 0 .. len - 1; does
+ * nothing when len <= 0.  The function is void, so it ends without a return
+ * value (the previous "return 0;" was a constraint violation in C).
+ */
+static void _gf_vect_mul_base(int len, unsigned char *a, unsigned char *src,
+                              unsigned char *dest)
+{
+    //2nd element of table array is ref value used to fill it in
+    unsigned char c = a[1];
+
+    while (len-- > 0)
+        *dest++ = gf_mul_erasure(c, *src++);
+}
+
+/*
+ * Table-driven vector multiply using VSX vectors: dest receives the
+ * EC_vec_permxor() result of src with the 32-byte table gftbl.
+ *
+ * len   - number of bytes processed in src and dest
+ * gftbl - coefficient table, read as a low half (offset 0) and a high half
+ *         (offset 16)
+ * src   - source buffer
+ * dest  - destination buffer (overwritten)
+ *
+ * The leading len % 128 bytes are handed to _gf_vect_mul_base(); the rest is
+ * processed in 128-byte chunks.
+ */
+void gf_vect_mul_vsx(int len, unsigned char *gftbl, unsigned char *src, unsigned char *dest)
+{
+    const int lead = len % 128;    /* bytes left to the base routine */
+    vector unsigned char tbl_lo, tbl_hi;
+    vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+    vector unsigned char out0, out1, out2, out3, out4, out5, out6, out7;
+    int pos;
+
+    if (lead)
+        _gf_vect_mul_base(lead, gftbl, src, dest);
+
+    tbl_lo = EC_vec_xl(0, gftbl);
+    tbl_hi = EC_vec_xl(16, gftbl);
+
+    /* One 128-byte chunk (eight 16-byte vectors) per iteration. */
+    for (pos = lead; pos < len - 127; pos += 128) {
+        unsigned char *in = src + pos;
+        unsigned char *out = dest + pos;
+
+        in0 = vec_xl(0, in);
+        in1 = vec_xl(16, in);
+        in2 = vec_xl(32, in);
+        in3 = vec_xl(48, in);
+
+        in4 = vec_xl(64, in);
+        in5 = vec_xl(80, in);
+        in6 = vec_xl(96, in);
+        in7 = vec_xl(112, in);
+
+        out0 = EC_vec_permxor(tbl_hi, tbl_lo, in0);
+        out1 = EC_vec_permxor(tbl_hi, tbl_lo, in1);
+        out2 = EC_vec_permxor(tbl_hi, tbl_lo, in2);
+        out3 = EC_vec_permxor(tbl_hi, tbl_lo, in3);
+
+        out4 = EC_vec_permxor(tbl_hi, tbl_lo, in4);
+        out5 = EC_vec_permxor(tbl_hi, tbl_lo, in5);
+        out6 = EC_vec_permxor(tbl_hi, tbl_lo, in6);
+        out7 = EC_vec_permxor(tbl_hi, tbl_lo, in7);
+
+        vec_xst(out0, 0, out);
+        vec_xst(out1, 16, out);
+        vec_xst(out2, 32, out);
+        vec_xst(out3, 48, out);
+
+        vec_xst(out4, 64, out);
+        vec_xst(out5, 80, out);
+        vec_xst(out6, 96, out);
+        vec_xst(out7, 112, out);
+    }
+}
diff --git a/contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make
index 28acf0f7da..ad59a89cd1 100644
--- a/contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make
index 1efbc5231f..4daab8f3ef 100644
--- a/contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/erasure_code_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make
index 2b9e11fcaa..1fa89c9034 100644
--- a/contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/ya.make
deleted file mode 100644
index 2a0c6baf2c..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test/ya.make
+++ /dev/null
@@ -1,29 +0,0 @@
-PROGRAM()
-
-VERSION(2.28)
-
-LICENSE(BSD-3-Clause)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-NO_UTIL()
-
-SUBSCRIBER(
- akozhikhov
- g:base
- g:yt
-)
-
-ADDINCL(contrib/libs/isa-l/include)
-
-NO_COMPILER_WARNINGS()
-
-SRCS(
- ../../gf_2vect_dot_prod_sse_test.c
-)
-
-PEERDIR(
- contrib/libs/isa-l/erasure_code
-)
-
-END()
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
deleted file mode 100644
index 8f218b47cb..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
+++ /dev/null
@@ -1,164 +0,0 @@
-====================BSD-3-Clause====================
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-====================BSD-3-Clause====================
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Arm Corporation nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Intel Corporation nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions
-; are met:
-; * Redistributions of source code must retain the above copyright
-; notice, this list of conditions and the following disclaimer.
-; * Redistributions in binary form must reproduce the above copyright
-; notice, this list of conditions and the following disclaimer in
-; the documentation and/or other materials provided with the
-; distribution.
-; * Neither the name of Intel Corporation nor the names of its
-; contributors may be used to endorse or promote products derived
-; from this software without specific prior written permission.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-ISA-L is licensed using a BSD 3-clause [license]. All code submitted to
-
-
-====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause====================
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Arm Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================COPYRIGHT====================
- Copyright(c) 2011-2013 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
- Copyright(c) 2011-2017 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2018 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2019 Arm Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/ya.make
deleted file mode 100644
index c4b11b139e..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test/ya.make
+++ /dev/null
@@ -1,29 +0,0 @@
-PROGRAM()
-
-VERSION(2.28)
-
-LICENSE(BSD-3-Clause)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-NO_UTIL()
-
-SUBSCRIBER(
- akozhikhov
- g:base
- g:yt
-)
-
-ADDINCL(contrib/libs/isa-l/include)
-
-NO_COMPILER_WARNINGS()
-
-SRCS(
- ../../gf_3vect_dot_prod_sse_test.c
-)
-
-PEERDIR(
- contrib/libs/isa-l/erasure_code
-)
-
-END()
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
deleted file mode 100644
index 8f218b47cb..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
+++ /dev/null
@@ -1,164 +0,0 @@
-====================BSD-3-Clause====================
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-====================BSD-3-Clause====================
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Arm Corporation nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Intel Corporation nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions
-; are met:
-; * Redistributions of source code must retain the above copyright
-; notice, this list of conditions and the following disclaimer.
-; * Redistributions in binary form must reproduce the above copyright
-; notice, this list of conditions and the following disclaimer in
-; the documentation and/or other materials provided with the
-; distribution.
-; * Neither the name of Intel Corporation nor the names of its
-; contributors may be used to endorse or promote products derived
-; from this software without specific prior written permission.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-ISA-L is licensed using a BSD 3-clause [license]. All code submitted to
-
-
-====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause====================
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Arm Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================COPYRIGHT====================
- Copyright(c) 2011-2013 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
- Copyright(c) 2011-2017 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2018 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2019 Arm Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/ya.make
deleted file mode 100644
index 758e7463a1..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test/ya.make
+++ /dev/null
@@ -1,29 +0,0 @@
-PROGRAM()
-
-VERSION(2.28)
-
-LICENSE(BSD-3-Clause)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-NO_UTIL()
-
-SUBSCRIBER(
- akozhikhov
- g:base
- g:yt
-)
-
-ADDINCL(contrib/libs/isa-l/include)
-
-NO_COMPILER_WARNINGS()
-
-SRCS(
- ../../gf_4vect_dot_prod_sse_test.c
-)
-
-PEERDIR(
- contrib/libs/isa-l/erasure_code
-)
-
-END()
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
deleted file mode 100644
index 8f218b47cb..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
+++ /dev/null
@@ -1,164 +0,0 @@
-====================BSD-3-Clause====================
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-====================BSD-3-Clause====================
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Arm Corporation nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Intel Corporation nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions
-; are met:
-; * Redistributions of source code must retain the above copyright
-; notice, this list of conditions and the following disclaimer.
-; * Redistributions in binary form must reproduce the above copyright
-; notice, this list of conditions and the following disclaimer in
-; the documentation and/or other materials provided with the
-; distribution.
-; * Neither the name of Intel Corporation nor the names of its
-; contributors may be used to endorse or promote products derived
-; from this software without specific prior written permission.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-ISA-L is licensed using a BSD 3-clause [license]. All code submitted to
-
-
-====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause====================
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Arm Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================COPYRIGHT====================
- Copyright(c) 2011-2013 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
- Copyright(c) 2011-2017 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2018 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2019 Arm Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/ya.make
deleted file mode 100644
index 4c389b8188..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test/ya.make
+++ /dev/null
@@ -1,29 +0,0 @@
-PROGRAM()
-
-VERSION(2.28)
-
-LICENSE(BSD-3-Clause)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-NO_UTIL()
-
-SUBSCRIBER(
- akozhikhov
- g:base
- g:yt
-)
-
-ADDINCL(contrib/libs/isa-l/include)
-
-NO_COMPILER_WARNINGS()
-
-SRCS(
- ../../gf_5vect_dot_prod_sse_test.c
-)
-
-PEERDIR(
- contrib/libs/isa-l/erasure_code
-)
-
-END()
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
deleted file mode 100644
index 8f218b47cb..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/.yandex_meta/licenses.list.txt
+++ /dev/null
@@ -1,164 +0,0 @@
-====================BSD-3-Clause====================
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Intel Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-====================BSD-3-Clause====================
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Arm Corporation nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Intel Corporation nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions
-; are met:
-; * Redistributions of source code must retain the above copyright
-; notice, this list of conditions and the following disclaimer.
-; * Redistributions in binary form must reproduce the above copyright
-; notice, this list of conditions and the following disclaimer in
-; the documentation and/or other materials provided with the
-; distribution.
-; * Neither the name of Intel Corporation nor the names of its
-; contributors may be used to endorse or promote products derived
-; from this software without specific prior written permission.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================BSD-3-Clause====================
-ISA-L is licensed using a BSD 3-clause [license]. All code submitted to
-
-
-====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause====================
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Arm Corporation nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-====================COPYRIGHT====================
- Copyright(c) 2011-2013 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
- Copyright(c) 2011-2017 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2011-2018 Intel Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-# Copyright(c) 2019 Arm Corporation All rights reserved.
-
-
-====================COPYRIGHT====================
-; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/ya.make
deleted file mode 100644
index 09f782f76a..0000000000
--- a/contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test/ya.make
+++ /dev/null
@@ -1,29 +0,0 @@
-PROGRAM()
-
-VERSION(2.28)
-
-LICENSE(BSD-3-Clause)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-NO_UTIL()
-
-SUBSCRIBER(
- akozhikhov
- g:base
- g:yt
-)
-
-ADDINCL(contrib/libs/isa-l/include)
-
-NO_COMPILER_WARNINGS()
-
-SRCS(
- ../../gf_6vect_dot_prod_sse_test.c
-)
-
-PEERDIR(
- contrib/libs/isa-l/erasure_code
-)
-
-END()
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make
index dc59e7e3d3..fc897ca8a2 100644
--- a/contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/gf_inverse_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make
index 89d263fa19..e396b42fdf 100644
--- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make
index 4767c00d3a..20897781ea 100644
--- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make
index 2d776e53f4..0bd839954d 100644
--- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mad_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make
index 2725cbf67e..37f0606713 100644
--- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_base_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make
index 614ceddd89..c22dd39c6d 100644
--- a/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/gf_vect_mul_test/ya.make
@@ -1,6 +1,6 @@
PROGRAM()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
diff --git a/contrib/libs/isa-l/erasure_code/ut/ya.make b/contrib/libs/isa-l/erasure_code/ut/ya.make
index 637eac1966..7bc4eff15e 100644
--- a/contrib/libs/isa-l/erasure_code/ut/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ut/ya.make
@@ -5,7 +5,7 @@ SUBSCRIBER(
EXECTEST()
-VERSION(2.28)
+VERSION(2.31)
LICENSE(BSD-3-Clause)
@@ -19,16 +19,6 @@ RUN(erasure_code_base_test)
RUN(erasure_code_update_test)
-RUN(gf_2vect_dot_prod_sse_test)
-
-RUN(gf_3vect_dot_prod_sse_test)
-
-RUN(gf_4vect_dot_prod_sse_test)
-
-RUN(gf_5vect_dot_prod_sse_test)
-
-RUN(gf_6vect_dot_prod_sse_test)
-
RUN(gf_inverse_test)
RUN(gf_vect_dot_prod_base_test)
@@ -45,11 +35,6 @@ DEPENDS(
contrib/libs/isa-l/erasure_code/ut/erasure_code_test
contrib/libs/isa-l/erasure_code/ut/erasure_code_base_test
contrib/libs/isa-l/erasure_code/ut/erasure_code_update_test
- contrib/libs/isa-l/erasure_code/ut/gf_2vect_dot_prod_sse_test
- contrib/libs/isa-l/erasure_code/ut/gf_3vect_dot_prod_sse_test
- contrib/libs/isa-l/erasure_code/ut/gf_4vect_dot_prod_sse_test
- contrib/libs/isa-l/erasure_code/ut/gf_5vect_dot_prod_sse_test
- contrib/libs/isa-l/erasure_code/ut/gf_6vect_dot_prod_sse_test
contrib/libs/isa-l/erasure_code/ut/gf_inverse_test
contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_base_test
contrib/libs/isa-l/erasure_code/ut/gf_vect_dot_prod_test
@@ -64,11 +49,6 @@ RECURSE_FOR_TESTS(
erasure_code_test
erasure_code_base_test
erasure_code_update_test
- gf_2vect_dot_prod_sse_test
- gf_3vect_dot_prod_sse_test
- gf_4vect_dot_prod_sse_test
- gf_5vect_dot_prod_sse_test
- gf_6vect_dot_prod_sse_test
gf_inverse_test
gf_vect_dot_prod_base_test
gf_vect_dot_prod_test
diff --git a/contrib/libs/isa-l/erasure_code/ya.make b/contrib/libs/isa-l/erasure_code/ya.make
index a1c30ae5be..0f2c15a27f 100644
--- a/contrib/libs/isa-l/erasure_code/ya.make
+++ b/contrib/libs/isa-l/erasure_code/ya.make
@@ -4,7 +4,7 @@ LICENSE(BSD-3-Clause)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-VERSION(2.28)
+VERSION(2.31)
NO_UTIL()
@@ -17,11 +17,6 @@ ADDINCL(
contrib/libs/isa-l/include
)
-SRCS(
- ec_base.c
- ec_highlevel_func.c
-)
-
IF (ARCH_X86_64)
IF (OS_DARWIN)
SRCS(
@@ -34,52 +29,88 @@ ELSE()
ENDIF()
SRCS(
- gf_vect_mul_sse.asm
- gf_vect_mul_avx.asm
- gf_vect_dot_prod_sse.asm
- gf_vect_dot_prod_avx.asm
- gf_vect_dot_prod_avx2.asm
- gf_vect_dot_prod_avx512.asm
- gf_2vect_dot_prod_sse.asm
- gf_2vect_dot_prod_avx.asm
+ ec_base.c
+ ec_highlevel_func.c
gf_2vect_dot_prod_avx2.asm
+ gf_2vect_dot_prod_avx2_gfni.asm
gf_2vect_dot_prod_avx512.asm
- gf_3vect_dot_prod_sse.asm
- gf_3vect_dot_prod_avx.asm
- gf_3vect_dot_prod_avx2.asm
- gf_3vect_dot_prod_avx512.asm
- gf_4vect_dot_prod_sse.asm
- gf_4vect_dot_prod_avx.asm
- gf_4vect_dot_prod_avx2.asm
- gf_4vect_dot_prod_avx512.asm
- gf_5vect_dot_prod_sse.asm
- gf_5vect_dot_prod_avx.asm
- gf_5vect_dot_prod_avx2.asm
- gf_6vect_dot_prod_sse.asm
- gf_6vect_dot_prod_avx.asm
- gf_6vect_dot_prod_avx2.asm
- gf_vect_mad_sse.asm
- gf_vect_mad_avx.asm
- gf_vect_mad_avx2.asm
- gf_vect_mad_avx512.asm
- gf_2vect_mad_sse.asm
- gf_2vect_mad_avx.asm
+ gf_2vect_dot_prod_avx512_gfni.asm
+ gf_2vect_dot_prod_avx.asm
+ gf_2vect_dot_prod_sse.asm
gf_2vect_mad_avx2.asm
+ gf_2vect_mad_avx2_gfni.asm
gf_2vect_mad_avx512.asm
- gf_3vect_mad_sse.asm
- gf_3vect_mad_avx.asm
+ gf_2vect_mad_avx512_gfni.asm
+ gf_2vect_mad_avx.asm
+ gf_2vect_mad_sse.asm
+ gf_3vect_dot_prod_avx2.asm
+ gf_3vect_dot_prod_avx2_gfni.asm
+ gf_3vect_dot_prod_avx512.asm
+ gf_3vect_dot_prod_avx512_gfni.asm
+ gf_3vect_dot_prod_avx.asm
+ gf_3vect_dot_prod_sse.asm
gf_3vect_mad_avx2.asm
+ gf_3vect_mad_avx2_gfni.asm
gf_3vect_mad_avx512.asm
- gf_4vect_mad_sse.asm
- gf_4vect_mad_avx.asm
+ gf_3vect_mad_avx512_gfni.asm
+ gf_3vect_mad_avx.asm
+ gf_3vect_mad_sse.asm
+ gf_4vect_dot_prod_avx2.asm
+ gf_4vect_dot_prod_avx512.asm
+ gf_4vect_dot_prod_avx512_gfni.asm
+ gf_4vect_dot_prod_avx.asm
+ gf_4vect_dot_prod_sse.asm
gf_4vect_mad_avx2.asm
+ gf_4vect_mad_avx2_gfni.asm
gf_4vect_mad_avx512.asm
- gf_5vect_mad_sse.asm
- gf_5vect_mad_avx.asm
+ gf_4vect_mad_avx512_gfni.asm
+ gf_4vect_mad_avx.asm
+ gf_4vect_mad_sse.asm
+ gf_5vect_dot_prod_avx2.asm
+ gf_5vect_dot_prod_avx512.asm
+ gf_5vect_dot_prod_avx512_gfni.asm
+ gf_5vect_dot_prod_avx.asm
+ gf_5vect_dot_prod_sse.asm
gf_5vect_mad_avx2.asm
- gf_6vect_mad_sse.asm
- gf_6vect_mad_avx.asm
+ gf_5vect_mad_avx2_gfni.asm
+ gf_5vect_mad_avx512.asm
+ gf_5vect_mad_avx512_gfni.asm
+ gf_5vect_mad_avx.asm
+ gf_5vect_mad_sse.asm
+ gf_6vect_dot_prod_avx2.asm
+ gf_6vect_dot_prod_avx512.asm
+ gf_6vect_dot_prod_avx512_gfni.asm
+ gf_6vect_dot_prod_avx.asm
+ gf_6vect_dot_prod_sse.asm
gf_6vect_mad_avx2.asm
+ gf_6vect_mad_avx512.asm
+ gf_6vect_mad_avx512_gfni.asm
+ gf_6vect_mad_avx.asm
+ gf_6vect_mad_sse.asm
+ gf_vect_dot_prod_avx2.asm
+ gf_vect_dot_prod_avx2_gfni.asm
+ gf_vect_dot_prod_avx512.asm
+ gf_vect_dot_prod_avx512_gfni.asm
+ gf_vect_dot_prod_avx.asm
+ gf_vect_dot_prod_sse.asm
+ gf_vect_mad_avx2.asm
+ gf_vect_mad_avx2_gfni.asm
+ gf_vect_mad_avx512.asm
+ gf_vect_mad_avx512_gfni.asm
+ gf_vect_mad_avx.asm
+ gf_vect_mad_sse.asm
+ gf_vect_mul_avx.asm
+ gf_vect_mul_sse.asm
+)
+ELSEIF(ARCH_AARCH64)
+SRCS(
+ ec_base.c
+ aarch64/ec_aarch64_dispatcher.c
+ aarch64/ec_aarch64_highlevel_func.c
+)
+
+PEERDIR(
+ contrib/libs/isa-l/erasure_code/aarch64
)
ENDIF()
diff --git a/contrib/libs/isa-l/include/aarch64_label.h b/contrib/libs/isa-l/include/aarch64_label.h
new file mode 100644
index 0000000000..a4e6d0609c
--- /dev/null
+++ b/contrib/libs/isa-l/include/aarch64_label.h
@@ -0,0 +1,18 @@
+#ifndef __AARCH64_LABEL_H__
+#define __AARCH64_LABEL_H__
+
+#ifdef __USER_LABEL_PREFIX__
+#define CONCAT1(a, b) CONCAT2(a, b)
+#define CONCAT2(a, b) a ## b
+#define cdecl(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
+#else
+#define cdecl(x) x
+#endif
+
+#ifdef __APPLE__
+#define ASM_DEF_RODATA .section __TEXT,__const
+#else
+#define ASM_DEF_RODATA .section .rodata
+#endif
+
+#endif
diff --git a/contrib/libs/isa-l/include/aarch64_multibinary.h b/contrib/libs/isa-l/include/aarch64_multibinary.h
new file mode 100644
index 0000000000..6c77665fd6
--- /dev/null
+++ b/contrib/libs/isa-l/include/aarch64_multibinary.h
@@ -0,0 +1,347 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef __AARCH64_MULTIBINARY_H__
+#define __AARCH64_MULTIBINARY_H__
+#ifndef __aarch64__
+#error "This file is for aarch64 only"
+#endif
+#include "aarch64_label.h"
+#ifdef __ASSEMBLY__
+/**
+ * # mbin_interface : the wrapper layer for the isa-l API
+ *
+ * ## references:
+ * * https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=sysdeps/aarch64/dl-trampoline.S
+ * * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+ * * https://static.docs.arm.com/ihi0057/b/IHI0057B_aadwarf64.pdf?_ga=2.80574487.1870739014.1564969896-1634778941.1548729310
+ *
+ * ## Usage:
+ * 1. Define a dispatcher function
+ * 2. Its name must be \name\()_dispatcher
+ * 3. Its prototype should be *"void * \name\()_dispatcher"*
+ * 4. The dispatcher should return the right function pointer, revision and string information.
+ **/
+.macro mbin_interface name:req
+ .extern cdecl(\name\()_dispatcher)
+ .data
+ .balign 8
+ .global cdecl(\name\()_dispatcher_info)
+#ifndef __APPLE__
+ .type \name\()_dispatcher_info,%object
+#endif
+ cdecl(\name\()_dispatcher_info):
+ .quad \name\()_mbinit //func_entry: starts as the resolver below, overwritten with the dispatcher result
+#ifndef __APPLE__
+ .size \name\()_dispatcher_info,. - \name\()_dispatcher_info
+#endif
+ .balign 8
+ .text
+ \name\()_mbinit:
+ //save fp (x29) and lr (x30), allocating a 224-byte frame
+ .cfi_startproc
+ stp x29, x30, [sp, -224]!
+
+ //add cfi directive to avoid GDB bt cmds error
+ //set cfi(Call Frame Information)
+ .cfi_def_cfa_offset 224
+ .cfi_offset 29, -224
+ .cfi_offset 30, -216
+
+ //save parameter/result/indirect result registers
+ stp x8, x9, [sp, 16]
+ .cfi_offset 8, -208
+ .cfi_offset 9, -200
+ stp x0, x1, [sp, 32]
+ .cfi_offset 0, -192
+ .cfi_offset 1, -184
+ stp x2, x3, [sp, 48]
+ .cfi_offset 2, -176
+ .cfi_offset 3, -168
+ stp x4, x5, [sp, 64]
+ .cfi_offset 4, -160
+ .cfi_offset 5, -152
+ stp x6, x7, [sp, 80]
+ .cfi_offset 6, -144
+ .cfi_offset 7, -136
+ stp q0, q1, [sp, 96]
+ .cfi_offset 64, -128
+ .cfi_offset 65, -112
+ stp q2, q3, [sp, 128]
+ .cfi_offset 66, -96
+ .cfi_offset 67, -80
+ stp q4, q5, [sp, 160]
+ .cfi_offset 68, -64
+ .cfi_offset 69, -48
+ stp q6, q7, [sp, 192]
+ .cfi_offset 70, -32
+ .cfi_offset 71, -16
+
+ /**
+ * The dispatcher functions have the following prototype:
+ * void * function_dispatcher(void)
+ * The selected function pointer comes back in x0 and is stored below;
+ * the caller arguments saved above are then restored.
+ */
+
+
+ bl cdecl(\name\()_dispatcher)
+ //restore temp/indirect result registers
+ ldp x8, x9, [sp, 16]
+ .cfi_restore 8
+ .cfi_restore 9
+
+ // store the resolved function pointer (x0) into the slot addressed by x9
+ // (x9 is loaded with the \name\()_dispatcher_info address by the entry stub below)
+ str x0, [x9]
+
+ //restore parameter/result registers
+ ldp x0, x1, [sp, 32]
+ .cfi_restore 0
+ .cfi_restore 1
+ ldp x2, x3, [sp, 48]
+ .cfi_restore 2
+ .cfi_restore 3
+ ldp x4, x5, [sp, 64]
+ .cfi_restore 4
+ .cfi_restore 5
+ ldp x6, x7, [sp, 80]
+ .cfi_restore 6
+ .cfi_restore 7
+ ldp q0, q1, [sp, 96]
+ .cfi_restore 64
+ .cfi_restore 65
+ ldp q2, q3, [sp, 128]
+ .cfi_restore 66
+ .cfi_restore 67
+ ldp q4, q5, [sp, 160]
+ .cfi_restore 68
+ .cfi_restore 69
+ ldp q6, q7, [sp, 192]
+ .cfi_restore 70
+ .cfi_restore 71
+ //restore fp and lr, and release the 224-byte frame
+ ldp x29, x30, [sp], 224
+ //restore cfi setting
+ .cfi_restore 30
+ .cfi_restore 29
+ .cfi_def_cfa_offset 0
+ .cfi_endproc
+
+ //no ret/branch above: the resolver falls through into the public entry,
+ //which reloads the (now updated) pointer and branches to it
+ .global cdecl(\name)
+#ifndef __APPLE__
+ .type \name,%function
+#endif
+ .align 2
+ cdecl(\name\()):
+#ifndef __APPLE__
+ adrp x9, :got:\name\()_dispatcher_info
+ ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info]
+#else
+ adrp x9, cdecl(\name\()_dispatcher_info)@GOTPAGE
+ ldr x9, [x9, #cdecl(\name\()_dispatcher_info)@GOTPAGEOFF]
+#endif
+ //x9 = address of the dispatch slot, x10 = current target (resolver or resolved function)
+ ldr x10,[x9]
+ br x10
+#ifndef __APPLE__
+ .size \name,. - \name
+#endif
+.endm
+
+/**
+ * mbin_interface_base is used for the interfaces which have only
+ * noarch implementation
+ */
+.macro mbin_interface_base name:req, base:req
+ // \name : public symbol to define
+ // \base : implementation the dispatch slot is statically initialised with
+ .extern \base
+ .data
+ .balign 8
+ .global cdecl(\name\()_dispatcher_info)
+#ifndef __APPLE__
+ .type \name\()_dispatcher_info,%object
+#endif
+ cdecl(\name\()_dispatcher_info):
+ .quad \base //func_entry
+#ifndef __APPLE__
+ .size \name\()_dispatcher_info,. - \name\()_dispatcher_info
+#endif
+ .balign 8
+ .text
+ .global cdecl(\name)
+#ifndef __APPLE__
+ .type \name,%function
+#endif
+ .align 2
+ cdecl(\name\()):
+ // The GOT references must name the object defined above; the previous
+ // spelling carried an extra leading underscore and so referred to a
+ // symbol that is never defined.
+#ifndef __APPLE__
+ adrp x9, :got:\name\()_dispatcher_info
+ ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info]
+#else
+ adrp x9, cdecl(\name\()_dispatcher_info)@GOTPAGE
+ ldr x9, [x9, #cdecl(\name\()_dispatcher_info)@GOTPAGEOFF]
+#endif
+ // load the slot contents and tail-branch to the implementation
+ ldr x10,[x9]
+ br x10
+#ifndef __APPLE__
+ .size \name,. - \name
+#endif
+.endm
+
+#else /* __ASSEMBLY__ */
+#include <stdint.h>
+#if defined(__linux__)
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#elif defined(__APPLE__)
+#define SYSCTL_PMULL_KEY "hw.optional.arm.FEAT_PMULL" // from macOS 12 FEAT_* sysctl infos are available
+#define SYSCTL_CRC32_KEY "hw.optional.armv8_crc32"
+#define SYSCTL_SVE_KEY "hw.optional.arm.FEAT_SVE" // this one is just a guess and need to check macOS update
+#include <sys/sysctl.h>
+#include <stddef.h>
+/* Query the integer sysctl `name` via sysctlbyname(); returns the value read,
+ * or 0 when sysctlbyname() reports failure (non-zero status). */
+static inline int sysctlEnabled(const char* name){
+ int enabled;
+ size_t size = sizeof(enabled);
+ int status = sysctlbyname(name, &enabled, &size, NULL, 0);
+ return status ? 0 : enabled;
+}
+#endif
+
+
+#define DEFINE_INTERFACE_DISPATCHER(name) \
+ void * name##_dispatcher(void)
+
+#define PROVIDER_BASIC(name) \
+ PROVIDER_INFO(name##_base)
+
+#define DO_DIGNOSTIC(x) _Pragma GCC diagnostic ignored "-W"#x
+#define DO_PRAGMA(x) _Pragma (#x)
+#define DIGNOSTIC_IGNORE(x) DO_PRAGMA(GCC diagnostic ignored #x)
+#define DIGNOSTIC_PUSH() DO_PRAGMA(GCC diagnostic push)
+#define DIGNOSTIC_POP() DO_PRAGMA(GCC diagnostic pop)
+
+
+#define PROVIDER_INFO(_func_entry) \
+ ({ DIGNOSTIC_PUSH() \
+ DIGNOSTIC_IGNORE(-Wnested-externs) \
+ extern void _func_entry(void); \
+ DIGNOSTIC_POP() \
+ _func_entry; \
+ })
+
+/**
+ * Micro-architecture definitions
+ * Reference: https://developer.arm.com/docs/ddi0595/f/aarch64-system-registers/midr_el1
+ */
+
+#define CPU_IMPLEMENTER_RESERVE 0x00
+#define CPU_IMPLEMENTER_ARM 0x41
+
+
+#define CPU_PART_CORTEX_A57 0xD07
+#define CPU_PART_CORTEX_A72 0xD08
+#define CPU_PART_NEOVERSE_N1 0xD0C
+
+#define MICRO_ARCH_ID(imp,part) \
+ (((CPU_IMPLEMENTER_##imp&0xff)<<24)|((CPU_PART_##part&0xfff)<<4))
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1<<11)
+#endif
+
+/**
+ * @brief get_micro_arch_id
+ *
+ * Reads the micro-architecture ID register (MIDR_EL1) if possible. This function
+ * provides microarchitecture information and makes microarchitecture-specific
+ * optimization possible.
+ *
+ * Reading system registers (MRS) is forbidden in userspace. If executed, it
+ * will raise an illegal instruction error. The kernel provides a solution for
+ * this issue. The solution depends on the HWCAP_CPUID flag. Reference (1)
+ * describes how to use it. It provides an "illegal instruction" handler
+ * in kernel space; the handler will execute MRS and return the correct
+ * value to userspace.
+ *
+ * To avoid too many kernel traps, this function MUST only be called in the
+ * dispatcher. The HWCAP flags must also match; that makes sure there are no
+ * illegal instruction errors. HWCAP_CPUID should be available to get the
+ * best performance.
+ *
+ * NOTICE:
+ * - HWCAP_CPUID should be available. Otherwise it returns the reserved value
+ * - It MUST be called inside the dispatcher.
+ * - It MUST meet the HWCAP requirements
+ *
+ * Example:
+ * DEFINE_INTERFACE_DISPATCHER(crc32_iscsi)
+ * {
+ * unsigned long auxval = getauxval(AT_HWCAP);
+ * // This HWCAP check is mandatory.
+ * if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) {
+ * switch (get_micro_arch_id()) {
+ * case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_a57);
+ * case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_a72);
+ * case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_n1);
+ * default:
+ * return PROVIDER_INFO(crc32_pmull_crc_for_others);
+ * }
+ * }
+ * return PROVIDER_BASIC(crc32_iscsi);
+ * }
+ * KNOWN ISSUE:
+ * On a heterogeneous system (big.LITTLE), it will work but the performance
+ * might not be the best one as expected.
+ *
+ * If this function is called on the big core, it will return the function
+ * optimized for the big core.
+ *
+ * If execution is then scheduled to the little core. It will still work (1),
+ * but the function won't be optimized for the little core, thus the performance
+ * won't be as expected.
+ *
+ * References:
+ * - [CPU Feature detection](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm64/cpu-feature-registers.rst?h=v5.5)
+ *
+ */
+static inline uint32_t get_micro_arch_id(void)
+{
+ /* Stays at the reserved implementer value unless MIDR_EL1 is actually read. */
+ uint32_t id=CPU_IMPLEMENTER_RESERVE;
+#ifndef __APPLE__
+ /* Only issue the MRS when the kernel advertises HWCAP_CPUID. */
+ if ((getauxval(AT_HWCAP) & HWCAP_CPUID)) {
+ /** The MRS below traps into kernel space */
+ asm("mrs %0, MIDR_EL1 " : "=r" (id));
+ }
+#endif
+ /* Keep bits 31:24 and 15:4 only, the two fields MICRO_ARCH_ID() fills in. */
+ return id&0xff00fff0;
+}
+
+
+
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/contrib/libs/isa-l/include/erasure_code.h b/contrib/libs/isa-l/include/erasure_code.h
index 04fdfb1bc2..e361d7f4bb 100644
--- a/contrib/libs/isa-l/include/erasure_code.h
+++ b/contrib/libs/isa-l/include/erasure_code.h
@@ -74,6 +74,14 @@ extern "C" {
void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
/**
+ * @brief Initialize tables for fast Erasure Code encode and decode, runs baseline version.
+ *
+ * Baseline version of ec_init_tables() with same parameters.
+ */
+
+void ec_init_tables_base(int k, int rows, unsigned char* a, unsigned char* gftbls);
+
+/**
* @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
*
* Given a list of source data blocks, generate one or multiple blocks of
@@ -926,7 +934,10 @@ void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
/**
* @brief Invert a matrix in GF(2^8)
*
- * @param in input matrix
+ * Attempts to construct an n x n inverse of the input matrix. Returns non-zero
+ * if singular. Will always destroy input matrix in process.
+ *
+ * @param in input matrix, destroyed by invert process
* @param out output matrix such that [in] x [out] = [I] - identity matrix
* @param n size of matrix [nxn]
* @returns 0 successful, other fail on singular input matrix
diff --git a/contrib/libs/isa-l/include/gf_vect_mul.h b/contrib/libs/isa-l/include/gf_vect_mul.h
index 70a0ab2ed3..7cd954452e 100644
--- a/contrib/libs/isa-l/include/gf_vect_mul.h
+++ b/contrib/libs/isa-l/include/gf_vect_mul.h
@@ -140,10 +140,11 @@ void gf_vect_mul_init(unsigned char c, unsigned char* gftbl);
* only use 2nd element is used.
* @param src Pointer to src data array. Must be aligned to 32B.
* @param dest Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
*/
-void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src,
- unsigned char *dest);
+int gf_vect_mul_base(int len, unsigned char *a, unsigned char *src,
+ unsigned char *dest);
#ifdef __cplusplus
}
diff --git a/contrib/libs/isa-l/include/memcpy.asm b/contrib/libs/isa-l/include/memcpy.asm
new file mode 100644
index 0000000000..8ce39cc28b
--- /dev/null
+++ b/contrib/libs/isa-l/include/memcpy.asm
@@ -0,0 +1,769 @@
+;;
+;; Copyright (c) 2023, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef __MEMCPY_INC__
+%define __MEMCPY_INC__
+
+%include "reg_sizes.asm"
+
+; This section defines a series of macros to copy small to medium amounts
+; of data from memory to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: pointer to src (not modified)
+; SIZE : register: length in bytes (not modified)
+; TMP0 : 64-bit temp GPR (clobbered)
+; TMP1 : 64-bit temp GPR (clobbered)
+; XTMP0 : temp XMM (clobbered)
+; XTMP1 : temp XMM (clobbered)
+; XTMP2 : temp XMM (clobbered)
+; XTMP3 : temp XMM (clobbered)
+;
+; The name indicates the options. The name is of the form:
+; memcpy_<VEC>_<SZ><ZERO><RET>
+; where:
+; <VEC> is either "sse" or "avx" or "avx2"
+; <SZ> is either "64" or "128" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+; <RET> is blank or "_ret". If blank, the code falls through. If "ret"
+; it does a "ret" at the end
+;
+; For the avx2 versions, the temp XMM registers need to be YMM registers
+; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
+; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
+; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
+;
+; For example:
+; memcpy_sse_64 : SSE, 0 <= size < 64, falls through
+; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through
+; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret
+; memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
+;
+
+;; SSE wrappers. Each forwards its arguments to __memcpy_int and appends
+;; NOT0, MAXSIZE, USERET and USEAVX (always 0 here, selecting movdqu).
+%macro memcpy_sse_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
+%endm
+
+%macro memcpy_sse_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
+%endm
+
+%macro memcpy_sse_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
+%endm
+
+%macro memcpy_sse_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
+%endm
+
+%macro memcpy_sse_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
+%endm
+
+%macro memcpy_sse_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
+%endm
+
+%macro memcpy_sse_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
+%endm
+
+%macro memcpy_sse_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
+%endm
+
+;; 16-byte variants take no XMM temporaries; the four XTMP slots are passed empty.
+%macro memcpy_sse_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
+%endm
+
+%macro memcpy_sse_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; AVX wrappers. Each forwards its arguments to __memcpy_int and appends
+;; NOT0, MAXSIZE, USERET and USEAVX (always 1 here, selecting vmovdqu on XMM).
+%macro memcpy_avx_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
+%endm
+
+%macro memcpy_avx_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
+%endm
+
+%macro memcpy_avx_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
+%endm
+
+%macro memcpy_avx_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
+%endm
+
+%macro memcpy_avx_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
+%endm
+
+%macro memcpy_avx_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
+%endm
+
+%macro memcpy_avx_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
+%endm
+
+%macro memcpy_avx_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
+%endm
+
+;; 16-byte variants take no XMM temporaries; the four XTMP slots are passed empty.
+%macro memcpy_avx_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
+%endm
+
+%macro memcpy_avx_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; AVX2 wrappers. Each forwards its arguments to __memcpy_int and appends
+;; NOT0, MAXSIZE, USERET and USEAVX (always 2 here). The 64-byte variants take
+;; only two vector temporaries and pass "--" placeholders for XTMP2/XTMP3.
+%macro memcpy_avx2_64 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_64_1 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_64_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_64_1_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2
+%endm
+
+%macro memcpy_avx2_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Core of the memcpy_* wrappers above.
+;; Tests single bits of SIZE from the largest enabled size class downwards.
+;; For the first set bit it copies a fixed-size chunk anchored at the start of
+;; the buffer and another anchored at its end (SRC + SIZE - chunk), then
+;; finishes through %%DONE. The two chunks may overlap.
+%macro __memcpy_int 13
+%define %%DST %1 ; register: pointer to dst (not modified)
+%define %%SRC %2 ; register: pointer to src (not modified)
+%define %%SIZE %3 ; register: length in bytes (not modified)
+%define %%TMP0 %4 ; 64-bit temp GPR (clobbered)
+%define %%TMP1 %5 ; 64-bit temp GPR (clobbered)
+%define %%XTMP0 %6 ; temp XMM (clobbered)
+%define %%XTMP1 %7 ; temp XMM (clobbered)
+%define %%XTMP2 %8 ; temp XMM (clobbered)
+%define %%XTMP3 %9 ; temp XMM (clobbered)
+%define %%NOT0 %10 ; if not 0, then assume size cannot be zero
+%define %%MAXSIZE %11 ; 128, 64, etc
+%define %%USERET %12 ; if not 0, use "ret" at end
+%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2
+
+; %%DONE ends a size-class section: either return or jump to the macro end
+%if (%%USERET != 0)
+ %define %%DONE ret
+%else
+ %define %%DONE jmp %%end
+%endif
+
+; unaligned vector move, VEX-encoded for any AVX flavour
+%if (%%USEAVX != 0)
+ %define %%MOVDQU vmovdqu
+%else
+ %define %%MOVDQU movdqu
+%endif
+
+; SIZE bit 6 set: copy the first 64 and the last 64 bytes
+%if (%%MAXSIZE >= 128)
+ test %%SIZE, 64
+ jz %%lt64
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*32]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]
+
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + 1*32], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + 3*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + 2*16], %%XTMP2
+ %%MOVDQU [%%DST + 3*16], %%XTMP3
+
+ %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+; SIZE bit 5 set: copy the first 32 and the last 32 bytes
+%if (%%MAXSIZE >= 64)
+%%lt64:
+ test %%SIZE, 32
+ jz %%lt32
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+; SIZE bit 4 set: copy the first 16 and the last 16 bytes
+; (the AVX2 flavour addresses the XMM view of its temporaries via XWORD)
+%if (%%MAXSIZE >= 32)
+%%lt32:
+ test %%SIZE, 16
+ jz %%lt16
+ %if (%%USEAVX >= 2)
+ %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
+ %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
+ %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
+ %endif
+ %%DONE
+%endif
+
+; GPR-only 16-byte copy, followed by the 8..15 byte class at %%lt16.
+; NOTE(review): when MAXSIZE >= 32 the section above ends with %%DONE and no
+; label precedes the next six instructions, so they look reachable only for
+; MAXSIZE == 16 - confirm.
+%if (%%MAXSIZE >= 16)
+ test %%SIZE, 16
+ jz %%lt16
+ mov %%TMP0, [%%SRC]
+ mov %%TMP1, [%%SRC + 8]
+ mov [%%DST], %%TMP0
+ mov [%%DST + 8], %%TMP1
+%%lt16:
+ test %%SIZE, 8
+ jz %%lt8
+ mov %%TMP0, [%%SRC]
+ mov %%TMP1, [%%SRC + %%SIZE - 8]
+ mov [%%DST], %%TMP0
+ mov [%%DST + %%SIZE - 8], %%TMP1
+ %%DONE
+%endif
+
+; SIZE bit 2 set: copy the first 4 and the last 4 bytes
+%if (%%MAXSIZE >= 8)
+%%lt8:
+ test %%SIZE, 4
+ jz %%lt4
+ mov DWORD(%%TMP0), [%%SRC]
+ mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
+ mov [%%DST], DWORD(%%TMP0)
+ mov [%%DST + %%SIZE - 4], DWORD(%%TMP1)
+ %%DONE
+%endif
+
+; SIZE bit 1 set: copy the first 2 bytes as a word plus the last byte
+%if (%%MAXSIZE >= 4)
+%%lt4:
+ test %%SIZE, 2
+ jz %%lt2
+ movzx DWORD(%%TMP0), word [%%SRC]
+ movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
+ mov [%%DST], WORD(%%TMP0)
+ mov [%%DST + %%SIZE - 1], BYTE(%%TMP1)
+ %%DONE
+%endif
+
+; remaining single byte; the bit-0 test is skipped when NOT0 says SIZE is never zero
+%%lt2:
+%if (%%NOT0 == 0)
+ test %%SIZE, 1
+ jz %%end
+%endif
+ movzx DWORD(%%TMP0), byte [%%SRC]
+ mov [%%DST], BYTE(%%TMP0)
+%%end:
+%if (%%USERET != 0)
+ ret
+%endif
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Utility macro to assist with SIMD shifting
+;; Emits a byte-wise right shift of REG by IMM: legacy psrldq when VEC is
+;; the token SSE, otherwise the three-operand vpsrldq with REG as both
+;; destination and source.
+%macro _PSRLDQ 3
+%define %%VEC %1
+%define %%REG %2
+%define %%IMM %3
+
+%ifidn %%VEC, SSE
+ psrldq %%REG, %%IMM
+%else
+ vpsrldq %%REG, %%REG, %%IMM
+%endif
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; This section defines a series of macros to store small to medium amounts
+; of data from SIMD registers to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; simd_store_<VEC>[_15] DST, SRC, SIZE, TMP, IDX [, OFFSET]
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: src data (clobbered)
+; SIZE : register: length in bytes (not modified)
+; TMP : 64-bit temp GPR (clobbered)
+; IDX : 64-bit GPR to store dst index/offset (clobbered)
+; OFFSET : offset to be applied to destination pointer (optional)
+;
+; The name indicates the options. The name is of the form:
+; simd_store_<VEC> or simd_store_<VEC>_15
+; where <VEC> is the SIMD instruction type e.g. "sse" or "avx", and the
+; "_15" suffix limits the maximum stored length to 15 bytes instead of 16
+
+;; simd_store_sse DST, SRC, SIZE, TMP, IDX [, OFFSET]
+;; SSE flavour with a 16-byte maximum; forwards to __simd_store and passes
+;; OFFSET along only when the caller supplied it.
+%macro simd_store_sse 5-6
+%if %0 == 5
+ __simd_store %1,%2,%3,%4,%5,SSE,16
+%else
+ __simd_store %1,%2,%3,%4,%5,SSE,16,%6
+%endif
+%endm
+
+;; simd_store_avx DST, SRC, SIZE, TMP, IDX [, OFFSET]
+;; AVX flavour with a 16-byte maximum; forwards to __simd_store and passes
+;; OFFSET along only when the caller supplied it.
+%macro simd_store_avx 5-6
+%if %0 == 5
+ __simd_store %1,%2,%3,%4,%5,AVX,16
+%else
+ __simd_store %1,%2,%3,%4,%5,AVX,16,%6
+%endif
+%endm
+
+;; simd_store_sse_15 DST, SRC, SIZE, TMP, IDX [, OFFSET]
+;; SSE flavour with a 15-byte maximum (no 16-byte store path is emitted);
+;; forwards to __simd_store and passes OFFSET along only when supplied.
+%macro simd_store_sse_15 5-6
+%if %0 == 5
+ __simd_store %1,%2,%3,%4,%5,SSE,15
+%else
+ __simd_store %1,%2,%3,%4,%5,SSE,15,%6
+%endif
+%endm
+
+;; simd_store_avx_15 DST, SRC, SIZE, TMP, IDX [, OFFSET]
+;; AVX flavour with a 15-byte maximum (no 16-byte store path is emitted);
+;; forwards to __simd_store and passes OFFSET along only when supplied.
+%macro simd_store_avx_15 5-6
+%if %0 == 5
+ __simd_store %1,%2,%3,%4,%5,AVX,15
+%else
+ __simd_store %1,%2,%3,%4,%5,AVX,15,%6
+%endif
+%endm
+
+;; __simd_store DST, SRC, SIZE, TMP, IDX, SIMDTYPE, MAX_LEN [, OFFSET]
+;; Stores the low SIZE bytes of a SIMD register to memory by peeling off
+;; 16-, 8-, 4-, 2- and 1-byte pieces according to the bits set in SIZE.
+;; The running destination offset lives in IDX and starts at OFFSET (or 0).
+;; SRC, TMP, IDX and the flags are clobbered.
+%macro __simd_store 7-8
+%define %%OUT_PTR %1 ; register: pointer to dst (not modified)
+%define %%XDATA %2 ; register: src data (clobbered)
+%define %%LEN %3 ; register: length in bytes (not modified)
+%define %%GPR %4 ; 64-bit temp GPR (clobbered)
+%define %%POS %5 ; 64-bit temp GPR holding the running dst offset (clobbered)
+%define %%ISA %6 ; "SSE" or "AVX"
+%define %%CAP %7 ; maximum length to be stored
+%define %%START_OFF %8 ; offset to be applied to destination pointer
+
+%define %%SHIFT_DOWN _PSRLDQ %%ISA,
+
+%ifnidn %%ISA, SSE
+ %define %%MOV16 vmovdqu
+ %define %%MOV8 vmovq
+%else
+ %define %%MOV16 movdqu
+ %define %%MOV8 movq
+%endif
+
+;; determine max byte size for store operation
+%assign max_length_to_store %%CAP
+
+%if max_length_to_store > 16
+%error "__simd_store macro invoked with MAX_LEN bigger than 16!"
+%endif
+
+%if %0 == 7
+ xor %%POS, %%POS ; no OFFSET supplied: start at dst + 0
+%else
+ mov %%POS, %%START_OFF
+%endif
+
+%if max_length_to_store == 16
+ test %%LEN, 16
+ jz %%under_16
+ %%MOV16 [%%OUT_PTR + %%POS], %%XDATA ; whole register in one store
+ jmp %%done
+%%under_16:
+%endif
+
+%if max_length_to_store >= 8
+ test %%LEN, 8
+ jz %%under_8
+ %%MOV8 [%%OUT_PTR + %%POS], %%XDATA
+ %%SHIFT_DOWN %%XDATA, 8 ; bring the upper qword down to the low lane
+ add %%POS, 8
+%%under_8:
+%endif
+
+ %%MOV8 %%GPR, %%XDATA ; the remaining pieces are stored from a GPR
+
+%if max_length_to_store >= 4
+ test %%LEN, 4
+ jz %%under_4
+ mov [%%OUT_PTR + %%POS], DWORD(%%GPR)
+ shr %%GPR, 32 ; drop the dword just written
+ add %%POS, 4
+%%under_4:
+%endif
+
+ test %%LEN, 2
+ jz %%under_2
+ mov [%%OUT_PTR + %%POS], WORD(%%GPR)
+ shr %%GPR, 16 ; drop the word just written
+ add %%POS, 2
+%%under_2:
+ test %%LEN, 1
+ jz %%done
+ mov [%%OUT_PTR + %%POS], BYTE(%%GPR)
+%%done:
+%endm
+
+; This section defines a series of macros to load small to medium amounts
+; (from 0 to 16 bytes) of data from memory to SIMD registers,
+; where the size is variable but limited.
+;
+; The macros are all called as:
+; simd_load DST, SRC, SIZE
+; with the parameters defined as:
+; DST : register: destination XMM register
+; SRC : register: pointer to src data (not modified)
+; SIZE : register: length in bytes (not modified)
+;
+; The name indicates the options. The name is of the form:
+; simd_load_<VEC>_<SZ><ZERO>
+; where:
+; <VEC> is either "sse" or "avx"
+; <SZ> is either "15" or "16" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+;
+; For example:
+; simd_load_sse_16 : SSE, 0 <= size <= 16
+; simd_load_avx_15_1 : AVX, 1 <= size <= 15
+
+; Each wrapper below takes DST, SRC, SIZE and forwards them to __simd_load
+; together with three fixed arguments: ACCEPT_0, ACCEPT_16 and the SIMD type.
+
+; SSE, 1 <= size <= 15
+%macro simd_load_sse_15_1 3
+ __simd_load %1,%2,%3,0,0,SSE
+%endm
+; SSE, 0 <= size <= 15
+%macro simd_load_sse_15 3
+ __simd_load %1,%2,%3,1,0,SSE
+%endm
+; SSE, 1 <= size <= 16
+%macro simd_load_sse_16_1 3
+ __simd_load %1,%2,%3,0,1,SSE
+%endm
+; SSE, 0 <= size <= 16
+%macro simd_load_sse_16 3
+ __simd_load %1,%2,%3,1,1,SSE
+%endm
+
+; AVX, 1 <= size <= 15
+%macro simd_load_avx_15_1 3
+ __simd_load %1,%2,%3,0,0,AVX
+%endm
+; AVX, 0 <= size <= 15
+%macro simd_load_avx_15 3
+ __simd_load %1,%2,%3,1,0,AVX
+%endm
+; AVX, 1 <= size <= 16
+%macro simd_load_avx_16_1 3
+ __simd_load %1,%2,%3,0,1,AVX
+%endm
+; AVX, 0 <= size <= 16
+%macro simd_load_avx_16 3
+ __simd_load %1,%2,%3,1,1,AVX
+%endm
+
+;; __simd_load DST, SRC, SIZE, ACCEPT_0, ACCEPT_16, SIMDTYPE
+;; Loads SIZE bytes from [SRC] into the low bytes of DST and leaves the
+;; remaining bytes of DST zero. Memory is only read inside [SRC, SRC+SIZE).
+;;
+;; How it works: DST is cleared, then control jumps into a fall-through
+;; ladder of byte inserts. Sizes 9..15 insert bytes 8..SIZE-1 and finish
+;; with one 8-byte insert for bytes 0..7; sizes 1..7 insert bytes one by one.
+;;
+;; SRC and SIZE are only read (the zero check uses `test`, so SIZE is never
+;; written, whatever its operand width). Flags are clobbered.
+%macro __simd_load 6
+%define %%DST %1 ; [out] destination XMM register
+%define %%SRC %2 ; [in] pointer to src data
+%define %%SIZE %3 ; [in] length in bytes (0-16 bytes)
+%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0
+%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16
+%define %%SIMDTYPE %6 ; "SSE" or "AVX"
+
+%ifidn %%SIMDTYPE, SSE
+ %define %%MOVDQU movdqu
+ %define %%PINSRB pinsrb
+ %define %%PINSRQ pinsrq
+ %define %%PXOR pxor
+%else
+ %define %%MOVDQU vmovdqu
+ %define %%PINSRB vpinsrb
+ %define %%PINSRQ vpinsrq
+ %define %%PXOR vpxor
+%endif
+
+%if (%%ACCEPT_16 != 0)
+ test %%SIZE, 16
+ jz %%_skip_16
+ %%MOVDQU %%DST, [%%SRC] ; full 16 bytes: single unaligned load
+ jmp %%end_load
+
+%%_skip_16:
+%endif
+ %%PXOR %%DST, %%DST ; clear XMM register
+%if (%%ACCEPT_0 != 0)
+ test %%SIZE, %%SIZE ; SIZE == 0: leave DST zeroed
+ jz %%end_load
+%endif
+ ;; Dispatch on SIZE: each cmp separates one odd/even pair of sizes
+ cmp %%SIZE, 2
+ jb %%_size_1
+ je %%_size_2
+ cmp %%SIZE, 4
+ jb %%_size_3
+ je %%_size_4
+ cmp %%SIZE, 6
+ jb %%_size_5
+ je %%_size_6
+ cmp %%SIZE, 8
+ jb %%_size_7
+ je %%_size_8
+ cmp %%SIZE, 10
+ jb %%_size_9
+ je %%_size_10
+ cmp %%SIZE, 12
+ jb %%_size_11
+ je %%_size_12
+ cmp %%SIZE, 14
+ jb %%_size_13
+ je %%_size_14
+
+ ;; Sizes 9..15: insert the bytes above the low qword, falling through
+%%_size_15:
+ %%PINSRB %%DST, [%%SRC + 14], 14
+%%_size_14:
+ %%PINSRB %%DST, [%%SRC + 13], 13
+%%_size_13:
+ %%PINSRB %%DST, [%%SRC + 12], 12
+%%_size_12:
+ %%PINSRB %%DST, [%%SRC + 11], 11
+%%_size_11:
+ %%PINSRB %%DST, [%%SRC + 10], 10
+%%_size_10:
+ %%PINSRB %%DST, [%%SRC + 9], 9
+%%_size_9:
+ %%PINSRB %%DST, [%%SRC + 8], 8
+%%_size_8:
+ %%PINSRQ %%DST, [%%SRC], 0 ; bytes 0..7 in one insert
+ jmp %%end_load
+ ;; Sizes 1..7: byte-by-byte, falling through down to byte 0
+%%_size_7:
+ %%PINSRB %%DST, [%%SRC + 6], 6
+%%_size_6:
+ %%PINSRB %%DST, [%%SRC + 5], 5
+%%_size_5:
+ %%PINSRB %%DST, [%%SRC + 4], 4
+%%_size_4:
+ %%PINSRB %%DST, [%%SRC + 3], 3
+%%_size_3:
+ %%PINSRB %%DST, [%%SRC + 2], 2
+%%_size_2:
+ %%PINSRB %%DST, [%%SRC + 1], 1
+%%_size_1:
+ %%PINSRB %%DST, [%%SRC + 0], 0
+%%end_load:
+%endm
+
+;; simd_load_avx2 DST, SRC, SIZE, IDX, TMP
+;; Loads SIZE (0-32) bytes from [SRC] into the low bytes of the YMM register
+;; DST and leaves the remaining bytes zero. Memory is only read inside
+;; [SRC, SRC+SIZE).
+;;
+;; How it works: for SIZE > 16 the trailing SIZE-16 bytes (at SRC+16) are
+;; first gathered into the low XMM lane with the same insert ladder used for
+;; short loads. That lane is then moved to the upper half and the first 16
+;; bytes are loaded into the lower half.
+;;
+;; SRC and SIZE are only read (the zero check uses `test`, so SIZE is never
+;; written). IDX, TMP and the flags are clobbered. All length comparisons
+;; are unsigned.
+%macro simd_load_avx2 5
+%define %%DST %1 ; [out] destination YMM register
+%define %%SRC %2 ; [in] pointer to src data
+%define %%SIZE %3 ; [in] length in bytes (0-32 bytes)
+%define %%IDX %4 ; [clobbered] Temp GP register to store src idx
+%define %%TMP %5 ; [clobbered] Temp GP register
+
+ test %%SIZE, 32
+ jz %%_skip_32
+ vmovdqu %%DST, [%%SRC] ; full 32 bytes: single unaligned load
+ jmp %%end_load
+
+%%_skip_32:
+ vpxor %%DST, %%DST ; clear YMM register
+ test %%SIZE, %%SIZE ; SIZE == 0: leave DST zeroed
+ jz %%end_load
+
+ lea %%IDX, [%%SRC] ; IDX = start of the partial (<=16 byte) piece
+ mov %%TMP, %%SIZE ; TMP = length of that piece
+ cmp %%SIZE, 16
+ jbe %%_check_size ; unsigned: SIZE is a length
+
+ ;; SIZE in 17..31: the partial piece is the tail after the first 16 bytes
+ add %%IDX, 16
+ sub %%TMP, 16
+
+%%_check_size:
+ cmp %%TMP, 2
+ jb %%_size_1
+ je %%_size_2
+ cmp %%TMP, 4
+ jb %%_size_3
+ je %%_size_4
+ cmp %%TMP, 6
+ jb %%_size_5
+ je %%_size_6
+ cmp %%TMP, 8
+ jb %%_size_7
+ je %%_size_8
+ cmp %%TMP, 10
+ jb %%_size_9
+ je %%_size_10
+ cmp %%TMP, 12
+ jb %%_size_11
+ je %%_size_12
+ cmp %%TMP, 14
+ jb %%_size_13
+ je %%_size_14
+ cmp %%TMP, 15
+ je %%_size_15
+
+%%_size_16:
+ vmovdqu XWORD(%%DST), [%%IDX] ; exactly 16 bytes: one load, upper lane stays zero
+ jmp %%end_load
+%%_size_15:
+ vpinsrb XWORD(%%DST), [%%IDX + 14], 14
+%%_size_14:
+ vpinsrb XWORD(%%DST), [%%IDX + 13], 13
+%%_size_13:
+ vpinsrb XWORD(%%DST), [%%IDX + 12], 12
+%%_size_12:
+ vpinsrb XWORD(%%DST), [%%IDX + 11], 11
+%%_size_11:
+ vpinsrb XWORD(%%DST), [%%IDX + 10], 10
+%%_size_10:
+ vpinsrb XWORD(%%DST), [%%IDX + 9], 9
+%%_size_9:
+ vpinsrb XWORD(%%DST), [%%IDX + 8], 8
+%%_size_8:
+ vpinsrq XWORD(%%DST), [%%IDX], 0 ; bytes 0..7 of the piece in one insert
+ jmp %%_check_higher_16
+%%_size_7:
+ vpinsrb XWORD(%%DST), [%%IDX + 6], 6
+%%_size_6:
+ vpinsrb XWORD(%%DST), [%%IDX + 5], 5
+%%_size_5:
+ vpinsrb XWORD(%%DST), [%%IDX + 4], 4
+%%_size_4:
+ vpinsrb XWORD(%%DST), [%%IDX + 3], 3
+%%_size_3:
+ vpinsrb XWORD(%%DST), [%%IDX + 2], 2
+%%_size_2:
+ vpinsrb XWORD(%%DST), [%%IDX + 1], 1
+%%_size_1:
+ vpinsrb XWORD(%%DST), [%%IDX + 0], 0
+%%_check_higher_16:
+ test %%SIZE, 16
+ jz %%end_load
+
+ ; Move last bytes loaded to upper half and load 16 bytes in lower half
+ vinserti128 %%DST, XWORD(%%DST), 1
+ vinserti128 %%DST, [%%SRC], 0
+%%end_load:
+%endm
+
+;; simd_store_avx2 DST, SRC, SIZE, TMP, IDX
+;; Stores the low SIZE bytes of a YMM register to [DST] by peeling off
+;; 32-, 16-, 8-, 4-, 2- and 1-byte pieces according to the bits set in SIZE.
+;; SRC, TMP, IDX and the flags are clobbered.
+%macro simd_store_avx2 5
+%define %%OUT_PTR %1 ; register: pointer to dst (not modified)
+%define %%YDATA %2 ; register: src data (clobbered)
+%define %%LEN %3 ; register: length in bytes (not modified)
+%define %%GPR %4 ; 64-bit temp GPR (clobbered)
+%define %%POS %5 ; 64-bit temp GPR holding the running dst offset (clobbered)
+
+ xor %%POS, %%POS ; running offset starts at 0
+
+ test %%LEN, 32
+ jz %%under_32
+ vmovdqu [%%OUT_PTR], %%YDATA ; whole register in one store
+ jmp %%done
+%%under_32:
+
+ test %%LEN, 16
+ jz %%under_16
+ vmovdqu [%%OUT_PTR], XWORD(%%YDATA)
+ ; Move upper half to lower half for further stores
+ vperm2i128 %%YDATA, %%YDATA, %%YDATA, 0x81
+ add %%POS, 16
+%%under_16:
+
+ test %%LEN, 8
+ jz %%under_8
+ vmovq [%%OUT_PTR + %%POS], XWORD(%%YDATA)
+ vpsrldq XWORD(%%YDATA), 8 ; bring the next qword down to the low lane
+ add %%POS, 8
+%%under_8:
+
+ vmovq %%GPR, XWORD(%%YDATA) ; the remaining pieces are stored from a GPR
+
+ test %%LEN, 4
+ jz %%under_4
+ mov [%%OUT_PTR + %%POS], DWORD(%%GPR)
+ shr %%GPR, 32 ; drop the dword just written
+ add %%POS, 4
+%%under_4:
+
+ test %%LEN, 2
+ jz %%under_2
+ mov [%%OUT_PTR + %%POS], WORD(%%GPR)
+ shr %%GPR, 16 ; drop the word just written
+ add %%POS, 2
+%%under_2:
+ test %%LEN, 1
+ jz %%done
+ mov [%%OUT_PTR + %%POS], BYTE(%%GPR)
+%%done:
+%endm
+
+%endif ; ifndef __MEMCPY_INC__
diff --git a/contrib/libs/isa-l/include/multibinary.asm b/contrib/libs/isa-l/include/multibinary.asm
index 2cad1c51be..1a861a0376 100644
--- a/contrib/libs/isa-l/include/multibinary.asm
+++ b/contrib/libs/isa-l/include/multibinary.asm
@@ -69,12 +69,14 @@
mbin_def_ptr %1_mbinit
section .text
- global %1:ISAL_SYM_TYPE_FUNCTION
+ global %1, function
%1_mbinit:
+ endbranch
;;; only called the first time to setup hardware match
call %1_dispatch_init
;;; falls thru to execute the hw optimized code
%1:
+ endbranch
jmp mbin_ptr_sz [%1_dispatched]
%endmacro
@@ -152,8 +154,10 @@
; 1-> function name
; 2-> base function
; 3-> SSE4_1 and CLMUL optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX512/10 opt func
;;;;;
-%macro mbin_dispatch_init_clmul 3
+%macro mbin_dispatch_init_clmul 5
section .text
%1_dispatch_init:
push mbin_rsi
@@ -161,18 +165,55 @@
push mbin_rbx
push mbin_rcx
push mbin_rdx
+ push mbin_rdi
lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
mov eax, 1
cpuid
- lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func
-
- ; Test for SSE4.2
+ mov ebx, ecx ; save cpuid1.ecx
test ecx, FLAG_CPUID1_ECX_SSE4_1
jz _%1_init_done
test ecx, FLAG_CPUID1_ECX_CLMUL
- cmovne mbin_rsi, mbin_rbx
+ jz _%1_init_done
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetvb.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+%if AS_FEATURE_LEVEL >= 10
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ jne _%1_init_done
+
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ lea mbin_rbx, [%5 WRT_OPT] ; AVX512/10 opt
+ cmove mbin_rsi, mbin_rbx
+%endif
_%1_init_done:
+ pop mbin_rdi
pop mbin_rdx
pop mbin_rcx
pop mbin_rbx
@@ -390,10 +431,97 @@
pop mbin_rsi
ret
%endmacro
+
+;;;;;
+; mbin_dispatch_init8 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVX2 Update/07 opt func
+; 8-> AVX512 Update/10 opt func
+;
+; Emits %1_dispatch_init, which probes CPUID/XGETBV, picks the best
+; supported candidate and stores its address in [%1_dispatched].
+; Selection order (each step requires all previous ones):
+;   base -> SSE4_2 (%3) -> OS YMM state + AVX (%4) -> AVX2 (%5)
+;   then EITHER, if the OS enables ZMM state and the AVX512_G1 set is
+;   complete:    %6, upgraded to %8 when the AVX512_G2 set is complete
+;   OR otherwise: %7 when the AVX2_G2 set is complete
+; An AVX512 candidate is never selected unless AVX512_G1 is confirmed.
+; All registers used are saved and restored; only flags are clobbered.
+;;;;;
+%macro mbin_dispatch_init8 8
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ je _%1_init_done ; Use base function if no SSE4_2
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_check_avx2_g2 ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ jne _%1_check_avx2_g2 ; AVX512 base set incomplete: stay in the AVX2 family
+ lea mbin_rsi, [%6 WRT_OPT] ; AVX512/06 opt
+
+ ;; AVX512 Gen 2 upgrade (lea does not touch the flags set by cmp)
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ lea mbin_rbx, [%8 WRT_OPT] ; AVX512/10 opt
+ cmove mbin_rsi, mbin_rbx
+ jmp _%1_init_done
+
+ _%1_check_avx2_g2:
+ ;; Test for AVX2 Gen 2 (ecx still holds the unmodified cpuid7.ecx here)
+ and ecx, FLAGS_CPUID7_ECX_AVX2_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX2_G2
+ lea mbin_rbx, [%7 WRT_OPT] ; AVX2/7 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi ; publish the selected implementation
+ pop mbin_rsi
+ ret
+%endmacro
%else
%macro mbin_dispatch_init7 7
mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
%endmacro
+; Fallback mbin_dispatch_init8: keeps the 8-argument interface but forwards
+; only arguments 1-6 to mbin_dispatch_init6; arguments 7 and 8 are ignored.
+%macro mbin_dispatch_init8 8
+ mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
+%endmacro
%endif
%endif ; ifndef _MULTIBINARY_ASM_
diff --git a/contrib/libs/isa-l/include/reg_sizes.asm b/contrib/libs/isa-l/include/reg_sizes.asm
index fec6a8aafb..983f8b421d 100644
--- a/contrib/libs/isa-l/include/reg_sizes.asm
+++ b/contrib/libs/isa-l/include/reg_sizes.asm
@@ -30,14 +30,6 @@
%ifndef _REG_SIZES_ASM_
%define _REG_SIZES_ASM_
-%ifdef __NASM_VER__
-%ifidn __OUTPUT_FORMAT__, win64
-%error nasm not supported in windows
-%else
-%define endproc_frame
-%endif
-%endif
-
%ifndef AS_FEATURE_LEVEL
%define AS_FEATURE_LEVEL 4
%endif
@@ -75,6 +67,7 @@
%define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ)
%define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ)
+%define FLAGS_CPUID7_ECX_AVX2_G2 (FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ)
%define FLAG_XGETBV_EAX_XMM (1<<1)
%define FLAG_XGETBV_EAX_YMM (1<<2)
@@ -203,14 +196,44 @@
%define XWORD(reg) reg %+ x
+; When INTEL_CET_ENABLED is defined (NASM only, AS_FEATURE_LEVEL >= 10), emit a
+; .note.gnu.property section for ELF outputs. The dwords below decode as an
+; ELF note: namesz=4, descsz (0xc for elf32, 0x10 for elf64), type=5
+; (NT_GNU_PROPERTY_TYPE_0), name "GNU\0", followed by one property:
+; pr_type=0xc0000002 (GNU_PROPERTY_X86_FEATURE_1_AND), pr_datasz=4, value=3
+; (IBT | SHSTK). The elf64 variant appends one zero dword of padding so the
+; descriptor is 8-byte aligned.
+%ifdef INTEL_CET_ENABLED
+ %ifdef __NASM_VER__
+ %if AS_FEATURE_LEVEL >= 10
+ %ifidn __OUTPUT_FORMAT__,elf32
+section .note.gnu.property note alloc noexec align=4
+DD 0x00000004,0x0000000c,0x00000005,0x00554e47
+DD 0xc0000002,0x00000004,0x00000003
+ %endif
+ %ifidn __OUTPUT_FORMAT__,elf64
+section .note.gnu.property note alloc noexec align=8
+DD 0x00000004,0x00000010,0x00000005,0x00554e47
+DD 0xc0000002,0x00000004,0x00000003,0x00000000
+ %endif
+ %endif
+ %endif
+%endif
+
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
section .text
%endif
%ifidn __OUTPUT_FORMAT__,elf64
+ %define __x86_64__
section .note.GNU-stack noalloc noexec nowrite progbits
section .text
%endif
+; macho64 and win64 outputs are 64-bit targets too: flag them with __x86_64__
+%ifidn __OUTPUT_FORMAT__,macho64
+ %define __x86_64__
+%endif
+%ifidn __OUTPUT_FORMAT__,win64
+ %define __x86_64__
+%endif
+
+; endbranch: raw encoding of the CET indirect-branch landing pad, emitted as
+; bytes. f3 0f 1e fb is ENDBR32, f3 0f 1e fa is ENDBR64.
+%ifndef __x86_64__
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfb
+%else
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfa
+%endif
%ifdef REL_TEXT
%define WRT_OPT
@@ -220,29 +243,56 @@ section .text
%define WRT_OPT
%endif
+; mk_global SYMBOL [, TYPE [, VISIBILITY]]
+; Exports SYMBOL. Under NASM with macho64 or win64 output only the bare
+; symbol is declared; in every other case the declaration carries the
+; "SYMBOL:TYPE VISIBILITY" qualifiers.
+%macro mk_global 1-3
+ %ifndef __NASM_VER__
+ global %1:%2 %3
+ %elifidn __OUTPUT_FORMAT__, macho64
+ global %1
+ %elifidn __OUTPUT_FORMAT__, win64
+ global %1
+ %else
+ global %1:%2 %3
+ %endif
+%endmacro
+
+
+; NASM has no MS-style procedure-frame helpers, so provide stand-ins here:
+; plain instructions for the operations and empty expansions for the markers.
+%ifdef __NASM_VER__
+ ; endproc_frame expands to nothing for every NASM output format
+ %define endproc_frame
+
+ %ifidn __OUTPUT_FORMAT__, win64
+ ; end_prolog: marker only, nothing to emit
+ %define end_prolog
+
+ ; proc_frame NAME: just place the label
+ %macro proc_frame 1
+ %1:
+ %endmacro
+
+ ; alloc_stack BYTES: reserve stack space
+ %macro alloc_stack 1
+ sub rsp, %1
+ %endmacro
+
+ ; push_reg REG / rex_push_reg REG: plain push
+ %macro push_reg 1
+ push %1
+ %endmacro
+
+ %macro rex_push_reg 1
+ push %1
+ %endmacro
+
+ ; save_reg REG, OFFSET: spill a GPR to [rsp + OFFSET]
+ %macro save_reg 2
+ mov [rsp + %2], %1
+ %endmacro
+
+ ; save_xmm128 XMM, OFFSET: spill an XMM register to [rsp + OFFSET];
+ ; movdqa needs that address to be 16-byte aligned
+ %macro save_xmm128 2
+ movdqa [rsp + %2], %1
+ %endmacro
+ %endif
+%endif
+
%ifidn __OUTPUT_FORMAT__, macho64
%define elf64 macho64
mac_equ equ 1
- %ifdef __NASM_VER__
- %define ISAL_SYM_TYPE_FUNCTION
- %define ISAL_SYM_TYPE_DATA_INTERNAL
- %else
- %define ISAL_SYM_TYPE_FUNCTION function
- %define ISAL_SYM_TYPE_DATA_INTERNAL data internal
- %endif
-%else
- %define ISAL_SYM_TYPE_FUNCTION function
- %define ISAL_SYM_TYPE_DATA_INTERNAL data internal
%endif
-
-%macro slversion 4
- section .text
- global %1_slver_%2%3%4
- global %1_slver
- %1_slver:
- %1_slver_%2%3%4:
- dw 0x%4
- db 0x%3, 0x%2
-%endmacro
-
%endif ; ifndef _REG_SIZES_ASM_