author	Alexander Smirnov <alex@ydb.tech>	2024-10-16 12:11:24 +0000
committer	Alexander Smirnov <alex@ydb.tech>	2024-10-16 12:11:24 +0000
commit	40811e93f3fdf9342a9295369994012420fac548 (patch)
tree	a8d85e094a9c21e10aa250f537c101fc2016a049 /contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S
parent	30ebe5357bb143648c6be4d151ecd4944af81ada (diff)
parent	28a0c4a9f297064538a018c512cd9bbd00a1a35d (diff)
download	ydb-40811e93f3fdf9342a9295369994012420fac548.tar.gz
Merge branch 'rightlib' into mergelibs-241016-1210
Diffstat (limited to 'contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S')
-rw-r--r--	contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S	194
1 file changed, 194 insertions(+), 0 deletions(-)
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S
new file mode 100644
index 0000000000..89ec89f5c6
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S
@@ -0,0 +1,194 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_4vect_mad_sve)
+#ifndef __APPLE__
+.type gf_4vect_mad_sve, %function
+#endif
+
+/* gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
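+
+/* Editorial note on the method: each 32-byte entry of gftbls holds two
+   16-entry lookup tables, the gf(2^8) products of one coefficient with
+   every low-nibble value and with every high-nibble value. The loop
+   below multiplies src by four such coefficients this way and xors
+   (gf add) the results into the four dest buffers. */
+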
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest4 .req x9
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4
+z_tmp_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* the low 64 bits of v8..v15 are callee-saved (AAPCS64); using z17..z29
+   for the remaining tables avoids any save/restore */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+
+cdecl(gf_4vect_mad_sve):
+	/* return fail if len is less than 16 bytes */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+	/* point x_tbl at table 1: the table for source x_vec_i, dest 1 */
+	add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+	/* load table 1 (32 bytes) with one ldp of two q registers */
+	ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+ /* load table 4 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl]
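+
+	/* layout implied by the strides above: gftbls holds x_vec 32-byte
+	   tables per dest, one dest after another, so the table for source
+	   x_vec_i of dest n sits at offset (n * x_vec + x_vec_i) * 32 */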
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+ ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
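+	/* whilelo enables one predicate lane per byte still to process;
+	   once x_pos >= x_len no lane is active and b.none exits, so no
+	   scalar tail loop is needed */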
+
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+	/* split each byte into 4-bit low and high nibbles */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
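+	/* a gf(2^8) multiply by a constant c distributes over the nibble
+	   split: c * ((hi << 4) ^ lo) = c*(hi << 4) ^ c*lo, so two
+	   16-entry table lookups xor'ed together yield the full product */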
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+ prfb pldl2strm, p0, [x_dest4, x_pos]
+
+ /* dest1 */
+	/* table lookup, i.e. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+	/* exclusive or, i.e. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos]
+ ld1b z_dest4.b, p0/z, [x_dest4, x_pos]
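+	/* the dest3/dest4 loads are interleaved with the dest1 arithmetic
+	   above to overlap memory access with computation */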
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+
+ /* dest4 */
+ tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_tmp_lo.d, z_dest4.d
+ eor z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ /* increment one vector length */
+ incb x_pos
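+	/* incb advances x_pos by the hardware vector length in bytes,
+	   keeping the loop vector-length agnostic */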
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
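
Usage note: a minimal caller sketch, not part of the diff. It assumes
ISA-L's ec_init_tables() produces the gftbls layout the strides above
imply (32 bytes per coefficient, x_vec tables per dest, dests stored
consecutively); the function and buffer names and sizes are illustrative,
and the int return mirrors the 0-pass/1-fail status the assembly places
in w0.

    /* sketch: fold one source vector into four parity buffers */
    void ec_init_tables(int k, int rows, unsigned char *a,
                        unsigned char *gftbls);
    int gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
                         unsigned char *src, unsigned char **dest);

    enum { K = 10, ROWS = 4, LEN = 4096 };   /* LEN must be >= 16 */

    int encode_one_source(unsigned char *a,  /* K * ROWS coefficients */
                          unsigned char *src,
                          unsigned char **parity) /* 4 buffers of LEN bytes */
    {
        static unsigned char gftbls[K * ROWS * 32];
        ec_init_tables(K, ROWS, a, gftbls);  /* expand coefficients to tables */
        /* xor src's contribution (as source index 0) into parity[0..3] */
        return gf_4vect_mad_sve(LEN, K, 0, gftbls, src, parity);
    }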