author | yazevnul <yazevnul@yandex-team.ru> | 2022-02-10 16:46:48 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:48 +0300
commit | 9abfb1a53b7f7b791444d1378e645d8fad9b06ed (patch)
tree | 49e222ea1c5804306084bb3ae065bb702625360f /contrib/libs/base64/neon32/codec_neon32.c
parent | 8cbc307de0221f84c80c42dcbe07d40727537e2c (diff)
download | ydb-9abfb1a53b7f7b791444d1378e645d8fad9b06ed.tar.gz
Restoring authorship annotation for <yazevnul@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/base64/neon32/codec_neon32.c')
-rw-r--r-- | contrib/libs/base64/neon32/codec_neon32.c | 320
1 file changed, 160 insertions, 160 deletions
diff --git a/contrib/libs/base64/neon32/codec_neon32.c b/contrib/libs/base64/neon32/codec_neon32.c
index 05fcfc3e63..2c9ae02f75 100644
--- a/contrib/libs/base64/neon32/codec_neon32.c
+++ b/contrib/libs/base64/neon32/codec_neon32.c
@@ -1,160 +1,160 @@

Since this commit only restores authorship annotations, the 160 deleted lines are byte-for-byte identical to the 160 inserted lines; the restored file reads as follows:

#if (defined(__ARM_NEON) && !defined(__ARM_NEON__))
#define __ARM_NEON__
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif

#include "libbase64.h"
#include "codecs.h"

#if (defined(__arm__) && defined(__ARM_NEON__))

#define CMPGT(s,n)   vcgtq_u8((s), vdupq_n_u8(n))
#define CMPEQ(s,n)   vceqq_u8((s), vdupq_n_u8(n))
#define REPLACE(s,n) vandq_u8((s), vdupq_n_u8(n))
#define RANGE(s,a,b) vandq_u8(vcgeq_u8((s), vdupq_n_u8(a)), vcleq_u8((s), vdupq_n_u8(b)))

static inline uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	uint8x16x4_t out;

	// Divide bits of three input bytes over four output bytes:
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vorrq_u8(vshrq_n_u8(in.val[1], 4), vshlq_n_u8(in.val[0], 4));
	out.val[2] = vorrq_u8(vshrq_n_u8(in.val[2], 6), vshlq_n_u8(in.val[1], 2));
	out.val[3] = in.val[2];

	// Clear top two bits:
	out.val[0] = vandq_u8(out.val[0], vdupq_n_u8(0x3F));
	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
	out.val[3] = vandq_u8(out.val[3], vdupq_n_u8(0x3F));

	return out;
}

static inline uint8x16x4_t
enc_translate (uint8x16x4_t in)
{
	uint8x16x4_t mask1, mask2, mask3, mask4, out;

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To        Abs   Delta  Characters
	// 0  [0..25]   [65..90]  +65   +65    ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122] +71   +6     abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]  -4    -75    0123456789
	// 3  [62]      [43]      -19   -15    +
	// 4  [63]      [47]      -16   +3     /

	// Create cumulative masks for characters in sets [1,2,3,4], [2,3,4],
	// [3,4], and [4]:
	mask1.val[0] = CMPGT(in.val[0], 25);
	mask1.val[1] = CMPGT(in.val[1], 25);
	mask1.val[2] = CMPGT(in.val[2], 25);
	mask1.val[3] = CMPGT(in.val[3], 25);

	mask2.val[0] = CMPGT(in.val[0], 51);
	mask2.val[1] = CMPGT(in.val[1], 51);
	mask2.val[2] = CMPGT(in.val[2], 51);
	mask2.val[3] = CMPGT(in.val[3], 51);

	mask3.val[0] = CMPGT(in.val[0], 61);
	mask3.val[1] = CMPGT(in.val[1], 61);
	mask3.val[2] = CMPGT(in.val[2], 61);
	mask3.val[3] = CMPGT(in.val[3], 61);

	mask4.val[0] = CMPEQ(in.val[0], 63);
	mask4.val[1] = CMPEQ(in.val[1], 63);
	mask4.val[2] = CMPEQ(in.val[2], 63);
	mask4.val[3] = CMPEQ(in.val[3], 63);

	// All characters are at least in cumulative set 0, so add 'A':
	out.val[0] = vaddq_u8(in.val[0], vdupq_n_u8(65));
	out.val[1] = vaddq_u8(in.val[1], vdupq_n_u8(65));
	out.val[2] = vaddq_u8(in.val[2], vdupq_n_u8(65));
	out.val[3] = vaddq_u8(in.val[3], vdupq_n_u8(65));

	// For inputs which are also in any of the other cumulative sets,
	// add delta values against the previous set(s) to correct the shift:
	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask1.val[0], 6));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask1.val[1], 6));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask1.val[2], 6));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask1.val[3], 6));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask2.val[0], 75));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask2.val[1], 75));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask2.val[2], 75));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask2.val[3], 75));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask3.val[0], 15));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask3.val[1], 15));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask3.val[2], 15));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask3.val[3], 15));

	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask4.val[0], 3));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask4.val[1], 3));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask4.val[2], 3));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask4.val[3], 3));

	return out;
}

#endif

// Stride size is so large on these NEON 32-bit functions
// (48 bytes encode, 32 bytes decode) that we inline the
// uint32 codec to stay performant on smaller inputs.

void
neon32_base64_stream_encode
	( struct neon32_base64_state *state
	, const char *src
	, size_t srclen
	, char *out
	, size_t *outlen
	)
{
#if (defined(__arm__) && defined(__ARM_NEON__))
	#include "enc_head.c"
	#include "enc_neon.c"
	#include "enc_uint32.c"
	#include "enc_tail.c"
#else
	(void)state;
	(void)src;
	(void)srclen;
	(void)out;
	(void)outlen;
	abort();
#endif
}

int
neon32_base64_stream_decode
	( struct neon32_base64_state *state
	, const char *src
	, size_t srclen
	, char *out
	, size_t *outlen
	)
{
#if (defined(__arm__) && defined(__ARM_NEON__))
	#include "dec_head.c"
	#include "dec_neon.c"
	#include "dec_uint32.c"
	#include "dec_tail.c"
#else
	(void)state;
	(void)src;
	(void)srclen;
	(void)out;
	(void)outlen;
	abort();
#endif
}
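For readers unpacking the intrinsics above: each of the 16 NEON lanes performs ordinary per-byte arithmetic. The following is a minimal scalar sketch of that math, not part of libbase64; the names scalar_reshuffle and scalar_translate are illustrative only. It splits one 3-byte group into four 6-bit values the same way enc_reshuffle does across lanes, then applies the same cumulative deltas (+65, +6, -75, -15, +3) as enc_translate.

#include <stdint.h>
#include <stdio.h>

// Step 1: split three 8-bit bytes into four 6-bit indices, mirroring
// the shift/or/mask sequence in enc_reshuffle().
static void
scalar_reshuffle (const uint8_t in[3], uint8_t out[4])
{
	out[0] = in[0] >> 2;                           // top 6 bits of byte 0
	out[1] = ((in[0] << 4) | (in[1] >> 4)) & 0x3F; // low 2 of byte 0, top 4 of byte 1
	out[2] = ((in[1] << 2) | (in[2] >> 6)) & 0x3F; // low 4 of byte 1, top 2 of byte 2
	out[3] = in[2] & 0x3F;                         // low 6 bits of byte 2
}

// Step 2: map a 6-bit index to its Base64 character with the same
// cumulative corrections as enc_translate(): start at 'A' (+65), then
// adjust as the index crosses each of the five character sets.
static uint8_t
scalar_translate (uint8_t v)
{
	uint8_t c = v + 65;   // set 0: 'A'..'Z'
	if (v > 25) c += 6;   // sets 1+: 'a' starts at 97 = 65 + 26 + 6
	if (v > 51) c -= 75;  // sets 2+: '0'..'9'
	if (v > 61) c -= 15;  // sets 3+: '+'
	if (v == 63) c += 3;  // set 4: '/'
	return c;
}

int
main (void)
{
	const uint8_t in[3] = { 'M', 'a', 'n' };
	uint8_t idx[4];
	scalar_reshuffle(in, idx);
	for (int i = 0; i < 4; i++) {
		putchar(scalar_translate(idx[i]));
	}
	putchar('\n'); // prints "TWFu"
	return 0;
}

On the classic input "Man" this prints TWFu, consistent with the five-set table: indices 19, 22, and 5 fall in set 0 ('T', 'W', 'F'), and index 46 falls in set 1 ('u').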