Restoring authorship annotation for <yazevnul@yandex-team.ru>. Commit 2 of 2.

author: yazevnul <yazevnul@yandex-team.ru> 2022-02-10 16:46:48 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:46:48 +0300
commit: 9abfb1a53b7f7b791444d1378e645d8fad9b06ed (patch)
tree: 49e222ea1c5804306084bb3ae065bb702625360f /contrib/libs/base64/neon32/dec_neon.c
parent: 8cbc307de0221f84c80c42dcbe07d40727537e2c (diff)
download: ydb-9abfb1a53b7f7b791444d1378e645d8fad9b06ed.tar.gz
1 files changed, 77 insertions, 77 deletions
diff --git a/contrib/libs/base64/neon32/dec_neon.c b/contrib/libs/base64/neon32/dec_neon.c
index 30d846a916..713d8ca9a4 100644
--- a/contrib/libs/base64/neon32/dec_neon.c
+++ b/contrib/libs/base64/neon32/dec_neon.c
@@ -1,77 +1,77 @@
-// If we have NEON support, pick off 64 bytes at a time for as long as we can. 
-// Unlike the SSE codecs, we don't write trailing zero bytes to output, so we 
-// don't need to check if we have enough remaining input to cover them: 
-while (srclen >= 64) 
-{ 
-	uint8x16x4_t set1, set2, set3, set4, set5, set6, set7, delta; 
-	uint8x16x3_t dec; 
- 
-	// Load 64 bytes and deinterleave: 
-	uint8x16x4_t str = vld4q_u8((uint8_t *)c); 
- 
-	// The input consists of six character sets in the Base64 alphabet, 
-	// which we need to map back to the 6-bit values they represent. 
-	// There are three ranges, two singles, and then there's the rest. 
-	// 
-	//  #  From       To        Add  Characters 
-	//  1  [43]       [62]      +19  + 
-	//  2  [47]       [63]      +16  / 
-	//  3  [48..57]   [52..61]   +4  0..9 
-	//  4  [65..90]   [0..25]   -65  A..Z 
-	//  5  [97..122]  [26..51]  -71  a..z 
-	// (6) Everything else => invalid input 
- 
-	// Benchmarking on the Raspberry Pi 2B and Clang shows that looping 
-	// generates slightly faster code than explicit unrolling: 
-	for (int i = 0; i < 4; i++) { 
-		set1.val[i] = CMPEQ(str.val[i], '+'); 
-		set2.val[i] = CMPEQ(str.val[i], '/'); 
-		set3.val[i] = RANGE(str.val[i], '0', '9'); 
-		set4.val[i] = RANGE(str.val[i], 'A', 'Z'); 
-		set5.val[i] = RANGE(str.val[i], 'a', 'z'); 
-		set6.val[i] = CMPEQ(str.val[i], '-'); 
-		set7.val[i] = CMPEQ(str.val[i], '_'); 
- 
-		delta.val[i] = REPLACE(set1.val[i], 19); 
-		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set2.val[i],  16)); 
-		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set3.val[i],   4)); 
-		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set4.val[i], -65)); 
-		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set5.val[i], -71)); 
-		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set6.val[i], 17)); 
-		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set7.val[i], -32)); 
-	} 
- 
-	// Check for invalid input: if any of the delta values are zero, 
-	// fall back on bytewise code to do error checking and reporting: 
-	uint8x16_t classified = CMPEQ(delta.val[0], 0); 
-	classified = vorrq_u8(classified, CMPEQ(delta.val[1], 0)); 
-	classified = vorrq_u8(classified, CMPEQ(delta.val[2], 0)); 
-	classified = vorrq_u8(classified, CMPEQ(delta.val[3], 0)); 
- 
-	// Extract both 32-bit halves; check that all bits are zero: 
-	if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0 
-	 || vgetq_lane_u32((uint32x4_t)classified, 1) != 0 
-	 || vgetq_lane_u32((uint32x4_t)classified, 2) != 0 
-	 || vgetq_lane_u32((uint32x4_t)classified, 3) != 0) { 
-		break; 
-	} 
- 
-	// Now simply add the delta values to the input: 
-	str.val[0] = vaddq_u8(str.val[0], delta.val[0]); 
-	str.val[1] = vaddq_u8(str.val[1], delta.val[1]); 
-	str.val[2] = vaddq_u8(str.val[2], delta.val[2]); 
-	str.val[3] = vaddq_u8(str.val[3], delta.val[3]); 
- 
-	// Compress four bytes into three: 
-	dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4); 
-	dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2); 
-	dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3]; 
- 
-	// Interleave and store decoded result: 
-	vst3q_u8((uint8_t *)o, dec); 
- 
-	c += 64; 
-	o += 48; 
-	outl += 48; 
-	srclen -= 64; 
-} 
+// If we have NEON support, pick off 64 bytes at a time for as long as we can.
+// Unlike the SSE codecs, we don't write trailing zero bytes to output, so we
+// don't need to check if we have enough remaining input to cover them:
+while (srclen >= 64)
+{
+	uint8x16x4_t set1, set2, set3, set4, set5, set6, set7, delta;
+	uint8x16x3_t dec;
+
+	// Load 64 bytes and deinterleave:
+	uint8x16x4_t str = vld4q_u8((uint8_t *)c);
+
+	// The input consists of six character sets in the Base64 alphabet,
+	// which we need to map back to the 6-bit values they represent.
+	// There are three ranges, two singles, and then there's the rest.
+	//
+	//  #  From       To        Add  Characters
+	//  1  [43]       [62]      +19  +
+	//  2  [47]       [63]      +16  /
+	//  3  [48..57]   [52..61]   +4  0..9
+	//  4  [65..90]   [0..25]   -65  A..Z
+	//  5  [97..122]  [26..51]  -71  a..z
+	// (6) Everything else => invalid input
+
+	// Benchmarking on the Raspberry Pi 2B and Clang shows that looping
+	// generates slightly faster code than explicit unrolling:
+	for (int i = 0; i < 4; i++) {
+		set1.val[i] = CMPEQ(str.val[i], '+');
+		set2.val[i] = CMPEQ(str.val[i], '/');
+		set3.val[i] = RANGE(str.val[i], '0', '9');
+		set4.val[i] = RANGE(str.val[i], 'A', 'Z');
+		set5.val[i] = RANGE(str.val[i], 'a', 'z');
+		set6.val[i] = CMPEQ(str.val[i], '-');
+		set7.val[i] = CMPEQ(str.val[i], '_');
+
+		delta.val[i] = REPLACE(set1.val[i], 19);
+		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set2.val[i],  16));
+		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set3.val[i],   4));
+		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set4.val[i], -65));
+		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set5.val[i], -71));
+		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set6.val[i], 17));
+		delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set7.val[i], -32));
+	}
+
+	// Check for invalid input: if any of the delta values are zero,
+	// fall back on bytewise code to do error checking and reporting:
+	uint8x16_t classified = CMPEQ(delta.val[0], 0);
+	classified = vorrq_u8(classified, CMPEQ(delta.val[1], 0));
+	classified = vorrq_u8(classified, CMPEQ(delta.val[2], 0));
+	classified = vorrq_u8(classified, CMPEQ(delta.val[3], 0));
+
+	// Extract both 32-bit halves; check that all bits are zero:
+	if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0
+	 || vgetq_lane_u32((uint32x4_t)classified, 1) != 0
+	 || vgetq_lane_u32((uint32x4_t)classified, 2) != 0
+	 || vgetq_lane_u32((uint32x4_t)classified, 3) != 0) {
+		break;
+	}
+
+	// Now simply add the delta values to the input:
+	str.val[0] = vaddq_u8(str.val[0], delta.val[0]);
+	str.val[1] = vaddq_u8(str.val[1], delta.val[1]);
+	str.val[2] = vaddq_u8(str.val[2], delta.val[2]);
+	str.val[3] = vaddq_u8(str.val[3], delta.val[3]);
+
+	// Compress four bytes into three:
+	dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
+	dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
+	dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];
+
+	// Interleave and store decoded result:
+	vst3q_u8((uint8_t *)o, dec);
+
+	c += 64;
+	o += 48;
+	outl += 48;
+	srclen -= 64;
+}
author	yazevnul <yazevnul@yandex-team.ru>	2022-02-10 16:46:48 +0300
committer	Daniil Cherednik <dcherednik@yandex-team.ru>	2022-02-10 16:46:48 +0300
commit	9abfb1a53b7f7b791444d1378e645d8fad9b06ed (patch)
tree	49e222ea1c5804306084bb3ae065bb702625360f /contrib/libs/base64/neon32/dec_neon.c
parent	8cbc307de0221f84c80c42dcbe07d40727537e2c (diff)
download	ydb-9abfb1a53b7f7b791444d1378e645d8fad9b06ed.tar.gz