Restoring authorship annotation for <[email protected]>. Commit 2 of 2.

author: mihaild <[email protected]> 2022-02-10 16:46:59 +0300
committer: Daniil Cherednik <[email protected]> 2022-02-10 16:46:59 +0300
commit: 55fa8c7df8dba9a6fda8a807e529a9d04bd88580 (patch)
tree: b83306b6e37edeea782e9eed673d89286c4fef35 /util/charset/wide_sse41.cpp
parent: 246417ad6168d3f7ab4a0cf1c79ba4259f7c45ae (diff)
1 files changed, 39 insertions, 39 deletions
diff --git a/util/charset/wide_sse41.cpp b/util/charset/wide_sse41.cpp
index 6859e9c44c4..d1f2a74851e 100644
--- a/util/charset/wide_sse41.cpp
+++ b/util/charset/wide_sse41.cpp
@@ -1,6 +1,6 @@
-#include <util/charset/wide.h> 
+#include <util/charset/wide.h>
 #include <util/system/types.h>
- 
+
 #ifdef SSE41_STUB
 
 namespace NDetail {
@@ -13,21 +13,21 @@ namespace NDetail {
 #else
 
     #include <util/system/compiler.h>
- 
+
     #include <cstring>
     #include <emmintrin.h>
     #include <smmintrin.h>
- 
-//processes to the first error, or until less then 16 bytes left 
-//most code taken from https://woboq.com/blog/utf-8-processing-using-simd.html 
- 
+
+//processes to the first error, or until less then 16 bytes left
+//most code taken from https://woboq.com/blog/utf-8-processing-using-simd.html
+
 //return dstAdvance 0 in case of problems
 static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned char*& cur, __m128i& utf16Low, __m128i& utf16High) {
     unsigned char curAligned[16];
- 
+
     memcpy(curAligned, cur, sizeof(__m128i));
     __m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i*>(curAligned));
- 
+
     //only ascii characters - simple copy
     if (!_mm_movemask_epi8(chunk)) {
         utf16Low = _mm_unpacklo_epi8(chunk, _mm_setzero_si128());
@@ -35,68 +35,68 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch
         cur += 16;
         return 16;
     }
- 
+
     __m128i chunkSigned = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
     __m128i isAsciiMask = _mm_cmpgt_epi8(chunk, _mm_set1_epi8(0));
- 
+
     __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunkSigned);
     __m128i state = _mm_set1_epi8(0x0 | (char)0x80);
- 
+
     __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunkSigned);
     state = _mm_blendv_epi8(state, _mm_set1_epi8(0x2 | (char)0xc0), cond2);
- 
+
     int sourceAdvance;
     __m128i shifts;
     __m128i chunkLow, chunkHigh;
- 
+
     if (Y_LIKELY(!_mm_movemask_epi8(cond3))) {
         //main case: no blocks of size 3 or 4
- 
+
         //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
         __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
- 
+
         __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
- 
+
         shifts = countSub1;
         __m128i continuation1 = _mm_slli_si128(countSub1, 1);
- 
+
         shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
         shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));
- 
+
         __m128i counts = _mm_or_si128(count, continuation1);
- 
+
         __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
         __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuation1, _mm_set1_epi8(0));
         __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);
         //each symbol should be exactly one of ascii, continuation or begin
         __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
- 
+
         if (_mm_movemask_epi8(okMask) != 0xFFFF) {
             return 0;
         }
- 
+
         shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));
- 
+
         __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
         shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));
- 
+
         chunk = _mm_andnot_si128(mask, chunk);                                    // from now on, we only have useful bits
         shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1
- 
+
         __m128i chunk_right = _mm_slli_si128(chunk, 1);
         shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                                  _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));
- 
+
         chunkLow = _mm_blendv_epi8(chunk,
                                    _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))),
                                    _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));
- 
+
         chunkHigh = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));
- 
+
         shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                                  _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
         chunkHigh = _mm_srli_epi32(chunkHigh, 2);
- 
+
         shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                                  _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
 
@@ -166,15 +166,15 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch
         chunkHigh = _mm_or_si128(chunkHigh,
                                  _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4), _mm_set1_epi8(0xf0)),
                                                mask3));
- 
+
         int c = _mm_extract_epi16(counts, 7);
         sourceAdvance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15
                                                          : 14;
     }
- 
+
     shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                              _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));
- 
+
     chunkHigh = _mm_slli_si128(chunkHigh, 1);
 
     __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
@@ -206,16 +206,16 @@ namespace NDetail {
 
             _mm_store_si128(reinterpret_cast<__m128i*>(destAligned), utf16Low);
             _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 1, utf16High);
-            memcpy(dest, destAligned, sizeof(__m128i) * 2); 
+            memcpy(dest, destAligned, sizeof(__m128i) * 2);
             dest += dstAdvance;
         }
         //The rest will be handled sequentially.
         // Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence
     }
- 
+
     void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar32*& dest) noexcept {
         alignas(16) wchar32 destAligned[16];
- 
+
         while (cur + 16 <= last) {
             __m128i utf16Low;
             __m128i utf16High;
@@ -238,10 +238,10 @@ namespace NDetail {
 
             memcpy(dest, destAligned, sizeof(__m128i) * 4);
             dest += dstAdvance;
-        } 
-        //The rest will be handled sequencially. 
+        }
+        //The rest will be handled sequencially.
         // Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence
-    } 
-} 
+    }
+}
 
 #endif
author	mihaild <[email protected]>	2022-02-10 16:46:59 +0300
committer	Daniil Cherednik <[email protected]>	2022-02-10 16:46:59 +0300
commit	55fa8c7df8dba9a6fda8a807e529a9d04bd88580 (patch)
tree	b83306b6e37edeea782e9eed673d89286c4fef35 /util/charset/wide_sse41.cpp
parent	246417ad6168d3f7ab4a0cf1c79ba4259f7c45ae (diff)