diff options
author | vvvv <[email protected]> | 2025-10-06 11:26:09 +0300 |
---|---|---|
committer | vvvv <[email protected]> | 2025-10-06 11:53:26 +0300 |
commit | 60f45e69a4d7dbc6131208e16c45faf35aa5a985 (patch) | |
tree | 4daa45b52c295a178c7620e4c93921465fcf7950 /yql/essentials/utils/utf8.cpp | |
parent | 1bded1a65a7e6e9171418f3e1c691d390125b64e (diff) |
YQL-20086 utils
init
commit_hash:54feccd520ebd0ab23612bc0cb830914dff9d0e8
Diffstat (limited to 'yql/essentials/utils/utf8.cpp')
-rw-r--r-- | yql/essentials/utils/utf8.cpp | 104 |
1 files changed, 77 insertions, 27 deletions
diff --git a/yql/essentials/utils/utf8.cpp b/yql/essentials/utils/utf8.cpp index af284849a83..d47743e2b7a 100644 --- a/yql/essentials/utils/utf8.cpp +++ b/yql/essentials/utils/utf8.cpp @@ -12,6 +12,7 @@ namespace { unsigned char GetRange(unsigned char c) { // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types. + // clang-format off static const unsigned char type[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -24,6 +25,7 @@ unsigned char GetRange(unsigned char c) { 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, }; + // clang-format on return type[c]; } @@ -38,6 +40,7 @@ struct TUtf8Ranges { }; // see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8 +// clang-format off inline static const std::vector<TUtf8Ranges> Utf8Ranges = { { 1, { {0x00, 0x7f}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, } }, { 2, { {0xc2, 0xdf}, {0x80, 0xbf}, {0x00, 0x00}, {0x00, 0x00}, } }, @@ -49,9 +52,10 @@ inline static const std::vector<TUtf8Ranges> Utf8Ranges = { { 4, { {0xf1, 0xf3}, {0x80, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}, } }, { 4, { {0xf4, 0xf4}, {0x80, 0x8f}, {0x80, 0xbf}, {0x80, 0xbf}, } }, }; +// clang-format on std::optional<std::string> RoundBadUtf8(size_t range, std::string_view inputString, size_t pos, - bool roundDown) + bool roundDown) { Y_ENSURE(range > 0); Y_ENSURE(range < Utf8Ranges.size()); @@ -119,35 +123,72 @@ std::optional<std::string> RoundBadUtf8(size_t range, std::string_view inputStri } } } - } return prefix + newSuffix; } -} +} // namespace bool IsUtf8(const std::string_view& str) { for (auto it = str.cbegin(); str.cend() != it;) { -#define COPY() if (str.cend() != it) { c = *it++; } else { return false; } +#define COPY() \ + if (str.cend() != it) { \ + c = *it++; \ + } else { \ + return false; \ + } #define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0) -#define TAIL() COPY(); TRANS(0x70) +#define TAIL() \ + COPY(); \ + TRANS(0x70) auto c = *it++; - if (!(c & 0x80)) + if (!(c & 0x80)) { continue; + } bool result = true; switch (GetRange(static_cast<unsigned char>(c))) { - case 2: TAIL(); break; - case 3: TAIL(); TAIL(); break; - case 4: COPY(); TRANS(0x50); TAIL(); break; - case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); break; - case 6: TAIL(); TAIL(); TAIL(); break; - case 10: COPY(); TRANS(0x20); TAIL(); break; - case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); break; - default: return false; + case 2: + TAIL(); + break; + case 3: + TAIL(); + TAIL(); + break; + case 4: + COPY(); + TRANS(0x50); + TAIL(); + break; + case 5: + COPY(); + TRANS(0x10); + TAIL(); + TAIL(); + break; + case 6: + TAIL(); + TAIL(); + TAIL(); + break; + case 10: + COPY(); + TRANS(0x20); + TAIL(); + break; + case 11: + COPY(); + TRANS(0x60); + TAIL(); + TAIL(); + break; + default: + return false; } - if (!result) return false; + if (!result) { + return false; + } #undef COPY #undef TRANS #undef TAIL @@ -157,21 +198,30 @@ bool IsUtf8(const std::string_view& str) { unsigned char WideCharSize(char head) { switch (GetRange(static_cast<unsigned char>(head))) { - case 0: return 1; - case 2: return 2; - case 3: return 3; - case 4: return 3; - case 5: return 4; - case 6: return 4; - case 10: return 3; - case 11: return 4; - default: return 0; + case 0: + return 1; + case 2: + return 2; + case 3: + return 3; + case 4: + return 3; + case 5: + return 4; + case 6: + return 4; + case 10: + return 3; + case 11: + return 4; + default: + return 0; } } std::optional<std::string> RoundToNearestValidUtf8(const std::string_view& str, bool roundDown) { const size_t ss = str.size(); - for (size_t pos = 0; pos < ss; ) { + for (size_t pos = 0; pos < ss;) { ui8 c = str[pos]; for (size_t i = 0; i < Utf8Ranges.size(); ++i) { @@ -197,7 +247,7 @@ std::optional<std::string> RoundToNearestValidUtf8(const std::string_view& str, break; } else if (i + 1 == Utf8Ranges.size()) { if (!roundDown) { - return NextValidUtf8(str.substr(0, pos)); + return NextValidUtf8(str.substr(0, pos)); } return RoundBadUtf8(i, str, pos, roundDown); } @@ -257,4 +307,4 @@ std::optional<std::string> NextLexicographicString(const std::string_view& str) return result; } -} +} // namespace NYql |