Moved yql/utils YQL-19206

Также была выделена жирная зависимость из yql/utils в yql/utils/network, в результате library/cpp/getopt была добавлена явно в те проекты, которые ее ранее наследовали, а не указывали явно commit_hash:36aa4c41f807b4cbbf70a3ed7ac0a1a5079bb75d
author: vvvv <vvvv@yandex-team.com> 2024-11-01 15:41:40 +0300
committer: vvvv <vvvv@yandex-team.com> 2024-11-01 15:55:52 +0300
commit: 3325f745e67f7f442790822b5c9c5e9996708be7 (patch)
tree: f7318d68bbe8990092715436444b05297ce35777 /yql/essentials/utils/utf8.cpp
parent: 6dce3f1c71786f2694b73b1a5155efc58f4557dd (diff)
download: ydb-3325f745e67f7f442790822b5c9c5e9996708be7.tar.gz
1 files changed, 260 insertions, 0 deletions
diff --git a/yql/essentials/utils/utf8.cpp b/yql/essentials/utils/utf8.cpp
new file mode 100644
index 0000000000..af284849a8
--- /dev/null
+++ b/yql/essentials/utils/utf8.cpp
@@ -0,0 +1,260 @@
+#include "utf8.h"
+
+#include <util/charset/wide.h>
+
+#include <ctype.h>
+#include <vector>
+
+namespace NYql {
+
+namespace {
+
+unsigned char GetRange(unsigned char c) {
+    // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+    // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
+    static const unsigned char type[] = {
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
+        0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
+        0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
+        0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
+        8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+    };
+    return type[c];
+}
+
+struct TByteRange {
+    ui8 First = 0;
+    ui8 Last = 0;
+};
+
+struct TUtf8Ranges {
+    size_t BytesCount = 0;
+    TByteRange Bytes[4] = {};
+};
+
+// see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8
+inline static const std::vector<TUtf8Ranges> Utf8Ranges = {
+    { 1, { {0x00, 0x7f}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, } },
+    { 2, { {0xc2, 0xdf}, {0x80, 0xbf}, {0x00, 0x00}, {0x00, 0x00}, } },
+    { 3, { {0xe0, 0xe0}, {0xa0, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
+    { 3, { {0xe1, 0xec}, {0x80, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
+    { 3, { {0xed, 0xed}, {0x80, 0x9f}, {0x80, 0xbf}, {0x00, 0x00}, } },
+    { 3, { {0xee, 0xef}, {0x80, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
+    { 4, { {0xf0, 0xf0}, {0x90, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}, } },
+    { 4, { {0xf1, 0xf3}, {0x80, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}, } },
+    { 4, { {0xf4, 0xf4}, {0x80, 0x8f}, {0x80, 0xbf}, {0x80, 0xbf}, } },
+};
+
+std::optional<std::string> RoundBadUtf8(size_t range, std::string_view inputString, size_t pos,
+    bool roundDown)
+{
+    Y_ENSURE(range > 0);
+    Y_ENSURE(range < Utf8Ranges.size());
+
+    const std::string prefix{inputString.substr(0, pos)};
+    std::string_view suffix = inputString.substr(pos, Utf8Ranges[range].BytesCount);
+
+    std::string newSuffix;
+    if (roundDown) {
+        std::optional<size_t> lastNonMin;
+        for (size_t i = 0; i < suffix.size(); ++i) {
+            if (Utf8Ranges[range].Bytes[i].First < ui8(suffix[i])) {
+                lastNonMin = i;
+            }
+        }
+
+        if (!lastNonMin) {
+            for (size_t i = 0; i < Utf8Ranges[range - 1].BytesCount; ++i) {
+                newSuffix.push_back(Utf8Ranges[range - 1].Bytes[i].Last);
+            }
+        } else {
+            for (size_t i = 0; i < Utf8Ranges[range].BytesCount; ++i) {
+                if (i < *lastNonMin) {
+                    ui8 c = suffix[i];
+                    newSuffix.push_back(c);
+                } else if (i == *lastNonMin) {
+                    ui8 c = suffix[i];
+                    newSuffix.push_back(std::min<ui8>(c - 1, Utf8Ranges[range].Bytes[i].Last));
+                } else {
+                    newSuffix.push_back(Utf8Ranges[range].Bytes[i].Last);
+                }
+            }
+        }
+    } else {
+        std::optional<size_t> lastNonMax;
+        bool valid = true;
+        for (size_t i = 0; i < suffix.size(); ++i) {
+            ui8 last = Utf8Ranges[range].Bytes[i].Last;
+            ui8 first = Utf8Ranges[range].Bytes[i].First;
+            ui8 curr = ui8(suffix[i]);
+
+            valid = valid && curr <= last && curr >= first;
+            if (curr < last) {
+                lastNonMax = i;
+            }
+        }
+
+        if (valid) {
+            newSuffix = suffix;
+            for (size_t i = suffix.size(); i < Utf8Ranges[range].BytesCount; ++i) {
+                newSuffix.push_back(Utf8Ranges[range].Bytes[i].First);
+            }
+        } else if (!lastNonMax) {
+            return NextValidUtf8(prefix);
+        } else {
+            for (size_t i = 0; i < Utf8Ranges[range].BytesCount; ++i) {
+                if (i < *lastNonMax) {
+                    ui8 c = suffix[i];
+                    newSuffix.push_back(c);
+                } else if (i == *lastNonMax) {
+                    ui8 c = suffix[i];
+                    newSuffix.push_back(std::max<ui8>(c + 1, Utf8Ranges[range].Bytes[i].First));
+                } else {
+                    newSuffix.push_back(Utf8Ranges[range].Bytes[i].First);
+                }
+            }
+        }
+
+    }
+    return prefix + newSuffix;
+}
+
+}
+
+bool IsUtf8(const std::string_view& str) {
+    for (auto it = str.cbegin(); str.cend() != it;) {
+#define COPY() if (str.cend() != it) { c = *it++; } else { return false; }
+#define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
+#define TAIL() COPY(); TRANS(0x70)
+        auto c = *it++;
+        if (!(c & 0x80))
+            continue;
+
+        bool result = true;
+        switch (GetRange(static_cast<unsigned char>(c))) {
+        case 2: TAIL(); break;
+        case 3: TAIL(); TAIL(); break;
+        case 4: COPY(); TRANS(0x50); TAIL(); break;
+        case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); break;
+        case 6: TAIL(); TAIL(); TAIL(); break;
+        case 10: COPY(); TRANS(0x20); TAIL(); break;
+        case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); break;
+        default: return false;
+        }
+
+        if (!result) return false;
+#undef COPY
+#undef TRANS
+#undef TAIL
+    }
+    return true;
+}
+
+unsigned char WideCharSize(char head) {
+    switch (GetRange(static_cast<unsigned char>(head))) {
+        case 0: return 1;
+        case 2: return 2;
+        case 3: return 3;
+        case 4: return 3;
+        case 5: return 4;
+        case 6: return 4;
+        case 10: return 3;
+        case 11: return 4;
+        default: return 0;
+    }
+}
+
+std::optional<std::string> RoundToNearestValidUtf8(const std::string_view& str, bool roundDown) {
+    const size_t ss = str.size();
+    for (size_t pos = 0; pos < ss; ) {
+        ui8 c = str[pos];
+
+        for (size_t i = 0; i < Utf8Ranges.size(); ++i) {
+            auto& range = Utf8Ranges[i];
+
+            if (c < range.Bytes[0].First) {
+                return RoundBadUtf8(i, str, pos, roundDown);
+            }
+
+            if (c <= range.Bytes[0].Last) {
+                // valid UTF8 code point start
+                for (size_t j = 1; j < range.BytesCount; ++j) {
+                    if (pos + j >= ss) {
+                        return RoundBadUtf8(i, str, pos, roundDown);
+                    }
+                    ui8 cur = str[pos + j];
+                    if (!(cur >= range.Bytes[j].First && cur <= range.Bytes[j].Last)) {
+                        return RoundBadUtf8(i, str, pos, roundDown);
+                    }
+                }
+
+                pos += range.BytesCount;
+                break;
+            } else if (i + 1 == Utf8Ranges.size()) {
+                if (!roundDown) {
+                    return NextValidUtf8(str.substr(0,  pos));
+                }
+                return RoundBadUtf8(i, str, pos, roundDown);
+            }
+        }
+    }
+    return std::string(str);
+}
+
+std::optional<std::string> NextValidUtf8(const std::string_view& str) {
+    Y_ENSURE(IsUtf8(str));
+    TUtf32String wide = UTF8ToUTF32<false>(str);
+    bool incremented = false;
+    size_t toDrop = 0;
+    for (auto it = wide.rbegin(); it != wide.rend(); ++it) {
+        auto& c = *it;
+        if (c < 0x10ffff) {
+            c = (c == 0xd7ff) ? 0xe000 : (c + 1);
+            incremented = true;
+            break;
+        } else {
+            ++toDrop;
+        }
+    }
+
+    if (!incremented) {
+        return {};
+    }
+
+    Y_ENSURE(toDrop < wide.size());
+    wide.resize(wide.size() - toDrop);
+
+    TString result = WideToUTF8(wide);
+    return std::string(result.data(), result.size());
+}
+
+std::optional<std::string> NextLexicographicString(const std::string_view& str) {
+    bool incremented = false;
+    size_t toDrop = 0;
+    std::string result{str};
+    for (auto it = result.rbegin(); it != result.rend(); ++it) {
+        auto& c = *it;
+        if (ui8(c) < 0xff) {
+            ++c;
+            incremented = true;
+            break;
+        } else {
+            ++toDrop;
+        }
+    }
+
+    if (!incremented) {
+        return {};
+    }
+
+    Y_ENSURE(toDrop < result.size());
+    result.resize(result.size() - toDrop);
+    return result;
+}
+
+}
author	vvvv <vvvv@yandex-team.com>	2024-11-01 15:41:40 +0300
committer	vvvv <vvvv@yandex-team.com>	2024-11-01 15:55:52 +0300
commit	3325f745e67f7f442790822b5c9c5e9996708be7 (patch)
tree	f7318d68bbe8990092715436444b05297ce35777 /yql/essentials/utils/utf8.cpp
parent	6dce3f1c71786f2694b73b1a5155efc58f4557dd (diff)
download	ydb-3325f745e67f7f442790822b5c9c5e9996708be7.tar.gz