aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp
diff options
context:
space:
mode:
authorvitalyisaev <vitalyisaev@yandex-team.com>2023-06-29 10:00:50 +0300
committervitalyisaev <vitalyisaev@yandex-team.com>2023-06-29 10:00:50 +0300
commit6ffe9e53658409f212834330e13564e4952558f6 (patch)
tree85b1e00183517648b228aafa7c8fb07f5276f419 /contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp
parent726057070f9c5a91fc10fde0d5024913d10f1ab9 (diff)
downloadydb-6ffe9e53658409f212834330e13564e4952558f6.tar.gz
YQ Connector: support managed ClickHouse
Со стороны dqrun можно обратиться к инстансу коннектора, который работает на streaming стенде, и извлечь данные из облачного CH.
Diffstat (limited to 'contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp')
-rw-r--r--contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp550
1 files changed, 550 insertions, 0 deletions
diff --git a/contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp b/contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp
new file mode 100644
index 00000000000..accebf1098a
--- /dev/null
+++ b/contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp
@@ -0,0 +1,550 @@
+//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
+//-*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements functions to map the name or alias of a unicode
+// character to its codepoint.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Unicode.h"
+
+namespace llvm {
+namespace sys {
+namespace unicode {
+
+extern const char *UnicodeNameToCodepointDict;
+extern const uint8_t *UnicodeNameToCodepointIndex;
+extern const std::size_t UnicodeNameToCodepointIndexSize;
+extern const std::size_t UnicodeNameToCodepointLargestNameSize;
+
+using BufferType = SmallString<64>;
+
+struct Node {
+ bool IsRoot = false;
+ char32_t Value = 0xFFFFFFFF;
+ uint32_t ChildrenOffset = 0;
+ bool HasSibling = false;
+ uint32_t Size = 0;
+ StringRef Name;
+ const Node *Parent = nullptr;
+
+ constexpr bool isValid() const {
+ return !Name.empty() || Value == 0xFFFFFFFF;
+ }
+ constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
+
+ std::string fullName() const {
+ std::string S;
+ // Reserve enough space for most unicode code points.
+ // The chosen value represent the 99th percentile of name size as of
+ // Unicode 15.0.
+ S.reserve(46);
+ const Node *N = this;
+ while (N) {
+ std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
+ N = N->Parent;
+ }
+ std::reverse(S.begin(), S.end());
+ return S;
+ }
+};
+
+static Node createRoot() {
+ Node N;
+ N.IsRoot = true;
+ N.ChildrenOffset = 1;
+ N.Size = 1;
+ return N;
+}
+
+static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
+ if (Offset == 0)
+ return createRoot();
+
+ uint32_t Origin = Offset;
+ Node N;
+ N.Parent = Parent;
+ uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
+ if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
+ return N;
+
+ bool LongName = NameInfo & 0x40;
+ bool HasValue = NameInfo & 0x80;
+ std::size_t Size = NameInfo & ~0xC0;
+ if (LongName) {
+ uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
+ NameOffset |= UnicodeNameToCodepointIndex[Offset++];
+ N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
+ } else {
+ N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
+ }
+ if (HasValue) {
+ uint8_t H = UnicodeNameToCodepointIndex[Offset++];
+ uint8_t M = UnicodeNameToCodepointIndex[Offset++];
+ uint8_t L = UnicodeNameToCodepointIndex[Offset++];
+ N.Value = ((H << 16) | (M << 8) | L) >> 3;
+
+ bool HasChildren = L & 0x02;
+ N.HasSibling = L & 0x01;
+
+ if (HasChildren) {
+ N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
+ N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
+ N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
+ }
+ } else {
+ uint8_t H = UnicodeNameToCodepointIndex[Offset++];
+ N.HasSibling = H & 0x80;
+ bool HasChildren = H & 0x40;
+ H &= uint8_t(~0xC0);
+ if (HasChildren) {
+ N.ChildrenOffset = (H << 16);
+ N.ChildrenOffset |=
+ (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
+ N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
+ }
+ }
+ N.Size = Offset - Origin;
+ return N;
+}
+
+static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
+ std::size_t &Consummed, char &PreviousCharInName,
+ char &PreviousCharInNeedle, bool IsPrefix = false) {
+
+ Consummed = 0;
+ if (Strict) {
+ if (!Name.startswith(Needle))
+ return false;
+ Consummed = Needle.size();
+ return true;
+ }
+ if (Needle.empty())
+ return true;
+
+ auto NamePos = Name.begin();
+ auto NeedlePos = Needle.begin();
+
+ char PreviousCharInNameOrigin = PreviousCharInName;
+ char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
+
+ auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
+ bool IgnoreEnd = false) {
+ while (It != End) {
+ const auto Next = std::next(It);
+ // Ignore spaces, underscore, medial hyphens
+ // https://unicode.org/reports/tr44/#UAX44-LM2.
+ bool Ignore =
+ *It == ' ' || *It == '_' ||
+ (*It == '-' && isAlnum(PreviousChar) &&
+ ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
+ PreviousChar = *It;
+ if (!Ignore)
+ break;
+ ++It;
+ }
+ return It;
+ };
+
+ while (true) {
+ NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
+ NeedlePos =
+ IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
+ if (NeedlePos == Needle.end())
+ break;
+ if (NamePos == Name.end())
+ break;
+ if (toUpper(*NeedlePos) != toUpper(*NamePos))
+ break;
+ NeedlePos++;
+ NamePos++;
+ }
+ Consummed = std::distance(Name.begin(), NamePos);
+ if (NeedlePos != Needle.end()) {
+ PreviousCharInName = PreviousCharInNameOrigin;
+ PreviousCharInNeedle = PreviousCharInNeedleOrigin;
+ }
+ return NeedlePos == Needle.end();
+}
+
+static std::tuple<Node, bool, uint32_t>
+compareNode(uint32_t Offset, StringRef Name, bool Strict,
+ char PreviousCharInName, char PreviousCharInNeedle,
+ BufferType &Buffer, const Node *Parent = nullptr) {
+ Node N = readNode(Offset, Parent);
+ std::size_t Consummed = 0;
+ bool DoesStartWith =
+ N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
+ PreviousCharInName, PreviousCharInNeedle);
+ if (!DoesStartWith)
+ return std::make_tuple(N, false, 0);
+
+ if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
+ return std::make_tuple(N, true, N.Value);
+
+ if (N.hasChildren()) {
+ uint32_t ChildOffset = N.ChildrenOffset;
+ for (;;) {
+ Node C;
+ bool Matches;
+ uint32_t Value;
+ std::tie(C, Matches, Value) =
+ compareNode(ChildOffset, Name.substr(Consummed), Strict,
+ PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
+ if (Matches) {
+ std::reverse_copy(C.Name.begin(), C.Name.end(),
+ std::back_inserter(Buffer));
+ return std::make_tuple(N, true, Value);
+ }
+ ChildOffset += C.Size;
+ if (!C.HasSibling)
+ break;
+ }
+ }
+ return std::make_tuple(N, false, 0);
+}
+
+static std::tuple<Node, bool, uint32_t>
+compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
+ return compareNode(Offset, Name, Strict, 0, 0, Buffer);
+}
+
+// clang-format off
+constexpr const char *const HangulSyllables[][3] = {
+ { "G", "A", "" },
+ { "GG", "AE", "G" },
+ { "N", "YA", "GG" },
+ { "D", "YAE", "GS" },
+ { "DD", "EO", "N", },
+ { "R", "E", "NJ" },
+ { "M", "YEO", "NH" },
+ { "B", "YE", "D" },
+ { "BB", "O", "L" },
+ { "S", "WA", "LG" },
+ { "SS", "WAE", "LM" },
+ { "", "OE", "LB" },
+ { "J", "YO", "LS" },
+ { "JJ", "U", "LT" },
+ { "C", "WEO", "LP" },
+ { "K", "WE", "LH" },
+ { "T", "WI", "M" },
+ { "P", "YU", "B" },
+ { "H", "EU", "BS" },
+ { 0, "YI", "S" },
+ { 0, "I", "SS" },
+ { 0, 0, "NG" },
+ { 0, 0, "J" },
+ { 0, 0, "C" },
+ { 0, 0, "K" },
+ { 0, 0, "T" },
+ { 0, 0, "P" },
+ { 0, 0, "H" }
+ };
+// clang-format on
+
+// Unicode 15.0
+// 3.12 Conjoining Jamo Behavior Common constants
+constexpr const char32_t SBase = 0xAC00;
+constexpr const uint32_t LCount = 19;
+constexpr const uint32_t VCount = 21;
+constexpr const uint32_t TCount = 28;
+
+static std::size_t findSyllable(StringRef Name, bool Strict,
+ char &PreviousInName, int &Pos, int Column) {
+ assert(Column == 0 || Column == 1 || Column == 2);
+ static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
+ char NeedleStart = 0;
+ int Len = -1;
+ int Prev = PreviousInName;
+ for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
+ StringRef Syllable(HangulSyllables[I][Column]);
+ if (int(Syllable.size()) <= Len)
+ continue;
+ std::size_t Consummed = 0;
+ char PreviousInNameCopy = PreviousInName;
+ bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
+ PreviousInNameCopy, NeedleStart);
+ if (!DoesStartWith)
+ continue;
+ Len = Consummed;
+ Pos = I;
+ Prev = PreviousInNameCopy;
+ }
+ if (Len == -1)
+ return 0;
+ PreviousInName = Prev;
+ return size_t(Len);
+}
+
+static std::optional<char32_t>
+nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
+ Buffer.clear();
+ // Hangul Syllable Decomposition
+ std::size_t Consummed = 0;
+ char NameStart = 0, NeedleStart = 0;
+ bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
+ NameStart, NeedleStart);
+ if (!DoesStartWith)
+ return std::nullopt;
+ Name = Name.substr(Consummed);
+ int L = -1, V = -1, T = -1;
+ Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
+ Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
+ Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
+ if (L != -1 && V != -1 && T != -1 && Name.empty()) {
+ if (!Strict) {
+ Buffer.append("HANGUL SYLLABLE ");
+ if (L != -1)
+ Buffer.append(HangulSyllables[L][0]);
+ if (V != -1)
+ Buffer.append(HangulSyllables[V][1]);
+ if (T != -1)
+ Buffer.append(HangulSyllables[T][2]);
+ }
+ return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
+ std::uint32_t(T);
+ }
+ // Otherwise, it's an illegal syllable name.
+ return std::nullopt;
+}
+
+struct GeneratedNamesData {
+ StringRef Prefix;
+ uint32_t Start;
+ uint32_t End;
+};
+
+// Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings
+static const GeneratedNamesData GeneratedNamesDataTable[] = {
+ {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
+ {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
+ {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
+ {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
+ {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
+ {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
+ {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
+ {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
+ {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
+ {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
+ {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
+ {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
+ {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
+ {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
+ {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
+ {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
+};
+
+static std::optional<char32_t>
+nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
+ for (auto &&Item : GeneratedNamesDataTable) {
+ Buffer.clear();
+ std::size_t Consummed = 0;
+ char NameStart = 0, NeedleStart = 0;
+ bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
+ NameStart, NeedleStart, /*isPrefix*/ true);
+ if (!DoesStartWith)
+ continue;
+ auto Number = Name.substr(Consummed);
+ unsigned long long V = 0;
+ // Be consistent about mandating upper casing.
+ if (Strict &&
+ llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
+ return {};
+ if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
+ continue;
+ if (!Strict) {
+ Buffer.append(Item.Prefix);
+ Buffer.append(utohexstr(V, true));
+ }
+ return V;
+ }
+ return std::nullopt;
+}
+
+static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
+ BufferType &Buffer) {
+ if (Name.empty())
+ return std::nullopt;
+
+ std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
+ if (!Res)
+ Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
+ if (Res)
+ return *Res;
+
+ Buffer.clear();
+ Node Node;
+ bool Matches;
+ uint32_t Value;
+ std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
+ if (Matches) {
+ std::reverse(Buffer.begin(), Buffer.end());
+ // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
+ // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
+ if (!Strict && Value == 0x116c &&
+ Name.find_insensitive("O-E") != StringRef::npos) {
+ Buffer = "HANGUL JUNGSEONG O-E";
+ Value = 0x1180;
+ }
+ return Value;
+ }
+ return std::nullopt;
+}
+
+std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
+
+ BufferType Buffer;
+ auto Opt = nameToCodepoint(Name, true, Buffer);
+ return Opt;
+}
+
+std::optional<LooseMatchingResult>
+nameToCodepointLooseMatching(StringRef Name) {
+ BufferType Buffer;
+ auto Opt = nameToCodepoint(Name, false, Buffer);
+ if (!Opt)
+ return std::nullopt;
+ return LooseMatchingResult{*Opt, Buffer};
+}
+
+// Find the unicode character whose editing distance to Pattern
+// is shortest, using the Wagner–Fischer algorithm.
+llvm::SmallVector<MatchForCodepointName>
+nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
+ // We maintain a fixed size vector of matches,
+ // sorted by distance
+ // The worst match (with the biggest distance) are discarded when new elements
+ // are added.
+ std::size_t LargestEditDistance = 0;
+ llvm::SmallVector<MatchForCodepointName> Matches;
+ Matches.reserve(MaxMatchesCount + 1);
+
+ auto Insert = [&](const Node &Node, uint32_t Distance,
+ char32_t Value) -> bool {
+ if (Distance > LargestEditDistance) {
+ if (Matches.size() == MaxMatchesCount)
+ return false;
+ LargestEditDistance = Distance;
+ }
+ // To avoid allocations, the creation of the name is delayed
+ // as much as possible.
+ std::string Name;
+ auto GetName = [&] {
+ if (Name.empty())
+ Name = Node.fullName();
+ return Name;
+ };
+
+ auto It = llvm::lower_bound(
+ Matches, Distance,
+ [&](const MatchForCodepointName &a, std::size_t Distance) {
+ if (Distance == a.Distance)
+ return a.Name < GetName();
+ return a.Distance < Distance;
+ });
+ if (It == Matches.end() && Matches.size() == MaxMatchesCount)
+ return false;
+
+ MatchForCodepointName M{GetName(), Distance, Value};
+ Matches.insert(It, std::move(M));
+ if (Matches.size() > MaxMatchesCount)
+ Matches.pop_back();
+ return true;
+ };
+
+ // We ignore case, space, hyphens, etc,
+ // in both the search pattern and the prospective names.
+ auto Normalize = [](StringRef Name) {
+ std::string Out;
+ Out.reserve(Name.size());
+ for (char C : Name) {
+ if (isAlnum(C))
+ Out.push_back(toUpper(C));
+ }
+ return Out;
+ };
+ std::string NormalizedName = Normalize(Pattern);
+
+ // Allocate a matrix big enough for longest names.
+ const std::size_t Columns =
+ std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
+ 1;
+
+ LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
+ UnicodeNameToCodepointLargestNameSize + 1;
+
+ std::vector<char> Distances(
+ Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
+
+ auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
+ assert(Column < Columns);
+ assert(Row < Rows);
+ return Distances[Row * Columns + Column];
+ };
+
+ for (std::size_t I = 0; I < Columns; I++)
+ Get(I, 0) = I;
+
+ // Visit the childrens,
+ // Filling (and overriding) the matrix for the name fragment of each node
+ // iteratively. CompleteName is used to collect the actual name of potential
+ // match, respecting case and spacing.
+ auto VisitNode = [&](const Node &N, std::size_t Row,
+ auto &VisitNode) -> void {
+ std::size_t J = 0;
+ for (; J < N.Name.size(); J++) {
+ if (!isAlnum(N.Name[J]))
+ continue;
+
+ Get(0, Row) = Row;
+
+ for (std::size_t I = 1; I < Columns; I++) {
+ const int Delete = Get(I - 1, Row) + 1;
+ const int Insert = Get(I, Row - 1) + 1;
+
+ const int Replace =
+ Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
+
+ Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
+ }
+
+ Row++;
+ }
+
+ unsigned Cost = Get(Columns - 1, Row - 1);
+ if (N.Value != 0xFFFFFFFF) {
+ Insert(N, Cost, N.Value);
+ }
+
+ if (N.hasChildren()) {
+ auto ChildOffset = N.ChildrenOffset;
+ for (;;) {
+ Node C = readNode(ChildOffset, &N);
+ ChildOffset += C.Size;
+ if (!C.isValid())
+ break;
+ VisitNode(C, Row, VisitNode);
+ if (!C.HasSibling)
+ break;
+ }
+ }
+ };
+
+ Node Root = createRoot();
+ VisitNode(Root, 1, VisitNode);
+ return Matches;
+}
+
+} // namespace unicode
+
+} // namespace sys
+} // namespace llvm