diff options
author | vitalyisaev <vitalyisaev@yandex-team.com> | 2023-06-29 10:00:50 +0300 |
---|---|---|
committer | vitalyisaev <vitalyisaev@yandex-team.com> | 2023-06-29 10:00:50 +0300 |
commit | 6ffe9e53658409f212834330e13564e4952558f6 (patch) | |
tree | 85b1e00183517648b228aafa7c8fb07f5276f419 /contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp | |
parent | 726057070f9c5a91fc10fde0d5024913d10f1ab9 (diff) | |
download | ydb-6ffe9e53658409f212834330e13564e4952558f6.tar.gz |
YQ Connector: support managed ClickHouse
Со стороны dqrun можно обратиться к инстансу коннектора, который работает на streaming стенде, и извлечь данные из облачного CH.
Diffstat (limited to 'contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp')
-rw-r--r-- | contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp | 550 |
1 files changed, 550 insertions, 0 deletions
diff --git a/contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp b/contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp new file mode 100644 index 00000000000..accebf1098a --- /dev/null +++ b/contrib/libs/llvm16/lib/Support/UnicodeNameToCodepoint.cpp @@ -0,0 +1,550 @@ +//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties +//-*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements functions to map the name or alias of a unicode +// character to its codepoint. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Unicode.h" + +namespace llvm { +namespace sys { +namespace unicode { + +extern const char *UnicodeNameToCodepointDict; +extern const uint8_t *UnicodeNameToCodepointIndex; +extern const std::size_t UnicodeNameToCodepointIndexSize; +extern const std::size_t UnicodeNameToCodepointLargestNameSize; + +using BufferType = SmallString<64>; + +struct Node { + bool IsRoot = false; + char32_t Value = 0xFFFFFFFF; + uint32_t ChildrenOffset = 0; + bool HasSibling = false; + uint32_t Size = 0; + StringRef Name; + const Node *Parent = nullptr; + + constexpr bool isValid() const { + return !Name.empty() || Value == 0xFFFFFFFF; + } + constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; } + + std::string fullName() const { + std::string S; + // Reserve enough space for most unicode code points. + // The chosen value represent the 99th percentile of name size as of + // Unicode 15.0. + S.reserve(46); + const Node *N = this; + while (N) { + std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S)); + N = N->Parent; + } + std::reverse(S.begin(), S.end()); + return S; + } +}; + +static Node createRoot() { + Node N; + N.IsRoot = true; + N.ChildrenOffset = 1; + N.Size = 1; + return N; +} + +static Node readNode(uint32_t Offset, const Node *Parent = nullptr) { + if (Offset == 0) + return createRoot(); + + uint32_t Origin = Offset; + Node N; + N.Parent = Parent; + uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++]; + if (Offset + 6 >= UnicodeNameToCodepointIndexSize) + return N; + + bool LongName = NameInfo & 0x40; + bool HasValue = NameInfo & 0x80; + std::size_t Size = NameInfo & ~0xC0; + if (LongName) { + uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8); + NameOffset |= UnicodeNameToCodepointIndex[Offset++]; + N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size); + } else { + N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1); + } + if (HasValue) { + uint8_t H = UnicodeNameToCodepointIndex[Offset++]; + uint8_t M = UnicodeNameToCodepointIndex[Offset++]; + uint8_t L = UnicodeNameToCodepointIndex[Offset++]; + N.Value = ((H << 16) | (M << 8) | L) >> 3; + + bool HasChildren = L & 0x02; + N.HasSibling = L & 0x01; + + if (HasChildren) { + N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16; + N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8; + N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; + } + } else { + uint8_t H = UnicodeNameToCodepointIndex[Offset++]; + N.HasSibling = H & 0x80; + bool HasChildren = H & 0x40; + H &= uint8_t(~0xC0); + if (HasChildren) { + N.ChildrenOffset = (H << 16); + N.ChildrenOffset |= + (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8); + N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; + } + } + N.Size = Offset - Origin; + return N; +} + +static bool startsWith(StringRef Name, StringRef Needle, bool Strict, + std::size_t &Consummed, char &PreviousCharInName, + char &PreviousCharInNeedle, bool IsPrefix = false) { + + Consummed = 0; + if (Strict) { + if (!Name.startswith(Needle)) + return false; + Consummed = Needle.size(); + return true; + } + if (Needle.empty()) + return true; + + auto NamePos = Name.begin(); + auto NeedlePos = Needle.begin(); + + char PreviousCharInNameOrigin = PreviousCharInName; + char PreviousCharInNeedleOrigin = PreviousCharInNeedle; + + auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar, + bool IgnoreEnd = false) { + while (It != End) { + const auto Next = std::next(It); + // Ignore spaces, underscore, medial hyphens + // https://unicode.org/reports/tr44/#UAX44-LM2. + bool Ignore = + *It == ' ' || *It == '_' || + (*It == '-' && isAlnum(PreviousChar) && + ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd))); + PreviousChar = *It; + if (!Ignore) + break; + ++It; + } + return It; + }; + + while (true) { + NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName); + NeedlePos = + IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix); + if (NeedlePos == Needle.end()) + break; + if (NamePos == Name.end()) + break; + if (toUpper(*NeedlePos) != toUpper(*NamePos)) + break; + NeedlePos++; + NamePos++; + } + Consummed = std::distance(Name.begin(), NamePos); + if (NeedlePos != Needle.end()) { + PreviousCharInName = PreviousCharInNameOrigin; + PreviousCharInNeedle = PreviousCharInNeedleOrigin; + } + return NeedlePos == Needle.end(); +} + +static std::tuple<Node, bool, uint32_t> +compareNode(uint32_t Offset, StringRef Name, bool Strict, + char PreviousCharInName, char PreviousCharInNeedle, + BufferType &Buffer, const Node *Parent = nullptr) { + Node N = readNode(Offset, Parent); + std::size_t Consummed = 0; + bool DoesStartWith = + N.IsRoot || startsWith(Name, N.Name, Strict, Consummed, + PreviousCharInName, PreviousCharInNeedle); + if (!DoesStartWith) + return std::make_tuple(N, false, 0); + + if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF) + return std::make_tuple(N, true, N.Value); + + if (N.hasChildren()) { + uint32_t ChildOffset = N.ChildrenOffset; + for (;;) { + Node C; + bool Matches; + uint32_t Value; + std::tie(C, Matches, Value) = + compareNode(ChildOffset, Name.substr(Consummed), Strict, + PreviousCharInName, PreviousCharInNeedle, Buffer, &N); + if (Matches) { + std::reverse_copy(C.Name.begin(), C.Name.end(), + std::back_inserter(Buffer)); + return std::make_tuple(N, true, Value); + } + ChildOffset += C.Size; + if (!C.HasSibling) + break; + } + } + return std::make_tuple(N, false, 0); +} + +static std::tuple<Node, bool, uint32_t> +compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) { + return compareNode(Offset, Name, Strict, 0, 0, Buffer); +} + +// clang-format off +constexpr const char *const HangulSyllables[][3] = { + { "G", "A", "" }, + { "GG", "AE", "G" }, + { "N", "YA", "GG" }, + { "D", "YAE", "GS" }, + { "DD", "EO", "N", }, + { "R", "E", "NJ" }, + { "M", "YEO", "NH" }, + { "B", "YE", "D" }, + { "BB", "O", "L" }, + { "S", "WA", "LG" }, + { "SS", "WAE", "LM" }, + { "", "OE", "LB" }, + { "J", "YO", "LS" }, + { "JJ", "U", "LT" }, + { "C", "WEO", "LP" }, + { "K", "WE", "LH" }, + { "T", "WI", "M" }, + { "P", "YU", "B" }, + { "H", "EU", "BS" }, + { 0, "YI", "S" }, + { 0, "I", "SS" }, + { 0, 0, "NG" }, + { 0, 0, "J" }, + { 0, 0, "C" }, + { 0, 0, "K" }, + { 0, 0, "T" }, + { 0, 0, "P" }, + { 0, 0, "H" } + }; +// clang-format on + +// Unicode 15.0 +// 3.12 Conjoining Jamo Behavior Common constants +constexpr const char32_t SBase = 0xAC00; +constexpr const uint32_t LCount = 19; +constexpr const uint32_t VCount = 21; +constexpr const uint32_t TCount = 28; + +static std::size_t findSyllable(StringRef Name, bool Strict, + char &PreviousInName, int &Pos, int Column) { + assert(Column == 0 || Column == 1 || Column == 2); + static std::size_t CountPerColumn[] = {LCount, VCount, TCount}; + char NeedleStart = 0; + int Len = -1; + int Prev = PreviousInName; + for (std::size_t I = 0; I < CountPerColumn[Column]; I++) { + StringRef Syllable(HangulSyllables[I][Column]); + if (int(Syllable.size()) <= Len) + continue; + std::size_t Consummed = 0; + char PreviousInNameCopy = PreviousInName; + bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed, + PreviousInNameCopy, NeedleStart); + if (!DoesStartWith) + continue; + Len = Consummed; + Pos = I; + Prev = PreviousInNameCopy; + } + if (Len == -1) + return 0; + PreviousInName = Prev; + return size_t(Len); +} + +static std::optional<char32_t> +nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { + Buffer.clear(); + // Hangul Syllable Decomposition + std::size_t Consummed = 0; + char NameStart = 0, NeedleStart = 0; + bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, + NameStart, NeedleStart); + if (!DoesStartWith) + return std::nullopt; + Name = Name.substr(Consummed); + int L = -1, V = -1, T = -1; + Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0)); + Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1)); + Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2)); + if (L != -1 && V != -1 && T != -1 && Name.empty()) { + if (!Strict) { + Buffer.append("HANGUL SYLLABLE "); + if (L != -1) + Buffer.append(HangulSyllables[L][0]); + if (V != -1) + Buffer.append(HangulSyllables[V][1]); + if (T != -1) + Buffer.append(HangulSyllables[T][2]); + } + return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount + + std::uint32_t(T); + } + // Otherwise, it's an illegal syllable name. + return std::nullopt; +} + +struct GeneratedNamesData { + StringRef Prefix; + uint32_t Start; + uint32_t End; +}; + +// Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings +static const GeneratedNamesData GeneratedNamesDataTable[] = { + {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF}, + {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF}, + {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF}, + {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739}, + {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D}, + {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1}, + {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0}, + {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A}, + {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF}, + {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7}, + {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08}, + {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5}, + {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB}, + {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D}, + {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9}, + {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D}, +}; + +static std::optional<char32_t> +nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { + for (auto &&Item : GeneratedNamesDataTable) { + Buffer.clear(); + std::size_t Consummed = 0; + char NameStart = 0, NeedleStart = 0; + bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed, + NameStart, NeedleStart, /*isPrefix*/ true); + if (!DoesStartWith) + continue; + auto Number = Name.substr(Consummed); + unsigned long long V = 0; + // Be consistent about mandating upper casing. + if (Strict && + llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })) + return {}; + if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End) + continue; + if (!Strict) { + Buffer.append(Item.Prefix); + Buffer.append(utohexstr(V, true)); + } + return V; + } + return std::nullopt; +} + +static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict, + BufferType &Buffer) { + if (Name.empty()) + return std::nullopt; + + std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer); + if (!Res) + Res = nameToGeneratedCodePoint(Name, Strict, Buffer); + if (Res) + return *Res; + + Buffer.clear(); + Node Node; + bool Matches; + uint32_t Value; + std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer); + if (Matches) { + std::reverse(Buffer.begin(), Buffer.end()); + // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial + // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. + if (!Strict && Value == 0x116c && + Name.find_insensitive("O-E") != StringRef::npos) { + Buffer = "HANGUL JUNGSEONG O-E"; + Value = 0x1180; + } + return Value; + } + return std::nullopt; +} + +std::optional<char32_t> nameToCodepointStrict(StringRef Name) { + + BufferType Buffer; + auto Opt = nameToCodepoint(Name, true, Buffer); + return Opt; +} + +std::optional<LooseMatchingResult> +nameToCodepointLooseMatching(StringRef Name) { + BufferType Buffer; + auto Opt = nameToCodepoint(Name, false, Buffer); + if (!Opt) + return std::nullopt; + return LooseMatchingResult{*Opt, Buffer}; +} + +// Find the unicode character whose editing distance to Pattern +// is shortest, using the Wagner–Fischer algorithm. +llvm::SmallVector<MatchForCodepointName> +nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { + // We maintain a fixed size vector of matches, + // sorted by distance + // The worst match (with the biggest distance) are discarded when new elements + // are added. + std::size_t LargestEditDistance = 0; + llvm::SmallVector<MatchForCodepointName> Matches; + Matches.reserve(MaxMatchesCount + 1); + + auto Insert = [&](const Node &Node, uint32_t Distance, + char32_t Value) -> bool { + if (Distance > LargestEditDistance) { + if (Matches.size() == MaxMatchesCount) + return false; + LargestEditDistance = Distance; + } + // To avoid allocations, the creation of the name is delayed + // as much as possible. + std::string Name; + auto GetName = [&] { + if (Name.empty()) + Name = Node.fullName(); + return Name; + }; + + auto It = llvm::lower_bound( + Matches, Distance, + [&](const MatchForCodepointName &a, std::size_t Distance) { + if (Distance == a.Distance) + return a.Name < GetName(); + return a.Distance < Distance; + }); + if (It == Matches.end() && Matches.size() == MaxMatchesCount) + return false; + + MatchForCodepointName M{GetName(), Distance, Value}; + Matches.insert(It, std::move(M)); + if (Matches.size() > MaxMatchesCount) + Matches.pop_back(); + return true; + }; + + // We ignore case, space, hyphens, etc, + // in both the search pattern and the prospective names. + auto Normalize = [](StringRef Name) { + std::string Out; + Out.reserve(Name.size()); + for (char C : Name) { + if (isAlnum(C)) + Out.push_back(toUpper(C)); + } + return Out; + }; + std::string NormalizedName = Normalize(Pattern); + + // Allocate a matrix big enough for longest names. + const std::size_t Columns = + std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + + 1; + + LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = + UnicodeNameToCodepointLargestNameSize + 1; + + std::vector<char> Distances( + Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0); + + auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & { + assert(Column < Columns); + assert(Row < Rows); + return Distances[Row * Columns + Column]; + }; + + for (std::size_t I = 0; I < Columns; I++) + Get(I, 0) = I; + + // Visit the childrens, + // Filling (and overriding) the matrix for the name fragment of each node + // iteratively. CompleteName is used to collect the actual name of potential + // match, respecting case and spacing. + auto VisitNode = [&](const Node &N, std::size_t Row, + auto &VisitNode) -> void { + std::size_t J = 0; + for (; J < N.Name.size(); J++) { + if (!isAlnum(N.Name[J])) + continue; + + Get(0, Row) = Row; + + for (std::size_t I = 1; I < Columns; I++) { + const int Delete = Get(I - 1, Row) + 1; + const int Insert = Get(I, Row - 1) + 1; + + const int Replace = + Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0); + + Get(I, Row) = std::min(Insert, std::min(Delete, Replace)); + } + + Row++; + } + + unsigned Cost = Get(Columns - 1, Row - 1); + if (N.Value != 0xFFFFFFFF) { + Insert(N, Cost, N.Value); + } + + if (N.hasChildren()) { + auto ChildOffset = N.ChildrenOffset; + for (;;) { + Node C = readNode(ChildOffset, &N); + ChildOffset += C.Size; + if (!C.isValid()) + break; + VisitNode(C, Row, VisitNode); + if (!C.HasSibling) + break; + } + } + }; + + Node Root = createRoot(); + VisitNode(Root, 1, VisitNode); + return Matches; +} + +} // namespace unicode + +} // namespace sys +} // namespace llvm |