aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/unicode/set/unicode_set.h
diff options
context:
space:
mode:
authoramnosov <amnosov@yandex-team.com>2022-10-26 11:59:40 +0300
committeramnosov <amnosov@yandex-team.com>2022-10-26 11:59:40 +0300
commit4225eab76862f099d4d55a0205ab0cdd39c0433c (patch)
tree842ff268488999a8f54243cfb10ba96fb333645b /library/cpp/unicode/set/unicode_set.h
parent2399206380b6eab57bb7b9ad0bf0ecf851c94c1d (diff)
downloadydb-4225eab76862f099d4d55a0205ab0cdd39c0433c.tar.gz
Unicode::Is{Category}
Unicode::Is{Category} udfs added
Diffstat (limited to 'library/cpp/unicode/set/unicode_set.h')
-rw-r--r--library/cpp/unicode/set/unicode_set.h154
1 files changed, 154 insertions, 0 deletions
diff --git a/library/cpp/unicode/set/unicode_set.h b/library/cpp/unicode/set/unicode_set.h
new file mode 100644
index 0000000000..e573e05143
--- /dev/null
+++ b/library/cpp/unicode/set/unicode_set.h
@@ -0,0 +1,154 @@
+#pragma once
+
+#include <util/str_stl.h>
+#include <util/charset/unidata.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/ptr.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/utility.h>
+#include <util/generic/vector.h>
+
+class IInputStream; // Forward declaration: used in this header only as the pointer parameter of TUnicodeSet::Load
+class IOutputStream; // Forward declaration: used in this header only as the pointer parameter of TUnicodeSet::Save
+
+namespace NUnicode {
+ namespace NPrivate {
+ struct TCategoryRanges; // Forward declaration; taken by const reference in TUnicodeSet::AddPredefRanges/SetPredefRanges
+ }
+
+ class TUnicodeSet { // Set of Unicode code points, stored as an array of wchar32 "slots" (see Ranges below)
+ private:
+ typedef TSimpleSharedPtr<wchar32, TDeleteArray> TDynamicBuffer; // Shared pointer to a wchar32 array, using the TDeleteArray deleter
+
+ // Ranges is the slot array currently in use. It can point to:
+ // 1) ShortBuffer for short sets (not more than 2 ranges)
+ // 2) static data (for predefined unicode categories)
+ // 3) or DynBuffer for big sets
+ const wchar32* Ranges; // Read-only view; obtain a writable pointer through EnsureWritable()/EnsureCapacity()
+ wchar32 ShortBuffer[5]; // Inline storage; presumably 2 ranges (4 slots) plus the CODEPOINT_HIGH terminator - TODO confirm in the .cpp
+ TDynamicBuffer DynBuffer; // Can be shared between multiple sets
+ size_t Length; // Number of slots in Ranges
+ size_t Capacity; // Capacity of currently used buffer. Zero value means reference to static data
+
+ private:
+ Y_FORCE_INLINE bool IsShared() const { // True when the active buffer is DynBuffer and it has more than one owner
+ return Ranges == DynBuffer.Get() && DynBuffer.RefCount() > 1;
+ }
+
+ Y_FORCE_INLINE bool IsStatic() const { // True when Capacity is zero, i.e. Ranges refers to static data
+ return 0 == Capacity;
+ }
+
+ size_t GetRangeItem(wchar32 c, size_t from = 0) const; // NOTE(review): looks like a slot lookup for c starting at "from"; Has() treats an odd result as membership
+
+ // Extends buffer capacity if required and returns pointer to the writable buffer of slots
+ wchar32* EnsureCapacity(size_t capacity);
+
+ // Makes the copy of buffer if the unicode set points to the static or shared data, and returns pointer to the writable buffer of slots
+ wchar32* EnsureWritable() {
+ if (IsShared()) {
+ // Buffer is shared with other sets: zero Capacity so that the IsStatic() branch below makes the copy
+ Capacity = 0;
+ }
+ if (IsStatic()) {
+ // Copy static or shared data to own buffer before modifying (order matters: this must follow the IsShared() check)
+ return EnsureCapacity(Length);
+ }
+ return const_cast<wchar32*>(Ranges); // Neither static nor shared: the current buffer can be written in place
+ }
+
+ // Returns pointer to the first inserted slot
+ wchar32* InsertRangeSlots(const size_t pos, const size_t count);
+ void EraseRangeSlots(const size_t pos, const size_t count); // NOTE(review): by name, removes "count" slots starting at "pos"; definition is not in this header
+
+ void AddPredefRanges(const NPrivate::TCategoryRanges& ranges); // NOTE(review): presumably the category-table counterpart of Add() - confirm in the .cpp
+ void SetPredefRanges(const NPrivate::TCategoryRanges& ranges); // NOTE(review): presumably the category-table counterpart of Set() - confirm in the .cpp
+
+ public:
+ enum {
+ CODEPOINT_HIGH = 0x110000 // Next value after maximum valid code point
+ };
+
+ TUnicodeSet(); // NOTE(review): presumably constructs an empty set; definition is outside this header
+ TUnicodeSet(const TUnicodeSet& s);
+ // Unicode set for specific character range. "from", "to" are inclusive
+ TUnicodeSet(wchar32 from, wchar32 to);
+ // Unicode set consists of all characters from the specified string
+ TUnicodeSet(const TWtringBuf& s);
+ // Unicode set for predefined category
+ TUnicodeSet(WC_TYPE c);
+
+ TUnicodeSet& operator=(const TUnicodeSet& s) { // Copy assignment simply delegates to Set(s)
+ return Set(s);
+ }
+
+ inline bool operator==(const TUnicodeSet& s) const { // Equal when slot counts match and the slot arrays are the same pointer or compare element-wise equal
+ return Length == s.Length && (Ranges == s.Ranges || ::Equal(Ranges, Ranges + Length, s.Ranges));
+ }
+
+ friend inline TUnicodeSet operator~(TUnicodeSet s) { // s is taken by value, so the caller's set is left untouched and the inverted copy is returned
+ return s.Invert();
+ }
+
+ friend inline TUnicodeSet operator+(const TUnicodeSet& s1, const TUnicodeSet& s2) { // Returns a copy of s1 with s2 added; neither operand is modified
+ return TUnicodeSet(s1).Add(s2);
+ }
+
+ TUnicodeSet& Add(const TUnicodeSet& s); // Add/Set/Invert/... return TUnicodeSet& (presumably *this, as operator+ relies on - TODO confirm)
+ TUnicodeSet& Add(const TWtringBuf& s);
+ TUnicodeSet& Add(wchar32 c);
+ // from, to - inclusive
+ TUnicodeSet& Add(wchar32 from, wchar32 to);
+ TUnicodeSet& Add(WC_TYPE c);
+ // Add unicode category by name (one- or two-letter)
+ TUnicodeSet& AddCategory(const TStringBuf& catName);
+
+ TUnicodeSet& Set(const TUnicodeSet& s); // Also used to implement operator=
+ // from, to - inclusive
+ TUnicodeSet& Set(wchar32 from, wchar32 to);
+ TUnicodeSet& Set(const TWtringBuf& s);
+ TUnicodeSet& Set(WC_TYPE c);
+ TUnicodeSet& SetCategory(const TStringBuf& catName); // NOTE(review): presumably accepts the same category names as AddCategory - confirm
+
+ TUnicodeSet& Invert(); // Used by operator~ above
+ // Converts existing unicode set to the case-insensitive set
+ TUnicodeSet& MakeCaseInsensitive();
+ TUnicodeSet& Clear();
+
+ size_t Hash() const; // Value returned by the THash<NUnicode::TUnicodeSet> specialization at the end of this header
+ TString ToString(bool escapeAllChars = false) const;
+
+ inline bool Valid() const { // A valid set has at least one slot and its last slot equals CODEPOINT_HIGH
+ return Length > 0 && Ranges[Length - 1] == CODEPOINT_HIGH;
+ }
+
+ inline bool Has(wchar32 c) const { // Membership test; values at or above CODEPOINT_HIGH are never members
+ if (Y_UNLIKELY(c >= CODEPOINT_HIGH)) {
+ return false;
+ }
+ const size_t item = GetRangeItem(c);
+ return (item & 1); // Membership is encoded in the parity of the GetRangeItem() result: odd means "in the set"
+ }
+
+ inline bool Empty() const { // A set with fewer than 2 slots is empty; Valid() is asserted first
+ Y_ASSERT(Valid());
+ return Length < 2;
+ }
+
+ void Save(IOutputStream* out) const; // Serialization counterpart of Load(); the format is defined in the implementation file
+ void Load(IInputStream* in);
+
+ TUnicodeSet& Parse(const TWtringBuf& data); // NOTE(review): presumably parses a textual set description; the syntax is not visible in this header
+ };
+
+ using TUnicodeSetPtr = TSimpleSharedPtr<TUnicodeSet>; // Shared-pointer alias for passing TUnicodeSet instances around
+
+}
+
+template <>
+struct THash<NUnicode::TUnicodeSet> {
+ // Hash functor specialization for NUnicode::TUnicodeSet.
+ // It adds no mixing of its own: the result is exactly what the set's Hash() member reports.
+ size_t operator()(const NUnicode::TUnicodeSet& unicodeSet) const { return unicodeSet.Hash(); }
+};