aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/unicode/set/unicode_set.h
blob: e573e051435fe31df7cb90af097171af0f0f70a0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#pragma once

#include <util/str_stl.h>
#include <util/charset/unidata.h>
#include <util/generic/algorithm.h>
#include <util/generic/ptr.h>
#include <util/generic/strbuf.h>
#include <util/generic/string.h>
#include <util/generic/utility.h>
#include <util/generic/vector.h>

class IInputStream;
class IOutputStream;

namespace NUnicode {
    // Forward declaration only: TCategoryRanges is not defined in this header.
    // It is used below purely by const reference, in TUnicodeSet::AddPredefRanges
    // and TUnicodeSet::SetPredefRanges.
    namespace NPrivate {
        struct TCategoryRanges;
    }

    // Set of unicode code points, kept as an array of wchar32 slots (Ranges / Length).
    // The slot array lives in an inline buffer, in static data, or in a reference-counted
    // dynamic buffer; see the notes on the data members.
    class TUnicodeSet {
    private:
        using TDynamicBuffer = TSimpleSharedPtr<wchar32, TDeleteArray>;

        // Ranges refers to one of three storages:
        // 1) ShortBuffer - for short sets (not more than 2 ranges)
        // 2) static data - for predefined unicode categories
        // 3) DynBuffer   - for big sets
        const wchar32* Ranges;
        wchar32 ShortBuffer[5];
        TDynamicBuffer DynBuffer; // May be shared between several sets
        size_t Length;            // Count of slots in Ranges
        size_t Capacity;          // Capacity of the buffer in use; zero means Ranges refers to static data

    private:
        // True when Ranges is the DynBuffer storage and that storage has more than one owner.
        Y_FORCE_INLINE bool IsShared() const {
            if (DynBuffer.Get() != Ranges) {
                return false;
            }
            return DynBuffer.RefCount() > 1;
        }

        // True when Capacity is zero, i.e. the set does not own a writable buffer.
        Y_FORCE_INLINE bool IsStatic() const {
            return Capacity == 0;
        }

        // Implemented out of line. Has() treats an odd result as "c is in the set".
        size_t GetRangeItem(wchar32 c, size_t from = 0) const;

        // Grows the buffer when needed; the result is a pointer to the writable slot buffer
        wchar32* EnsureCapacity(size_t capacity);

        // Gives back a writable slot pointer. When the set refers to static or shared data,
        // the slots are first moved into a buffer of its own through EnsureCapacity().
        wchar32* EnsureWritable() {
            if (IsShared()) {
                // Several sets use this buffer: zero the capacity so that IsStatic() holds
                // and the copy path below is taken
                Capacity = 0;
            }
            // Static (or formerly shared) data must be copied before it may be modified
            return IsStatic() ? EnsureCapacity(Length) : const_cast<wchar32*>(Ranges);
        }

        // The result points to the first of the inserted slots
        wchar32* InsertRangeSlots(const size_t pos, const size_t count);
        void EraseRangeSlots(const size_t pos, const size_t count);

        // Add / replace using predefined category ranges (implemented out of line)
        void AddPredefRanges(const NPrivate::TCategoryRanges& ranges);
        void SetPredefRanges(const NPrivate::TCategoryRanges& ranges);

    public:
        enum {
            CODEPOINT_HIGH = 0x110000 // Next value after maximum valid code point
        };

        TUnicodeSet();
        TUnicodeSet(const TUnicodeSet& s);
        // Set covering one character range; both "from" and "to" are inclusive
        TUnicodeSet(wchar32 from, wchar32 to);
        // Set made of every character that occurs in the given string
        TUnicodeSet(const TWtringBuf& s);
        // Set for a predefined unicode category
        TUnicodeSet(WC_TYPE c);

        // Assignment is forwarded to Set(const TUnicodeSet&)
        TUnicodeSet& operator=(const TUnicodeSet& s) {
            return Set(s);
        }

        // Two sets compare equal when they have the same number of slots and either refer
        // to the very same slot array or hold equal slot values.
        bool operator==(const TUnicodeSet& s) const {
            if (Length != s.Length) {
                return false;
            }
            if (Ranges == s.Ranges) {
                return true;
            }
            return ::Equal(Ranges, Ranges + Length, s.Ranges);
        }

        // Calls Invert() on the by-value copy of the argument and returns the outcome
        friend TUnicodeSet operator~(TUnicodeSet s) {
            return s.Invert();
        }

        // Copies s1, calls Add(s2) on the copy and returns the outcome
        friend TUnicodeSet operator+(const TUnicodeSet& s1, const TUnicodeSet& s2) {
            return TUnicodeSet(s1).Add(s2);
        }

        // Add* overloads (implemented out of line)
        TUnicodeSet& Add(const TUnicodeSet& s);
        TUnicodeSet& Add(const TWtringBuf& s);
        TUnicodeSet& Add(wchar32 c);
        // "from" and "to" are both inclusive
        TUnicodeSet& Add(wchar32 from, wchar32 to);
        TUnicodeSet& Add(WC_TYPE c);
        // Adds a unicode category given by its one- or two-letter name
        TUnicodeSet& AddCategory(const TStringBuf& catName);

        // Set* overloads (implemented out of line)
        TUnicodeSet& Set(const TUnicodeSet& s);
        // "from" and "to" are both inclusive
        TUnicodeSet& Set(wchar32 from, wchar32 to);
        TUnicodeSet& Set(const TWtringBuf& s);
        TUnicodeSet& Set(WC_TYPE c);
        TUnicodeSet& SetCategory(const TStringBuf& catName);

        TUnicodeSet& Invert();
        // Turns the existing unicode set into a case-insensitive one
        TUnicodeSet& MakeCaseInsensitive();
        TUnicodeSet& Clear();

        size_t Hash() const;
        TString ToString(bool escapeAllChars = false) const;

        // A set is valid when it has at least one slot and its last slot equals CODEPOINT_HIGH
        bool Valid() const {
            if (Length == 0) {
                return false;
            }
            return Ranges[Length - 1] == CODEPOINT_HIGH;
        }

        // Values at or above CODEPOINT_HIGH are never reported as members; for anything else
        // the answer is the lowest bit of GetRangeItem(c).
        // NOTE(review): presumably an odd slot index means "inside a range" - confirm in the implementation.
        bool Has(wchar32 c) const {
            if (Y_UNLIKELY(c >= CODEPOINT_HIGH)) {
                return false;
            }
            return (GetRangeItem(c) & 1) != 0;
        }

        // Checks Valid() through Y_ASSERT, then reports whether the set has fewer than two slots
        bool Empty() const {
            Y_ASSERT(Valid());
            return Length <= 1;
        }

        // Stream (de)serialization and parsing are implemented out of line
        void Save(IOutputStream* out) const;
        void Load(IInputStream* in);

        TUnicodeSet& Parse(const TWtringBuf& data);
    };

    // Alias for a TSimpleSharedPtr that holds a TUnicodeSet.
    using TUnicodeSetPtr = TSimpleSharedPtr<TUnicodeSet>;

}

template <>
struct THash<NUnicode::TUnicodeSet> {
    size_t operator()(const NUnicode::TUnicodeSet& s) const {
        return s.Hash();
    }
};