blob: e573e051435fe31df7cb90af097171af0f0f70a0 (
plain) (
tree)
|
|
#pragma once
#include <util/str_stl.h>
#include <util/charset/unidata.h>
#include <util/generic/algorithm.h>
#include <util/generic/ptr.h>
#include <util/generic/strbuf.h>
#include <util/generic/string.h>
#include <util/generic/utility.h>
#include <util/generic/vector.h>
class IInputStream;
class IOutputStream;
namespace NUnicode {
namespace NPrivate {
struct TCategoryRanges;
}
class TUnicodeSet {
private:
typedef TSimpleSharedPtr<wchar32, TDeleteArray> TDynamicBuffer;
// Ranges can point to:
// 1) ShortBuffer for short sets (not more than 2 ranges)
// 2) static data (for predefined unicode categories)
// 3) or DynBuffer for big sets
const wchar32* Ranges;
wchar32 ShortBuffer[5];
TDynamicBuffer DynBuffer; // Can be shared between multiple sets
size_t Length; // Number of slots in Ranges
size_t Capacity; // Capacity of currently used buffer. Zero value means reference to static data
private:
Y_FORCE_INLINE bool IsShared() const {
return Ranges == DynBuffer.Get() && DynBuffer.RefCount() > 1;
}
Y_FORCE_INLINE bool IsStatic() const {
return 0 == Capacity;
}
size_t GetRangeItem(wchar32 c, size_t from = 0) const;
// Extends buffer capacity if required and returns pointer to the writable buffer of slots
wchar32* EnsureCapacity(size_t capacity);
// Makes the copy of buffer if the unicode set points to the static or shared data, and returns pointer to the writable buffer of slots
wchar32* EnsureWritable() {
if (IsShared()) {
// If multiple UnicodeSets refer to the same buffer then make the copy
Capacity = 0;
}
if (IsStatic()) {
// Copy static or shared data to own buffer before modifying
return EnsureCapacity(Length);
}
return const_cast<wchar32*>(Ranges);
}
// Returns pointer to the first inserted slot
wchar32* InsertRangeSlots(const size_t pos, const size_t count);
void EraseRangeSlots(const size_t pos, const size_t count);
void AddPredefRanges(const NPrivate::TCategoryRanges& ranges);
void SetPredefRanges(const NPrivate::TCategoryRanges& ranges);
public:
enum {
CODEPOINT_HIGH = 0x110000 // Next value after maximum valid code point
};
TUnicodeSet();
TUnicodeSet(const TUnicodeSet& s);
// Unicode set for specific character range. "from", "to" are inclusive
TUnicodeSet(wchar32 from, wchar32 to);
// Unicode set consists of all characters from the specified string
TUnicodeSet(const TWtringBuf& s);
// Unicode set for predefined category
TUnicodeSet(WC_TYPE c);
TUnicodeSet& operator=(const TUnicodeSet& s) {
return Set(s);
}
inline bool operator==(const TUnicodeSet& s) const {
return Length == s.Length && (Ranges == s.Ranges || ::Equal(Ranges, Ranges + Length, s.Ranges));
}
friend inline TUnicodeSet operator~(TUnicodeSet s) {
return s.Invert();
}
friend inline TUnicodeSet operator+(const TUnicodeSet& s1, const TUnicodeSet& s2) {
return TUnicodeSet(s1).Add(s2);
}
TUnicodeSet& Add(const TUnicodeSet& s);
TUnicodeSet& Add(const TWtringBuf& s);
TUnicodeSet& Add(wchar32 c);
// from, to - inclusive
TUnicodeSet& Add(wchar32 from, wchar32 to);
TUnicodeSet& Add(WC_TYPE c);
// Add unicode category by name (one- or two-letter)
TUnicodeSet& AddCategory(const TStringBuf& catName);
TUnicodeSet& Set(const TUnicodeSet& s);
// from, to - inclusive
TUnicodeSet& Set(wchar32 from, wchar32 to);
TUnicodeSet& Set(const TWtringBuf& s);
TUnicodeSet& Set(WC_TYPE c);
TUnicodeSet& SetCategory(const TStringBuf& catName);
TUnicodeSet& Invert();
// Converts existing unicode set to the case-insensitive set
TUnicodeSet& MakeCaseInsensitive();
TUnicodeSet& Clear();
size_t Hash() const;
TString ToString(bool escapeAllChars = false) const;
inline bool Valid() const {
return Length > 0 && Ranges[Length - 1] == CODEPOINT_HIGH;
}
inline bool Has(wchar32 c) const {
if (Y_UNLIKELY(c >= CODEPOINT_HIGH)) {
return false;
}
const size_t item = GetRangeItem(c);
return (item & 1);
}
inline bool Empty() const {
Y_ASSERT(Valid());
return Length < 2;
}
void Save(IOutputStream* out) const;
void Load(IInputStream* in);
TUnicodeSet& Parse(const TWtringBuf& data);
};
using TUnicodeSetPtr = TSimpleSharedPtr<TUnicodeSet>;
}
template <>
struct THash<NUnicode::TUnicodeSet> {
size_t operator()(const NUnicode::TUnicodeSet& s) const {
return s.Hash();
}
};
|