#include "unicode_set.h"
#include "category_ranges.h"
#include "unicode_set_parser.h"
#include <util/ysaveload.h>
#include <util/charset/wide.h>
#include <util/digest/numeric.h>
#include <util/generic/buffer.h>
#include <util/generic/yexception.h>
#include <util/stream/format.h>
#include <util/stream/input.h>
#include <util/stream/output.h>
#include <util/string/cast.h>
// The original idea of unicode set implementation was taken from the icu::UnicodeSet.
// UnicodeSet has a set of ranges [from, to), where upper boundary is exclusive.
// The list of ranges always has a terminal value CODEPOINT_HIGH at the end.
namespace NUnicode {
namespace NPrivate {
inline wchar32 Bound(wchar32 c) {
return c < TUnicodeSet::CODEPOINT_HIGH ? c : TUnicodeSet::CODEPOINT_HIGH - 1;
}
inline void CheckWcType(WC_TYPE c) {
if (static_cast<size_t>(c) >= CCL_NUM) {
throw yexception() << "Category ID must be less than CCL_NUM (" << static_cast<size_t>(CCL_NUM) << "), specified: " << static_cast<size_t>(c);
}
}
}
// Returns the smallest value i >= from such that 'c' < Ranges[i].
// Some examples:
// GetRangeItem(c, 0)
// set Ranges[] c=0 1 3 4 7 8
// === ============== ===========
// [] [0x110000] 0 0 0 0 0 0
// [:Any:] [0, 0x110000] 1 1 1 1 1 1
// [\u0000-\u0003] [0, 4, 0x110000] 1 1 1 2 2 2
// [\u0004-\u0007] [4, 8, 0x110000] 0 0 0 1 1 2
//
// So, if method returns an odd value then 'c' falls to the {Range[i-1],Range[i]} range.
size_t TUnicodeSet::GetRangeItem(wchar32 c, size_t from) const {
Y_ASSERT(Valid());
Y_ASSERT(from < Length);
if (c < Ranges[from])
return from;
size_t lo = from;
size_t hi = Length - 1;
if (lo >= hi || c >= Ranges[hi - 1]) {
return hi;
}
for (;;) {
size_t i = (lo + hi) >> 1;
if (i == lo) {
break;
} else if (c < Ranges[i]) {
hi = i;
} else {
lo = i;
}
}
return hi;
}
wchar32* TUnicodeSet::EnsureCapacity(size_t capacity) {
if (capacity <= Capacity) {
return const_cast<wchar32*>(Ranges);
}
TDynamicBuffer buf = new wchar32[capacity];
Copy<const wchar32*, wchar32*>(Ranges, Ranges + Length, buf.Get());
DoSwap(buf, DynBuffer);
Ranges = DynBuffer.Get();
Capacity = capacity;
return DynBuffer.Get();
}
wchar32* TUnicodeSet::InsertRangeSlots(const size_t pos, const size_t count) {
Y_ASSERT(pos < Length);
wchar32* src = EnsureCapacity(Length + count) + Length - 1;
wchar32* dst = src + count;
for (size_t i = 0; i < Length - pos; ++i) {
*dst-- = *src--;
}
Length += count;
return src + 1;
}
void TUnicodeSet::EraseRangeSlots(const size_t pos, const size_t count) {
Y_ASSERT(pos < Length);
Y_ASSERT(pos + count <= Length);
wchar32* dst = EnsureWritable() + pos;
wchar32* src = dst + count;
for (size_t i = 0; i < Length - pos - count; ++i) {
*dst++ = *src++;
}
Length -= count;
}
TUnicodeSet::TUnicodeSet()
: Ranges(ShortBuffer)
, Length(0)
, Capacity(Y_ARRAY_SIZE(ShortBuffer))
{
Clear();
}
TUnicodeSet::TUnicodeSet(const TUnicodeSet& s)
: Ranges(ShortBuffer)
, Length(0)
, Capacity(Y_ARRAY_SIZE(ShortBuffer))
{
Set(s);
}
// from, to - inclusive
TUnicodeSet::TUnicodeSet(wchar32 from, wchar32 to)
: Ranges(ShortBuffer)
, Length(0)
, Capacity(Y_ARRAY_SIZE(ShortBuffer))
{
Set(from, to);
}
TUnicodeSet::TUnicodeSet(const TWtringBuf& s)
: Ranges(ShortBuffer)
, Length(0)
, Capacity(Y_ARRAY_SIZE(ShortBuffer))
{
Set(s);
}
TUnicodeSet::TUnicodeSet(WC_TYPE c)
: Ranges(ShortBuffer)
, Length(0)
, Capacity(Y_ARRAY_SIZE(ShortBuffer))
{
Set(c);
}
void TUnicodeSet::AddPredefRanges(const NPrivate::TCategoryRanges& ranges) {
if (ranges.Count > 0) {
for (size_t i = 0; i + 1 < ranges.Count; i += 2) {
Add(ranges.Data[i], ranges.Data[i + 1] - 1);
}
}
}
TUnicodeSet& TUnicodeSet::Add(const TUnicodeSet& s) {
if (Empty()) {
TUnicodeSet::operator=(s);
return *this;
}
for (size_t i = 0; i + 1 < s.Length; i += 2) {
Add(s.Ranges[i], s.Ranges[i + 1] - 1);
}
return *this;
}
TUnicodeSet& TUnicodeSet::Add(const TWtringBuf& s) {
const wchar16* begin = s.data();
const wchar16* end = s.data() + s.size();
while (begin < end) {
Add(ReadSymbolAndAdvance(begin, end));
}
return *this;
}
TUnicodeSet& TUnicodeSet::Add(wchar32 c) {
c = NPrivate::Bound(c);
const size_t i = GetRangeItem(c);
if (i & 1) {
return *this;
}
if (c == Ranges[i] - 1) { // The char adjoins with the next range
if (i > 0 && Ranges[i - 1] == c) { // The char adjoins with the previous range too
if (i + 1 == Length) { // Don't delete the last TERMINAL
EraseRangeSlots(i - 1, 1);
} else {
EraseRangeSlots(i - 1, 2); // Collapse ranges
}
} else {
EnsureWritable()[i] = c;
}
} else if (i > 0 && Ranges[i - 1] == c) {
++(EnsureWritable()[i - 1]);
} else {
wchar32* target = InsertRangeSlots(i, 2);
*target++ = c;
*target = c + 1;
}
Y_ASSERT(Valid());
return *this;
}
TUnicodeSet& TUnicodeSet::Add(wchar32 from, wchar32 to) {
from = NPrivate::Bound(from);
to = NPrivate::Bound(to);
Y_ASSERT(from <= to);
if (to == from) {
return Add(to);
} else if (from > to) {
return *this;
}
size_t i = GetRangeItem(from);
if (to < Ranges[i]) {
if (i & 1) {
return *this;
}
if (i > 0 && Ranges[i - 1] == from) {
if (Ranges[i] == to + 1) {
if (i + 1 == Length) {
EraseRangeSlots(i - 1, 1);
} else {
EraseRangeSlots(i - 1, 2);
}
} else {
EnsureWritable()[i - 1] = to + 1;
}
} else if (Ranges[i] == to + 1) {
if (i + 1 == Length) {
*InsertRangeSlots(i, 1) = from;
} else {
EnsureWritable()[i] = from;
}
} else {
wchar32* target = InsertRangeSlots(i, 2);
*target++ = from;
*target = to + 1;
}
Y_ASSERT(Valid());
return *this;
}
size_t j = GetRangeItem(to, i);
Y_ASSERT(i < j);
if (0 == (j & 1)) { // 'to' falls between ranges
if (Ranges[j] > to + 1) {
*InsertRangeSlots(j, 1) = to + 1;
} else if (j + 1 < Length) { // Exclude last TERMINAL element
Y_ASSERT(Ranges[j] == to + 1);
// The next range adjoins with the current one. Join them
++j;
}
}
if (0 == (i & 1)) { // 'from' falls between ranges
if (i > 0 && Ranges[i - 1] == from) {
--i;
} else {
*InsertRangeSlots(i, 1) = from;
++i;
++j;
}
}
// Erase ranges, which are covered by the new one
Y_ASSERT(i <= j);
Y_ASSERT(i <= Length);
Y_ASSERT(j <= Length);
EraseRangeSlots(i, j - i);
Y_ASSERT(Valid());
return *this;
}
TUnicodeSet& TUnicodeSet::Add(WC_TYPE c) {
NPrivate::CheckWcType(c);
if (Empty()) {
return Set(c);
}
AddPredefRanges(NPrivate::GetCategoryRanges(c));
return *this;
}
TUnicodeSet& TUnicodeSet::AddCategory(const TStringBuf& catName) {
if (Empty()) {
return SetCategory(catName);
}
AddPredefRanges(NPrivate::GetCategoryRanges(catName));
return *this;
}
void TUnicodeSet::SetPredefRanges(const NPrivate::TCategoryRanges& ranges) {
Clear();
if (ranges.Count > 0) {
DynBuffer.Drop();
Ranges = ranges.Data;
Length = ranges.Count;
Capacity = 0;
}
}
TUnicodeSet& TUnicodeSet::Set(const TUnicodeSet& s) {
if (0 == s.Capacity) {
DynBuffer.Drop();
Ranges = s.Ranges;
Length = s.Length;
Capacity = 0;
} else if (s.Ranges == s.DynBuffer.Get()) {
DynBuffer = s.DynBuffer;
Ranges = DynBuffer.Get();
Length = s.Length;
Capacity = s.Capacity;
} else {
::Copy(s.Ranges, s.Ranges + s.Length, EnsureCapacity(s.Length));
Length = s.Length;
}
return *this;
}
TUnicodeSet& TUnicodeSet::Set(wchar32 from, wchar32 to) {
from = NPrivate::Bound(from);
to = NPrivate::Bound(to);
Y_ASSERT(from <= to);
Clear();
if (to == from) {
return Add(to);
} else if (from > to) {
return *this;
}
if (to + 1 != CODEPOINT_HIGH) {
wchar32* target = InsertRangeSlots(0, 2);
*target++ = from;
*target = to + 1;
} else {
*InsertRangeSlots(0, 1) = from;
}
Y_ASSERT(Valid());
return *this;
}
TUnicodeSet& TUnicodeSet::Set(const TWtringBuf& s) {
Clear();
return Add(s);
}
TUnicodeSet& TUnicodeSet::Set(WC_TYPE c) {
NPrivate::CheckWcType(c);
SetPredefRanges(NPrivate::GetCategoryRanges(c));
return *this;
}
TUnicodeSet& TUnicodeSet::SetCategory(const TStringBuf& catName) {
SetPredefRanges(NPrivate::GetCategoryRanges(catName));
return *this;
}
TUnicodeSet& TUnicodeSet::Invert() {
Y_ASSERT(Valid());
if (0 == Ranges[0]) {
EraseRangeSlots(0, 1);
} else {
*InsertRangeSlots(0, 1) = 0;
}
return *this;
}
TUnicodeSet& TUnicodeSet::MakeCaseInsensitive() {
TVector<wchar32> oldRanges(Ranges, Ranges + Length);
for (size_t i = 0; i + 1 < oldRanges.size(); i += 2) {
for (wchar32 c = oldRanges[i]; c < oldRanges[i + 1]; ++c) {
const ::NUnicode::NPrivate::TProperty& p = ::NUnicode::NPrivate::CharProperty(c);
if (p.Lower) {
Add(static_cast<wchar32>(c + p.Lower));
}
if (p.Upper) {
Add(static_cast<wchar32>(c + p.Upper));
}
if (p.Title) {
Add(static_cast<wchar32>(c + p.Title));
}
}
}
return *this;
}
TUnicodeSet& TUnicodeSet::Clear() {
if (IsStatic() || IsShared()) {
DynBuffer.Drop();
ShortBuffer[0] = CODEPOINT_HIGH;
Capacity = Y_ARRAY_SIZE(ShortBuffer);
Ranges = ShortBuffer;
} else {
const_cast<wchar32*>(Ranges)[0] = CODEPOINT_HIGH;
}
Length = 1;
return *this;
}
size_t TUnicodeSet::Hash() const {
size_t res = 0;
for (size_t i = 0; i < Length; ++i) {
res = ::CombineHashes(size_t(Ranges[i]), res);
}
return res;
}
inline void WriteUnicodeChar(IOutputStream& out, wchar32 c, bool needEscape = false) {
switch (c) {
case wchar32('-'):
case wchar32('\\'):
case wchar32('^'):
needEscape = true;
break;
default:
break;
}
if (::IsGraph(c) && !needEscape) {
char buf[4]; // Max utf8 char length is 4
size_t wr = 0;
WideToUTF8(&c, 1, buf, wr);
Y_ASSERT(wr <= Y_ARRAY_SIZE(buf));
out.Write(buf, wr);
} else {
TString hexRepr = IntToString<16>(c);
if (c >> 8 == 0) {
out << "\\x" << LeftPad(hexRepr, 2, '0');
} else if (c >> 16 == 0) {
out << "\\u" << LeftPad(hexRepr, 4, '0');
} else {
out << "\\U" << LeftPad(hexRepr, 8, '0');
}
}
}
TString TUnicodeSet::ToString(bool escapeAllChars /* = false*/) const {
Y_ASSERT(Valid());
TStringStream str;
str.Reserve(Length * 4 + Length / 2 + 2);
str.Write('[');
for (size_t i = 0; i + 1 < Length; i += 2) {
WriteUnicodeChar(str, Ranges[i], escapeAllChars);
if (Ranges[i] + 1 < Ranges[i + 1]) {
// Don't write dash for two-symbol ranges
if (Ranges[i] + 2 < Ranges[i + 1]) {
str.Write('-');
}
WriteUnicodeChar(str, Ranges[i + 1] - 1, escapeAllChars);
}
}
str.Write(']');
return str.Str();
}
void TUnicodeSet::Save(IOutputStream* out) const {
::SaveSize(out, Length);
::SaveArray(out, Ranges, Length);
}
void TUnicodeSet::Load(IInputStream* in) {
const size_t length = ::LoadSize(in);
if (length > 0) {
::LoadArray(in, EnsureCapacity(length), length);
}
Length = length;
if (!Valid()) {
ythrow TSerializeException() << "Loaded broken unicode set";
}
}
TUnicodeSet& TUnicodeSet::Parse(const TWtringBuf& data) {
Clear();
NPrivate::ParseUnicodeSet(*this, data);
return *this;
}
}