#pragma once
#include "decomposition_table.h"
#include <util/charset/unidata.h>
#include <util/charset/wide.h>
#include <util/generic/hash.h>
#include <util/generic/vector.h>
#include <util/generic/algorithm.h>
#include <util/generic/singleton.h>
#include <util/generic/noncopyable.h>
#include <utility>
namespace NUnicode {
// Unicode normalization forms; used in this file as the compile-time selector
// (template parameter) of the normalizer and of the per-form helpers.
enum ENormalization {
NFD,
NFC,
NFKD,
NFKC,
};
// Roughly speaking:
// NFD decomposes "ё" into "е + diacritic"
// NFC first decomposes everything, then composes back everything it can
// NFKD does the same as NFD. In addition, it turns, for example, the Roman numeral IV (\x2163)
// into the Latin letters I and V
// NFKC is NFKD followed by composition (naturally, the Roman numeral four is not re-formed from I and V)
// Formal specification: http://www.unicode.org/reports/tr15/
namespace NPrivate {
// Looks `ch` up in `table` and returns whatever the table yields for it.
// A null pointer is supplied as the fallback value; callers in this file treat a null
// result as "this character has no decomposition".
inline const wchar32* Decomposition(const TDecompositionTable& table, wchar32 ch) {
    // Typed null pointer, so the fallback argument is passed as const wchar32*.
    const wchar32* const noDecomposition = nullptr;
    return table.Get(ch, noDecomposition);
}
// Binds a decomposition table so that lookups need only the character.
// Only a reference is stored: the table has to outlive the decompositor.
class TDecompositor {
public:
    // Intentionally left implicit to keep existing call sites compiling.
    TDecompositor(const TDecompositionTable& table)
        : Table(table)
    {
    }

    // Returns the bound table's decomposition for `ch` (null when the lookup yields none).
    const wchar32* Decomposition(wchar32 ch) const {
        return NPrivate::Decomposition(Table, ch);
    }

private:
    const TDecompositionTable& Table;
};
// Default-constructible decompositor bound to the table returned by
// NPrivate::DecompositionTable<IsCompat>().
template <bool IsCompat>
struct TStandartDecompositor: TDecompositor {
    TStandartDecompositor()
        : TDecompositor(NPrivate::DecompositionTable<IsCompat>())
    {
    }
};
// Maps a normalization form to the matching *_QC flag; Normalized<N>(wchar32) tests
// that flag against the value returned by CharInfo.
// Only the four explicit specializations below are defined.
template <ENormalization N>
struct TShift;

template <>
struct TShift<NFD> {
    static constexpr WC_TYPE Value = NFD_QC;
};

template <>
struct TShift<NFC> {
    static constexpr WC_TYPE Value = NFC_QC;
};

template <>
struct TShift<NFKD> {
    static constexpr WC_TYPE Value = NFKD_QC;
};

template <>
struct TShift<NFKC> {
    static constexpr WC_TYPE Value = NFKC_QC;
};
// Returns true when the value returned by CharInfo(ch) has the flag selected by
// TShift<N> set.
template <ENormalization N>
inline bool Normalized(wchar32 ch) {
    const auto formFlag = NPrivate::TShift<N>::Value;
    return (CharInfo(ch) & formFlag) != 0;
}
// Pairwise composition lookup: maps a (lead, tail) pair of characters to the single
// character they compose into.
// The constructor and the static RawData table are defined outside this header
// (presumably the constructor fills Data from RawData; confirm in the implementation file).
class TComposition {
private:
    // One row of the static source table.
    struct TRawData {
        wchar32 Lead;
        wchar32 Tail;
        wchar32 Comp;
    };

    static const TRawData RawData[];
    static const size_t RawDataSize;

    // Hash-map key: the ordered (lead, tail) pair.
    class TKey: public std::pair<wchar32, wchar32> {
    public:
        TKey(wchar32 a, wchar32 b)
            : std::pair<wchar32, wchar32>(a, b)
        {
        }

        size_t Hash() const {
            return CombineHashes(first, second);
        }
    };

    // Hasher adapter that forwards to the key's own Hash() method.
    template <class T>
    struct THash {
        size_t operator()(const T& t) const {
            return t.Hash();
        }
    };

    using TData = THashMap<TKey, wchar32, THash<TKey>>;
    TData Data;

public:
    TComposition();

    // Returns the composite of `lead` followed by `tail`, or 0 when the pair has no entry.
    wchar32 Composite(wchar32 lead, wchar32 tail) const {
        const auto found = Data.find(TKey(lead, tail));
        if (found != Data.end()) {
            return found->second;
        }
        return 0;
    }
};
// A character paired with its combining class.
using TSymbol = std::pair<wchar32, TCombining>;
// Working buffer of symbols shared by the compositor and the normalizer.
using TBuffer = TVector<TSymbol>;
// Composition step applied to a symbol buffer; the implementation is selected
// at compile time by doCompose.
template <bool doCompose>
class TCompositor;

// Variant that performs no composition: the buffer is left untouched.
template <>
class TCompositor<false> {
public:
    void DoComposition(TBuffer&) {
    }
};
template <>
class TCompositor<true> {
private:
static const wchar32 NonComposite = 0;
const TComposition* Composition;
public:
inline TCompositor()
: Composition(Singleton<TComposition>())
{
}
inline void DoComposition(TBuffer& buffer) {
if (buffer.size() < 2)
return;
const TSymbol& leadSymbol = buffer[0];
if (leadSymbol.second != 0)
return;
wchar32 lead = leadSymbol.first;
bool oneMoreTurnPlease = false;
do {
oneMoreTurnPlease = false;
TCombining lastCombining = 0;
for (TBuffer::iterator i = buffer.begin() + 1, mi = buffer.end(); i != mi; ++i) {
TCombining currentCombining = i->second;
if (!(currentCombining != lastCombining && currentCombining != 0 || lastCombining == 0 && currentCombining == 0))
continue;
lastCombining = currentCombining;
wchar32 comb = Composition->Composite(lead, i->first);
if (comb == NonComposite)
continue;
lead = comb;
buffer.erase(i);
oneMoreTurnPlease = true;
break;
}
} while (oneMoreTurnPlease);
Y_ASSERT(DecompositionCombining(lead) == 0);
buffer[0] = TSymbol(lead, 0);
}
};
template <ENormalization N, typename TCharType>
inline bool Normalized(const TCharType* begin, const TCharType* end) {
TCombining lastCanonicalClass = 0;
for (const TCharType* i = begin; i != end;) {
wchar32 ch = ReadSymbolAndAdvance(i, end);
TCombining canonicalClass = DecompositionCombining(ch);
if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
return false;
if (!Normalized<N>(ch))
return false;
lastCanonicalClass = canonicalClass;
}
return true;
}
}
// Returns the decomposition of `ch` from the table selected by `compat`
// (NPrivate::DecompositionTable<compat>()); null when the lookup yields none.
template <bool compat>
inline const wchar32* Decomposition(wchar32 ch) {
    const auto& table = NPrivate::DecompositionTable<compat>();
    return NPrivate::Decomposition(table, ch);
}
// Normalizer of UTF-16 / UTF-32 text to the form N.
//
// N             - target normalization form.
// TDecompositor - type providing `const wchar32* Decomposition(wchar32) const`;
//                 the default-constructed normalizer uses the standard table for N.
//
// The normalizer keeps a scratch buffer of pending symbols as a member and holds the
// decompositor by reference, so a decompositor passed to the constructor must outlive it.
template <ENormalization N, class TDecompositor = NPrivate::TDecompositor>
class TNormalizer : NNonCopyable::TNonCopyable {
private:
    static const ENormalization Norm = N;
    static const bool IsCompat = Norm == NFKD || Norm == NFKC;
    static const bool RequireComposition = Norm == NFC || Norm == NFKC;

    using TSymbol = NPrivate::TSymbol;
    using TBuffer = NPrivate::TBuffer;

    // Pending symbols: at most one leading starter followed by the marks seen after it.
    TBuffer Buffer;
    NPrivate::TCompositor<RequireComposition> Compositor;
    const TDecompositor& Decompositor;

private:
    // Orders symbols by combining class only.
    static inline bool Compare(const TSymbol& a, const TSymbol& b) {
        return a.second < b.second;
    }

    struct TComparer {
        inline bool operator()(const TSymbol& a, const TSymbol& b) const {
            return Compare(a, b);
        }
    };

    // Writes the characters of [begin, end) to `out` through WriteSymbol.
    template <class T>
    static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, T& out) {
        for (TBuffer::const_iterator i = begin; i != end; ++i) {
            WriteSymbol(i->first, out);
        }
    }

    static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, TUtf32String& out) { // because WriteSymbol from util/charset/wide.h works wrong in this case
        for (TBuffer::const_iterator i = begin; i != end; ++i) {
            out += i->first;
        }
    }

    // Stable sort by combining class: symbols of equal class keep their input order.
    inline void SortBuffer() {
        if (Buffer.size() < 2)
            return;
        StableSort(Buffer.begin(), Buffer.end(), TComparer());
    }

    // Feeds one already-decomposed character into the buffer.
    // A character with a non-zero combining class is just queued. A character with class 0
    // closes the pending run: the run is sorted, the character is appended, composition is
    // attempted, and everything except the last buffered symbol is written to `out`.
    template <class T>
    inline void AddCharNoDecomposition(wchar32 c, T& out) {
        TCombining cc = DecompositionCombining(c);
        if (cc == 0) {
            SortBuffer();
            Buffer.push_back(TBuffer::value_type(c, cc));
            Compositor.DoComposition(Buffer);
            if (Buffer.size() > 1) {
                // The last symbol stays buffered: it leads the next run.
                Write(Buffer.begin(), Buffer.end() - 1, out);
                Buffer.erase(Buffer.begin(), Buffer.end() - 1); // TODO I don't like this
            }
        } else {
            Buffer.push_back(TBuffer::value_type(c, cc));
        }
    }

    // Feeds one input character: its decomposition (a zero-terminated sequence) when the
    // decompositor provides one, otherwise the character itself.
    template <class T>
    inline void AddChar(wchar32 c, T& out) {
        const wchar32* decomposed = Decompositor.Decomposition(c);
        if (!decomposed) {
            AddCharNoDecomposition(c, out);
            return;
        }
        for (; *decomposed; ++decomposed) {
            // Table entries are expected to be fully decomposed already.
            Y_ASSERT(Decompositor.Decomposition(*decomposed) == nullptr);
            AddCharNoDecomposition(*decomposed, out);
        }
    }

    // Unconditional normalization of [begin, end) into `out`.
    template <class T, typename TCharType>
    inline void DoNormalize(const TCharType* begin, const TCharType* end, T& out) {
        Buffer.clear();
        for (const TCharType* i = begin; i != end;) {
            AddChar(ReadSymbolAndAdvance(i, end), out);
        }
        // Flush the run that is still pending after the last input character.
        SortBuffer();
        Compositor.DoComposition(Buffer);
        Write(Buffer.begin(), Buffer.end(), out);
    }

public:
    // Uses the standard decomposition table matching N.
    TNormalizer()
        : Decompositor(*Singleton<NPrivate::TStandartDecompositor<IsCompat>>())
    {
    }

    // Uses a caller-supplied decompositor, stored by reference.
    // Intentionally left implicit to keep existing call sites compiling.
    TNormalizer(const TDecompositor& decompositor)
        : Decompositor(decompositor)
    {
    }

    // Normalizes [begin, end) and writes the result to `out`.
    // Input that already passes the quick check is copied symbol by symbol.
    template <class T, typename TCharType>
    inline void Normalize(const TCharType* begin, const TCharType* end, T& out) {
        if (NPrivate::Normalized<Norm>(begin, end)) {
            // Copy whole symbols rather than raw code units, exactly as the quick check
            // and DoNormalize read them: a UTF-16 surrogate pair must reach WriteSymbol
            // as one character, not as two separate halves.
            for (const TCharType* i = begin; i != end;) {
                WriteSymbol(ReadSymbolAndAdvance(i, end), out);
            }
        } else {
            DoNormalize(begin, end, out);
        }
    }

    // Same as above for a TUtf32String destination, which is appended to directly.
    template <typename TCharType>
    inline void Normalize(const TCharType* begin, const TCharType* end, TUtf32String& out) {
        if (NPrivate::Normalized<Norm>(begin, end)) {
            for (const TCharType* i = begin; i != end;) {
                out += ReadSymbolAndAdvance(i, end);
            }
        } else {
            DoNormalize(begin, end, out);
        }
    }

    // Convenience overload taking a length instead of an end pointer.
    template <class T, typename TCharType>
    inline void Normalize(const TCharType* begin, size_t len, T& out) {
        return Normalize(begin, begin + len, out);
    }

    // Returns the normalized copy of `src`; when `src` already passes the quick check
    // it is returned as is.
    template <typename TCharType>
    inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& src) {
        if (NPrivate::Normalized<Norm>(src.begin(), src.end())) {
            // nothing to normalize
            return src;
        } else {
            TBasicString<TCharType> res;
            res.reserve(src.length());
            DoNormalize(src.begin(), src.end(), res);
            return res;
        }
    }
};
}
//! decompose utf16 or utf32 string to any container supporting push_back or to T*
template <NUnicode::ENormalization Norm, class T, typename TCharType>
inline void Normalize(const TCharType* begin, size_t len, T& out) {
::NUnicode::TNormalizer<Norm> dec;
dec.Normalize(begin, len, out);
}
//! Normalizes the sequence [str, str + len) to the form N and returns the result as a string.
template <NUnicode::ENormalization N, typename TCharType>
inline TBasicString<TCharType> Normalize(const TCharType* str, size_t len) {
    TBasicString<TCharType> normalized;
    normalized.reserve(len); // the output is assumed to be about as long as the input
    ::NUnicode::TNormalizer<N> normalizer;
    normalizer.Normalize(str, len, normalized);
    return normalized;
}
//! Returns `str` normalized to the form N (delegates to TNormalizer<N>::Normalize).
template <NUnicode::ENormalization N, typename TCharType>
inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& str) {
    return ::NUnicode::TNormalizer<N>().Normalize(str);
}
//! Returns the contents of the string view `str` normalized to the form N, as an owning string.
template <NUnicode::ENormalization N, typename TCharType>
inline TBasicString<TCharType> Normalize(const TBasicStringBuf<TCharType> str) {
    const TCharType* const data = str.data();
    const size_t length = str.size();
    return Normalize<N>(data, length);
}