#pragma once #include "decomposition_table.h" #include <util/charset/unidata.h> #include <util/charset/wide.h> #include <util/generic/hash.h> #include <util/generic/vector.h> #include <util/generic/algorithm.h> #include <util/generic/singleton.h> #include <util/generic/noncopyable.h> #include <utility> namespace NUnicode { enum ENormalization { NFD, NFC, NFKD, NFKC, }; // Грубо говоря: // NFD расскладывает "ё" на "е + диакритику" // NFC сначала всё раскладывает, потом всё что может - складывает // NFKD делает то же, что и NFD. Кроме того, например, римскую IV (\x2163) // превращает в латинские I и V // NFKC - NFKD + композиция (римская четвёрка из I и V, естественно, не образуется) // Формальная спецификация: http://www.unicode.org/reports/tr15/ namespace NPrivate { inline const wchar32* Decomposition(const TDecompositionTable& table, wchar32 ch) { return table.Get(ch, static_cast<const wchar32*>(nullptr)); } class TDecompositor { private: const TDecompositionTable& Table; public: inline TDecompositor(const TDecompositionTable& table) : Table(table) { } inline const wchar32* Decomposition(wchar32 ch) const { return NPrivate::Decomposition(Table, ch); } }; template <bool IsCompat> struct TStandartDecompositor: public TDecompositor { TStandartDecompositor() : TDecompositor(NPrivate::DecompositionTable<IsCompat>()) { } }; template <ENormalization N> struct TShift; template <> struct TShift<NFD> { static const WC_TYPE Value = NFD_QC; }; template <> struct TShift<NFC> { static const WC_TYPE Value = NFC_QC; }; template <> struct TShift<NFKD> { static const WC_TYPE Value = NFKD_QC; }; template <> struct TShift<NFKC> { static const WC_TYPE Value = NFKC_QC; }; template <ENormalization N> inline bool Normalized(wchar32 ch) { return CharInfo(ch) & NPrivate::TShift<N>::Value; } class TComposition { private: struct TRawData { wchar32 Lead; wchar32 Tail; wchar32 Comp; }; static const TRawData RawData[]; static const size_t RawDataSize; class TKey: public std::pair<wchar32, wchar32> { public: inline TKey(wchar32 a, wchar32 b) : std::pair<wchar32, wchar32>(a, b) { } inline size_t Hash() const { return CombineHashes(first, second); } }; template <class T> struct THash { inline size_t operator()(const T& t) const { return t.Hash(); } }; typedef THashMap<TKey, wchar32, THash<TKey>> TData; TData Data; public: TComposition(); inline wchar32 Composite(wchar32 lead, wchar32 tail) const { TData::const_iterator i = Data.find(TKey(lead, tail)); if (i == Data.end()) return 0; return i->second; } }; typedef std::pair<wchar32, TCombining> TSymbol; typedef TVector<TSymbol> TBuffer; template <bool doCompose> class TCompositor; template <> class TCompositor<false> { public: inline void DoComposition(TBuffer& buffer) { Y_UNUSED(buffer); } }; template <> class TCompositor<true> { private: static const wchar32 NonComposite = 0; const TComposition* Composition; public: inline TCompositor() : Composition(Singleton<TComposition>()) { } inline void DoComposition(TBuffer& buffer) { if (buffer.size() < 2) return; const TSymbol& leadSymbol = buffer[0]; if (leadSymbol.second != 0) return; wchar32 lead = leadSymbol.first; bool oneMoreTurnPlease = false; do { oneMoreTurnPlease = false; TCombining lastCombining = 0; for (TBuffer::iterator i = buffer.begin() + 1, mi = buffer.end(); i != mi; ++i) { TCombining currentCombining = i->second; if (!(currentCombining != lastCombining && currentCombining != 0 || lastCombining == 0 && currentCombining == 0)) continue; lastCombining = currentCombining; wchar32 comb = Composition->Composite(lead, i->first); if (comb == NonComposite) continue; lead = comb; buffer.erase(i); oneMoreTurnPlease = true; break; } } while (oneMoreTurnPlease); Y_ASSERT(DecompositionCombining(lead) == 0); buffer[0] = TSymbol(lead, 0); } }; template <ENormalization N, typename TCharType> inline bool Normalized(const TCharType* begin, const TCharType* end) { TCombining lastCanonicalClass = 0; for (const TCharType* i = begin; i != end;) { wchar32 ch = ReadSymbolAndAdvance(i, end); TCombining canonicalClass = DecompositionCombining(ch); if (lastCanonicalClass > canonicalClass && canonicalClass != 0) return false; if (!Normalized<N>(ch)) return false; lastCanonicalClass = canonicalClass; } return true; } } template <bool compat> inline const wchar32* Decomposition(wchar32 ch) { return NPrivate::Decomposition(NPrivate::DecompositionTable<compat>(), ch); } template <ENormalization N, class TDecompositor = NPrivate::TDecompositor> class TNormalizer : NNonCopyable::TNonCopyable { private: static const ENormalization Norm = N; static const bool IsCompat = Norm == NFKD || Norm == NFKC; static const bool RequireComposition = Norm == NFC || Norm == NFKC; typedef NPrivate::TSymbol TSymbol; typedef NPrivate::TBuffer TBuffer; TBuffer Buffer; NPrivate::TCompositor<RequireComposition> Compositor; const TDecompositor& Decompositor; private: static inline bool Compare(const TSymbol& a, const TSymbol& b) { return a.second < b.second; } struct TComparer { inline bool operator()(const TSymbol& a, const TSymbol& b) { return Compare(a, b); } }; template <class T> static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, T& out) { for (TBuffer::const_iterator i = begin; i != end; ++i) { WriteSymbol(i->first, out); } } static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, TUtf32String& out) { // because WriteSymbol from util/charset/wide.h works wrong in this case for (TBuffer::const_iterator i = begin; i != end; ++i) { out += i->first; } } inline void SortBuffer() { if (Buffer.size() < 2) return; StableSort(Buffer.begin(), Buffer.end(), TComparer()); } template <class T> inline void AddCharNoDecomposition(wchar32 c, T& out) { TCombining cc = DecompositionCombining(c); if (cc == 0) { SortBuffer(); Buffer.push_back(TBuffer::value_type(c, cc)); Compositor.DoComposition(Buffer); if (Buffer.size() > 1) { Write(Buffer.begin(), Buffer.end() - 1, out); Buffer.erase(Buffer.begin(), Buffer.end() - 1); // TODO I don't like this } } else { Buffer.push_back(TBuffer::value_type(c, cc)); } } template <class T> inline void AddChar(wchar32 c, T& out) { const wchar32* decompBegin = Decompositor.Decomposition(c); if (decompBegin) { while (*decompBegin) { Y_ASSERT(Decompositor.Decomposition(*decompBegin) == nullptr); AddCharNoDecomposition(*(decompBegin++), out); } return; } else { AddCharNoDecomposition(c, out); } } template <class T, typename TCharType> inline void DoNormalize(const TCharType* begin, const TCharType* end, T& out) { Buffer.clear(); for (const TCharType* i = begin; i != end;) { AddChar(ReadSymbolAndAdvance(i, end), out); } SortBuffer(); Compositor.DoComposition(Buffer); Write(Buffer.begin(), Buffer.end(), out); } public: TNormalizer() : Decompositor(*Singleton<NPrivate::TStandartDecompositor<IsCompat>>()) { } TNormalizer(const TDecompositor& decompositor) : Decompositor(decompositor) { } template <class T, typename TCharType> inline void Normalize(const TCharType* begin, const TCharType* end, T& out) { if (NPrivate::Normalized<Norm>(begin, end)) { for (const TCharType* i = begin; i != end; ++i) { WriteSymbol(*i, out); } } else { DoNormalize(begin, end, out); } } template <typename TCharType> inline void Normalize(const TCharType* begin, const TCharType* end, TUtf32String& out) { if (NPrivate::Normalized<Norm>(begin, end)) { for (const TCharType* i = begin; i != end;) { out += ReadSymbolAndAdvance(i, end); } } else { DoNormalize(begin, end, out); } } template <class T, typename TCharType> inline void Normalize(const TCharType* begin, size_t len, T& out) { return Normalize(begin, begin + len, out); } template <typename TCharType> inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& src) { if (NPrivate::Normalized<Norm>(src.begin(), src.end())) { // nothing to normalize return src; } else { TBasicString<TCharType> res; res.reserve(src.length()); DoNormalize(src.begin(), src.end(), res); return res; } } }; } //! decompose utf16 or utf32 string to any container supporting push_back or to T* template <NUnicode::ENormalization Norm, class T, typename TCharType> inline void Normalize(const TCharType* begin, size_t len, T& out) { ::NUnicode::TNormalizer<Norm> dec; dec.Normalize(begin, len, out); } template <NUnicode::ENormalization N, typename TCharType> inline TBasicString<TCharType> Normalize(const TCharType* str, size_t len) { TBasicString<TCharType> res; res.reserve(len); Normalize<N>(str, len, res); return res; } template <NUnicode::ENormalization N, typename TCharType> inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& str) { ::NUnicode::TNormalizer<N> dec; return dec.Normalize(str); } template <NUnicode::ENormalization N, typename TCharType> inline TBasicString<TCharType> Normalize(const TBasicStringBuf<TCharType> str) { return Normalize<N>(str.data(), str.size()); }