diff options
author | umnov <[email protected]> | 2022-02-10 16:50:28 +0300 |
---|---|---|
committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:50:28 +0300 |
commit | 0bc655f0b88816a992ff638c25c09627d67e55d0 (patch) | |
tree | 24a7e41d3f11e3890654ee681a0a26a780170fa3 /library/cpp/unicode/normalization/normalization.h | |
parent | 9138262b9b527644a2423b034122d89ddbfb25d2 (diff) |
Restoring authorship annotation for <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/unicode/normalization/normalization.h')
-rw-r--r-- | library/cpp/unicode/normalization/normalization.h | 614 |
1 files changed, 307 insertions, 307 deletions
diff --git a/library/cpp/unicode/normalization/normalization.h b/library/cpp/unicode/normalization/normalization.h index 4f5f57881c3..7ee9172bea0 100644 --- a/library/cpp/unicode/normalization/normalization.h +++ b/library/cpp/unicode/normalization/normalization.h @@ -1,329 +1,329 @@ -#pragma once - -#include "decomposition_table.h" - -#include <util/charset/unidata.h> -#include <util/charset/wide.h> -#include <util/generic/hash.h> -#include <util/generic/vector.h> -#include <util/generic/algorithm.h> -#include <util/generic/singleton.h> -#include <util/generic/noncopyable.h> +#pragma once + +#include "decomposition_table.h" + +#include <util/charset/unidata.h> +#include <util/charset/wide.h> +#include <util/generic/hash.h> +#include <util/generic/vector.h> +#include <util/generic/algorithm.h> +#include <util/generic/singleton.h> +#include <util/generic/noncopyable.h> #include <utility> - -namespace NUnicode { - enum ENormalization { - NFD, - NFC, - NFKD, - NFKC, - }; - - // Грубо говоря: - // NFD расскладывает "ё" на "е + диакритику" - // NFC сначала всё раскладывает, потом всё что может - складывает - // NFKD делает то же, что и NFD. Кроме того, например, римскую IV (\x2163) - // превращает в латинские I и V - // NFKC - NFKD + композиция (римская четвёрка из I и V, естественно, не образуется) - - // Формальная спецификация: http://www.unicode.org/reports/tr15/ - - namespace NPrivate { - inline const wchar32* Decomposition(const TDecompositionTable& table, wchar32 ch) { + +namespace NUnicode { + enum ENormalization { + NFD, + NFC, + NFKD, + NFKC, + }; + + // Грубо говоря: + // NFD расскладывает "ё" на "е + диакритику" + // NFC сначала всё раскладывает, потом всё что может - складывает + // NFKD делает то же, что и NFD. Кроме того, например, римскую IV (\x2163) + // превращает в латинские I и V + // NFKC - NFKD + композиция (римская четвёрка из I и V, естественно, не образуется) + + // Формальная спецификация: http://www.unicode.org/reports/tr15/ + + namespace NPrivate { + inline const wchar32* Decomposition(const TDecompositionTable& table, wchar32 ch) { return table.Get(ch, static_cast<const wchar32*>(nullptr)); - } - - class TDecompositor { - private: - const TDecompositionTable& Table; - - public: - inline TDecompositor(const TDecompositionTable& table) - : Table(table) + } + + class TDecompositor { + private: + const TDecompositionTable& Table; + + public: + inline TDecompositor(const TDecompositionTable& table) + : Table(table) { } - - inline const wchar32* Decomposition(wchar32 ch) const { - return NPrivate::Decomposition(Table, ch); - } - }; - + + inline const wchar32* Decomposition(wchar32 ch) const { + return NPrivate::Decomposition(Table, ch); + } + }; + template <bool IsCompat> - struct TStandartDecompositor: public TDecompositor { - TStandartDecompositor() + struct TStandartDecompositor: public TDecompositor { + TStandartDecompositor() : TDecompositor(NPrivate::DecompositionTable<IsCompat>()) { } - }; - - template <ENormalization N> - struct TShift; - - template <> - struct TShift<NFD> { - static const WC_TYPE Value = NFD_QC; - }; - template <> - struct TShift<NFC> { - static const WC_TYPE Value = NFC_QC; - }; - template <> - struct TShift<NFKD> { - static const WC_TYPE Value = NFKD_QC; - }; - template <> - struct TShift<NFKC> { - static const WC_TYPE Value = NFKC_QC; - }; - - template <ENormalization N> - inline bool Normalized(wchar32 ch) { - return CharInfo(ch) & NPrivate::TShift<N>::Value; - } - - class TComposition { - private: - struct TRawData { + }; + + template <ENormalization N> + struct TShift; + + template <> + struct TShift<NFD> { + static const WC_TYPE Value = NFD_QC; + }; + template <> + struct TShift<NFC> { + static const WC_TYPE Value = NFC_QC; + }; + template <> + struct TShift<NFKD> { + static const WC_TYPE Value = NFKD_QC; + }; + template <> + struct TShift<NFKC> { + static const WC_TYPE Value = NFKC_QC; + }; + + template <ENormalization N> + inline bool Normalized(wchar32 ch) { + return CharInfo(ch) & NPrivate::TShift<N>::Value; + } + + class TComposition { + private: + struct TRawData { wchar32 Lead; wchar32 Tail; wchar32 Comp; - }; - - static const TRawData RawData[]; - static const size_t RawDataSize; - + }; + + static const TRawData RawData[]; + static const size_t RawDataSize; + class TKey: public std::pair<wchar32, wchar32> { - public: - inline TKey(wchar32 a, wchar32 b) + public: + inline TKey(wchar32 a, wchar32 b) : std::pair<wchar32, wchar32>(a, b) { } - - inline size_t Hash() const { - return CombineHashes(first, second); - } - }; - - template <class T> - struct THash { - inline size_t operator()(const T& t) const { - return t.Hash(); - } - }; - + + inline size_t Hash() const { + return CombineHashes(first, second); + } + }; + + template <class T> + struct THash { + inline size_t operator()(const T& t) const { + return t.Hash(); + } + }; + typedef THashMap<TKey, wchar32, THash<TKey>> TData; - TData Data; - - public: - TComposition(); - - inline wchar32 Composite(wchar32 lead, wchar32 tail) const { - TData::const_iterator i = Data.find(TKey(lead, tail)); - if (i == Data.end()) - return 0; - - return i->second; - } - }; - + TData Data; + + public: + TComposition(); + + inline wchar32 Composite(wchar32 lead, wchar32 tail) const { + TData::const_iterator i = Data.find(TKey(lead, tail)); + if (i == Data.end()) + return 0; + + return i->second; + } + }; + typedef std::pair<wchar32, TCombining> TSymbol; typedef TVector<TSymbol> TBuffer; - - template <bool doCompose> - class TCompositor; - - template <> - class TCompositor<false> { - public: - inline void DoComposition(TBuffer& buffer) { + + template <bool doCompose> + class TCompositor; + + template <> + class TCompositor<false> { + public: + inline void DoComposition(TBuffer& buffer) { Y_UNUSED(buffer); - } - }; - - template <> - class TCompositor<true> { - private: - static const wchar32 NonComposite = 0; - const TComposition* Composition; - - public: - inline TCompositor() - : Composition(Singleton<TComposition>()) + } + }; + + template <> + class TCompositor<true> { + private: + static const wchar32 NonComposite = 0; + const TComposition* Composition; + + public: + inline TCompositor() + : Composition(Singleton<TComposition>()) { } - - inline void DoComposition(TBuffer& buffer) { - if (buffer.size() < 2) - return; - - const TSymbol& leadSymbol = buffer[0]; - if (leadSymbol.second != 0) - return; - - wchar32 lead = leadSymbol.first; - bool oneMoreTurnPlease = false; - do { - oneMoreTurnPlease = false; - TCombining lastCombining = 0; - for (TBuffer::iterator i = buffer.begin() + 1, mi = buffer.end(); i != mi; ++i) { - TCombining currentCombining = i->second; - if (!(currentCombining != lastCombining && currentCombining != 0 || lastCombining == 0 && currentCombining == 0)) - continue; - - lastCombining = currentCombining; - wchar32 comb = Composition->Composite(lead, i->first); - if (comb == NonComposite) - continue; - - lead = comb; - buffer.erase(i); - oneMoreTurnPlease = true; - break; - } - } while (oneMoreTurnPlease); - + + inline void DoComposition(TBuffer& buffer) { + if (buffer.size() < 2) + return; + + const TSymbol& leadSymbol = buffer[0]; + if (leadSymbol.second != 0) + return; + + wchar32 lead = leadSymbol.first; + bool oneMoreTurnPlease = false; + do { + oneMoreTurnPlease = false; + TCombining lastCombining = 0; + for (TBuffer::iterator i = buffer.begin() + 1, mi = buffer.end(); i != mi; ++i) { + TCombining currentCombining = i->second; + if (!(currentCombining != lastCombining && currentCombining != 0 || lastCombining == 0 && currentCombining == 0)) + continue; + + lastCombining = currentCombining; + wchar32 comb = Composition->Composite(lead, i->first); + if (comb == NonComposite) + continue; + + lead = comb; + buffer.erase(i); + oneMoreTurnPlease = true; + break; + } + } while (oneMoreTurnPlease); + Y_ASSERT(DecompositionCombining(lead) == 0); - buffer[0] = TSymbol(lead, 0); - } - }; - + buffer[0] = TSymbol(lead, 0); + } + }; + template <ENormalization N, typename TCharType> inline bool Normalized(const TCharType* begin, const TCharType* end) { - TCombining lastCanonicalClass = 0; + TCombining lastCanonicalClass = 0; for (const TCharType* i = begin; i != end;) { - wchar32 ch = ReadSymbolAndAdvance(i, end); - - TCombining canonicalClass = DecompositionCombining(ch); - if (lastCanonicalClass > canonicalClass && canonicalClass != 0) - return false; - - if (!Normalized<N>(ch)) - return false; - - lastCanonicalClass = canonicalClass; - } - return true; - } - } - - template <bool compat> - inline const wchar32* Decomposition(wchar32 ch) { - return NPrivate::Decomposition(NPrivate::DecompositionTable<compat>(), ch); + wchar32 ch = ReadSymbolAndAdvance(i, end); + + TCombining canonicalClass = DecompositionCombining(ch); + if (lastCanonicalClass > canonicalClass && canonicalClass != 0) + return false; + + if (!Normalized<N>(ch)) + return false; + + lastCanonicalClass = canonicalClass; + } + return true; + } } - - template <ENormalization N, class TDecompositor = NPrivate::TDecompositor> - class TNormalizer : NNonCopyable::TNonCopyable { - private: - static const ENormalization Norm = N; - static const bool IsCompat = Norm == NFKD || Norm == NFKC; - static const bool RequireComposition = Norm == NFC || Norm == NFKC; - - typedef NPrivate::TSymbol TSymbol; - typedef NPrivate::TBuffer TBuffer; - - TBuffer Buffer; - - NPrivate::TCompositor<RequireComposition> Compositor; - const TDecompositor& Decompositor; - - private: - static inline bool Compare(const TSymbol& a, const TSymbol& b) { - return a.second < b.second; - } - - struct TComparer { + + template <bool compat> + inline const wchar32* Decomposition(wchar32 ch) { + return NPrivate::Decomposition(NPrivate::DecompositionTable<compat>(), ch); + } + + template <ENormalization N, class TDecompositor = NPrivate::TDecompositor> + class TNormalizer : NNonCopyable::TNonCopyable { + private: + static const ENormalization Norm = N; + static const bool IsCompat = Norm == NFKD || Norm == NFKC; + static const bool RequireComposition = Norm == NFC || Norm == NFKC; + + typedef NPrivate::TSymbol TSymbol; + typedef NPrivate::TBuffer TBuffer; + + TBuffer Buffer; + + NPrivate::TCompositor<RequireComposition> Compositor; + const TDecompositor& Decompositor; + + private: + static inline bool Compare(const TSymbol& a, const TSymbol& b) { + return a.second < b.second; + } + + struct TComparer { inline bool operator()(const TSymbol& a, const TSymbol& b) { - return Compare(a, b); - } - }; - - template <class T> - static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, T& out) { - for (TBuffer::const_iterator i = begin; i != end; ++i) { - WriteSymbol(i->first, out); - } - } - + return Compare(a, b); + } + }; + + template <class T> + static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, T& out) { + for (TBuffer::const_iterator i = begin; i != end; ++i) { + WriteSymbol(i->first, out); + } + } + static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, TUtf32String& out) { // because WriteSymbol from util/charset/wide.h works wrong in this case for (TBuffer::const_iterator i = begin; i != end; ++i) { out += i->first; } } - inline void SortBuffer() { - if (Buffer.size() < 2) - return; - - StableSort(Buffer.begin(), Buffer.end(), TComparer()); - } - - template <class T> - inline void AddCharNoDecomposition(wchar32 c, T& out) { - TCombining cc = DecompositionCombining(c); - if (cc == 0) { - SortBuffer(); - Buffer.push_back(TBuffer::value_type(c, cc)); - - Compositor.DoComposition(Buffer); - - if (Buffer.size() > 1) { - Write(Buffer.begin(), Buffer.end() - 1, out); - Buffer.erase(Buffer.begin(), Buffer.end() - 1); // TODO I don't like this - } - } else { - Buffer.push_back(TBuffer::value_type(c, cc)); - } - } - - template <class T> - inline void AddChar(wchar32 c, T& out) { - const wchar32* decompBegin = Decompositor.Decomposition(c); - if (decompBegin) { - while (*decompBegin) { + inline void SortBuffer() { + if (Buffer.size() < 2) + return; + + StableSort(Buffer.begin(), Buffer.end(), TComparer()); + } + + template <class T> + inline void AddCharNoDecomposition(wchar32 c, T& out) { + TCombining cc = DecompositionCombining(c); + if (cc == 0) { + SortBuffer(); + Buffer.push_back(TBuffer::value_type(c, cc)); + + Compositor.DoComposition(Buffer); + + if (Buffer.size() > 1) { + Write(Buffer.begin(), Buffer.end() - 1, out); + Buffer.erase(Buffer.begin(), Buffer.end() - 1); // TODO I don't like this + } + } else { + Buffer.push_back(TBuffer::value_type(c, cc)); + } + } + + template <class T> + inline void AddChar(wchar32 c, T& out) { + const wchar32* decompBegin = Decompositor.Decomposition(c); + if (decompBegin) { + while (*decompBegin) { Y_ASSERT(Decompositor.Decomposition(*decompBegin) == nullptr); - AddCharNoDecomposition(*(decompBegin++), out); - } - return; - } else { - AddCharNoDecomposition(c, out); - } - } - + AddCharNoDecomposition(*(decompBegin++), out); + } + return; + } else { + AddCharNoDecomposition(c, out); + } + } + template <class T, typename TCharType> inline void DoNormalize(const TCharType* begin, const TCharType* end, T& out) { - Buffer.clear(); - + Buffer.clear(); + for (const TCharType* i = begin; i != end;) { - AddChar(ReadSymbolAndAdvance(i, end), out); - } - - SortBuffer(); - Compositor.DoComposition(Buffer); - Write(Buffer.begin(), Buffer.end(), out); - } - - public: - TNormalizer() + AddChar(ReadSymbolAndAdvance(i, end), out); + } + + SortBuffer(); + Compositor.DoComposition(Buffer); + Write(Buffer.begin(), Buffer.end(), out); + } + + public: + TNormalizer() : Decompositor(*Singleton<NPrivate::TStandartDecompositor<IsCompat>>()) { } - - TNormalizer(const TDecompositor& decompositor) - : Decompositor(decompositor) + + TNormalizer(const TDecompositor& decompositor) + : Decompositor(decompositor) { } - + template <class T, typename TCharType> inline void Normalize(const TCharType* begin, const TCharType* end, T& out) { - if (NPrivate::Normalized<Norm>(begin, end)) { + if (NPrivate::Normalized<Norm>(begin, end)) { for (const TCharType* i = begin; i != end; ++i) { - WriteSymbol(*i, out); - } - } else { - DoNormalize(begin, end, out); - } - } - + WriteSymbol(*i, out); + } + } else { + DoNormalize(begin, end, out); + } + } + template <typename TCharType> inline void Normalize(const TCharType* begin, const TCharType* end, TUtf32String& out) { if (NPrivate::Normalized<Norm>(begin, end)) { @@ -337,46 +337,46 @@ namespace NUnicode { template <class T, typename TCharType> inline void Normalize(const TCharType* begin, size_t len, T& out) { - return Normalize(begin, begin + len, out); - } - + return Normalize(begin, begin + len, out); + } + template <typename TCharType> inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& src) { - if (NPrivate::Normalized<Norm>(src.begin(), src.end())) { - // nothing to normalize - return src; - } else { + if (NPrivate::Normalized<Norm>(src.begin(), src.end())) { + // nothing to normalize + return src; + } else { TBasicString<TCharType> res; - res.reserve(src.length()); - DoNormalize(src.begin(), src.end(), res); - return res; - } - } - }; + res.reserve(src.length()); + DoNormalize(src.begin(), src.end(), res); + return res; + } + } + }; } - + //! decompose utf16 or utf32 string to any container supporting push_back or to T* template <NUnicode::ENormalization Norm, class T, typename TCharType> inline void Normalize(const TCharType* begin, size_t len, T& out) { ::NUnicode::TNormalizer<Norm> dec; - dec.Normalize(begin, len, out); -} - + dec.Normalize(begin, len, out); +} + template <NUnicode::ENormalization N, typename TCharType> inline TBasicString<TCharType> Normalize(const TCharType* str, size_t len) { TBasicString<TCharType> res; - res.reserve(len); - - Normalize<N>(str, len, res); - - return res; -} - + res.reserve(len); + + Normalize<N>(str, len, res); + + return res; +} + template <NUnicode::ENormalization N, typename TCharType> inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& str) { ::NUnicode::TNormalizer<N> dec; - return dec.Normalize(str); -} + return dec.Normalize(str); +} template <NUnicode::ENormalization N, typename TCharType> inline TBasicString<TCharType> Normalize(const TBasicStringBuf<TCharType> str) { |