#pragma once #include <util/draft/matrix.h> #include <util/generic/algorithm.h> #include <util/generic/vector.h> #include <util/system/yassert.h> #include <type_traits> #include <utility> namespace NLevenshtein { enum EEditMoveType { EMT_SPECIAL, EMT_PRESERVE, EMT_REPLACE, EMT_DELETE, EMT_INSERT }; inline bool IsImportantEditMove(EEditMoveType p) { return (p != EMT_SPECIAL && p != EMT_PRESERVE); } inline void MakeMove(EEditMoveType t, int& p1, int& p2) { switch (t) { case EMT_PRESERVE: case EMT_REPLACE: p1++; p2++; break; case EMT_DELETE: p1++; break; case EMT_INSERT: p2++; break; default: break; } } using TEditChain = TVector<EEditMoveType>; template <typename TArgType> struct TWeightOneUnaryGetter { int operator()(const TArgType&) const { return 1; } }; template <typename TArgType> struct TWeightOneBinaryGetter { int operator()(const TArgType&, const TArgType&) const { return 1; } }; template <typename TStringType> using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>; /// Finds sequence of "edit moves" for two strings template <class TStringType, class TWeightType = int, class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>, class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>, class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>> > void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr, const TReplaceWeigher& replaceWeigher = TReplaceWeigher(), const TDeleteWeigher& deleteWeigher = TDeleteWeigher(), const TInsertWeigher& insertWeigher = TInsertWeigher()) { int l1 = (int)str1.size(); int l2 = (int)str2.size(); TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1]) ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point for (int i = 1; i <= l1; i++) { ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE); } for (int i = 1; i <= l2; i++) { ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT); } // Here goes basic Levestein's algorithm for (int i = 1; i <= l1; i++) { for (int j = 1; j <= l2; j++) { if (str1[i - 1] == str2[j - 1]) { ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE); } else { const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]); Y_ASSERT(replaceWeight >= 0); ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE); } if (ma[i][j].first > ma[i - 1][j].first) { const TWeightType deleteWeight = deleteWeigher(str1[i - 1]); Y_ASSERT(deleteWeight >= 0); const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight; if (deletePathWeight <= ma[i][j].first) { ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE); } } if (ma[i][j].first > ma[i][j - 1].first) { const TWeightType insertWeight = insertWeigher(str2[j - 1]); Y_ASSERT(insertWeight >= 0); const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight; if (insertPathWeight <= ma[i][j].first) { ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT); } } } } // Tracing the path from final point res.clear(); res.reserve(Max<size_t>(l1, l2)); for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) { res.push_back(ma[i][j].second); switch (ma[i][j].second) { case EMT_PRESERVE: case EMT_REPLACE: --i; --j; break; case EMT_DELETE: --i; break; case EMT_INSERT: --j; break; default: // TODO: throw exception break; } } std::reverse(res.begin(), res.end()); if (weight != nullptr) { *weight = ma[l1][l2].first; } } template <class TStringType> size_t Distance(const TStringType& str1, const TStringType& str2) { TEditChain editChain; GetEditChain(str1, str2, editChain); size_t result = 0; for (auto edit : editChain) { if (IsImportantEditMove(edit)) result++; } return result; } /// Calculates substrings to be replaced for str1->str2 transformation struct TReplacement { int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength; TReplacement() : CorrectOffset(0) , CorrectLength(0) , MisspelledOffset(0) , MisspelledLength(0) { } TReplacement(int correctOffset, int correctLength, int misspelledOffset, int misspelledLength) : CorrectOffset(correctOffset) , CorrectLength(correctLength) , MisspelledOffset(misspelledOffset) , MisspelledLength(misspelledLength) { } }; template <class TStringType> void GetStringReplacements(const TStringType& str1, const TStringType& str2, TVector<TReplacement>& res) { TEditChain editChain; GetEditChain(str1, str2, editChain); editChain.push_back(EMT_SPECIAL); int c1 = 0, c2 = 0; res.clear(); for (TEditChain::const_iterator it = editChain.begin(); it != editChain.end(); it++) { if (IsImportantEditMove(*it)) { int sc1 = c1, sc2 = c2; do { MakeMove(*it, c1, c2); ++it; } while (IsImportantEditMove(*it)); res.push_back(TReplacement(sc1, c1 - sc1, sc2, c2 - sc2)); } MakeMove(*it, c1, c2); } } }