diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h')
-rw-r--r-- | library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h | 192 |
1 files changed, 192 insertions, 0 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h new file mode 100644 index 0000000000..8a240bfed8 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h @@ -0,0 +1,192 @@ +#pragma once + +#include <util/draft/matrix.h> +#include <util/generic/algorithm.h> +#include <util/generic/vector.h> +#include <util/system/yassert.h> + +#include <type_traits> +#include <utility> + +namespace NLevenshtein { + enum EEditMoveType { + EMT_SPECIAL, + EMT_PRESERVE, + EMT_REPLACE, + EMT_DELETE, + EMT_INSERT + }; + + inline bool IsImportantEditMove(EEditMoveType p) { + return (p != EMT_SPECIAL && p != EMT_PRESERVE); + } + + inline void MakeMove(EEditMoveType t, int& p1, int& p2) { + switch (t) { + case EMT_PRESERVE: + case EMT_REPLACE: + p1++; + p2++; + break; + case EMT_DELETE: + p1++; + break; + case EMT_INSERT: + p2++; + break; + default: + break; + } + } + + using TEditChain = TVector<EEditMoveType>; + + template <typename TArgType> + struct TWeightOneUnaryGetter { + int operator()(const TArgType&) const { + return 1; + } + }; + + template <typename TArgType> + struct TWeightOneBinaryGetter { + int operator()(const TArgType&, const TArgType&) const { + return 1; + } + }; + + template <typename TStringType> + using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>; + + /// Finds sequence of "edit moves" for two strings + template <class TStringType, class TWeightType = int, + class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>, + class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>, + class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>> + > + void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr, + const TReplaceWeigher& replaceWeigher = TReplaceWeigher(), + const TDeleteWeigher& deleteWeigher = TDeleteWeigher(), + const TInsertWeigher& insertWeigher = TInsertWeigher()) + { + int l1 = (int)str1.size(); + int l2 = (int)str2.size(); + + TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1]) + ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point + for (int i = 1; i <= l1; i++) { + ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE); + } + for (int i = 1; i <= l2; i++) { + ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT); + } + // Here goes basic Levestein's algorithm + for (int i = 1; i <= l1; i++) { + for (int j = 1; j <= l2; j++) { + if (str1[i - 1] == str2[j - 1]) { + ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE); + } else { + const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]); + Y_ASSERT(replaceWeight >= 0); + ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE); + } + + if (ma[i][j].first > ma[i - 1][j].first) { + const TWeightType deleteWeight = deleteWeigher(str1[i - 1]); + Y_ASSERT(deleteWeight >= 0); + const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight; + if (deletePathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE); + } + } + + if (ma[i][j].first > ma[i][j - 1].first) { + const TWeightType insertWeight = insertWeigher(str2[j - 1]); + Y_ASSERT(insertWeight >= 0); + const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight; + if (insertPathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT); + } + } + } + } + // Tracing the path from final point + res.clear(); + res.reserve(Max<size_t>(l1, l2)); + for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) { + res.push_back(ma[i][j].second); + switch (ma[i][j].second) { + case EMT_PRESERVE: + case EMT_REPLACE: + --i; + --j; + break; + case EMT_DELETE: + --i; + break; + case EMT_INSERT: + --j; + break; + default: + // TODO: throw exception + break; + } + } + std::reverse(res.begin(), res.end()); + + if (weight != nullptr) { + *weight = ma[l1][l2].first; + } + } + + template <class TStringType> + size_t Distance(const TStringType& str1, const TStringType& str2) { + TEditChain editChain; + GetEditChain(str1, str2, editChain); + size_t result = 0; + for (auto edit : editChain) { + if (IsImportantEditMove(edit)) + result++; + } + return result; + } + + /// Calculates substrings to be replaced for str1->str2 transformation + struct TReplacement { + int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength; + TReplacement() + : CorrectOffset(0) + , CorrectLength(0) + , MisspelledOffset(0) + , MisspelledLength(0) + { + } + TReplacement(int correctOffset, int correctLength, int misspelledOffset, int misspelledLength) + : CorrectOffset(correctOffset) + , CorrectLength(correctLength) + , MisspelledOffset(misspelledOffset) + , MisspelledLength(misspelledLength) + { + } + }; + + template <class TStringType> + void GetStringReplacements(const TStringType& str1, const TStringType& str2, TVector<TReplacement>& res) { + TEditChain editChain; + GetEditChain(str1, str2, editChain); + editChain.push_back(EMT_SPECIAL); + int c1 = 0, c2 = 0; + res.clear(); + for (TEditChain::const_iterator it = editChain.begin(); it != editChain.end(); it++) { + if (IsImportantEditMove(*it)) { + int sc1 = c1, sc2 = c2; + do { + MakeMove(*it, c1, c2); + ++it; + } while (IsImportantEditMove(*it)); + res.push_back(TReplacement(sc1, c1 - sc1, sc2, c2 - sc2)); + } + MakeMove(*it, c1, c2); + } + } +} |