diff options
author | ilikepugs <ilikepugs@yandex-team.com> | 2023-01-31 10:07:58 +0300 |
---|---|---|
committer | ilikepugs <ilikepugs@yandex-team.com> | 2023-01-31 10:07:58 +0300 |
commit | 72dd5424aab89989d74273b6ab089b4d43156eca (patch) | |
tree | 38d0b224f9b96782f29601d4b62a6c762edb4154 | |
parent | 84d8b69eb78329102350117cecd3789f6b0a76bc (diff) | |
download | ydb-72dd5424aab89989d74273b6ab089b4d43156eca.tar.gz |
Implement Damerau-Levenshtein distance
-rw-r--r-- | library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h | 104 | ||||
-rw-r--r-- | library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp | 20 |
2 files changed, 104 insertions, 20 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h index 8a240bfed8..2184eb85db 100644 --- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h @@ -14,7 +14,8 @@ namespace NLevenshtein { EMT_PRESERVE, EMT_REPLACE, EMT_DELETE, - EMT_INSERT + EMT_INSERT, + EMT_TRANSPOSE }; inline bool IsImportantEditMove(EEditMoveType p) { @@ -23,6 +24,10 @@ namespace NLevenshtein { inline void MakeMove(EEditMoveType t, int& p1, int& p2) { switch (t) { + case EMT_TRANSPOSE: + p1 += 2; + p2 += 2; + break; case EMT_PRESERVE: case EMT_REPLACE: p1++; @@ -41,33 +46,42 @@ namespace NLevenshtein { using TEditChain = TVector<EEditMoveType>; - template <typename TArgType> + template <typename TArgType, typename TWeightType = int> struct TWeightOneUnaryGetter { - int operator()(const TArgType&) const { + TWeightType operator()(const TArgType&) const { return 1; } }; - template <typename TArgType> + template <typename TArgType, typename TWeightType = int> struct TWeightOneBinaryGetter { - int operator()(const TArgType&, const TArgType&) const { + TWeightType operator()(const TArgType&, const TArgType&) const { return 1; } }; + template <typename TArgType, typename TWeightType = int> + struct TWeightInfBinaryGetter { + TWeightType operator()(const TArgType&, const TArgType&) const { + return std::numeric_limits<TWeightType>::max(); + } + }; + template <typename TStringType> using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>; /// Finds sequence of "edit moves" for two strings template <class TStringType, class TWeightType = int, - class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>, - class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>, - class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>> + class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>, TWeightType>, + class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>, TWeightType>, + class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>, TWeightType>, + class TTransposeWeigher = TWeightInfBinaryGetter<TCharType<TStringType>, TWeightType> > void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr, const TReplaceWeigher& replaceWeigher = TReplaceWeigher(), const TDeleteWeigher& deleteWeigher = TDeleteWeigher(), - const TInsertWeigher& insertWeigher = TInsertWeigher()) + const TInsertWeigher& insertWeigher = TInsertWeigher(), + const TTransposeWeigher& transposeWeigher = TTransposeWeigher()) { int l1 = (int)str1.size(); int l2 = (int)str2.size(); @@ -80,7 +94,8 @@ namespace NLevenshtein { for (int i = 1; i <= l2; i++) { ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT); } - // Here goes basic Levestein's algorithm + const TWeightType maxWeight = std::numeric_limits<TWeightType>::max(); + // Here goes basic Damerau-Levestein's algorithm for (int i = 1; i <= l1; i++) { for (int j = 1; j <= l2; j++) { if (str1[i - 1] == str2[j - 1]) { @@ -88,24 +103,43 @@ namespace NLevenshtein { } else { const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]); Y_ASSERT(replaceWeight >= 0); - ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE); + if (replaceWeight < maxWeight) { + ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE); + } else { + ma[i][j] = std::make_pair(replaceWeight, EMT_REPLACE); + } } if (ma[i][j].first > ma[i - 1][j].first) { const TWeightType deleteWeight = deleteWeigher(str1[i - 1]); Y_ASSERT(deleteWeight >= 0); - const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight; - if (deletePathWeight <= ma[i][j].first) { - ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE); + if (deleteWeight < maxWeight) { + const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight; + if (deletePathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE); + } } } if (ma[i][j].first > ma[i][j - 1].first) { const TWeightType insertWeight = insertWeigher(str2[j - 1]); Y_ASSERT(insertWeight >= 0); - const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight; - if (insertPathWeight <= ma[i][j].first) { - ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT); + if (insertWeight < maxWeight) { + const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight; + if (insertPathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT); + } + } + } + + if (i > 1 && j > 1 && str1[i - 1] == str2[j - 2] && str1[i - 2] == str2[j - 1]) { + const TWeightType transposeWeight = transposeWeigher(str1[i - 2], str2[j - 2]); + Y_ASSERT(transposeWeight >= 0); + if (transposeWeight < maxWeight) { + const TWeightType transposePathWeight = ma[i - 2][j - 2].first + transposeWeight; + if (transposePathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(transposePathWeight, EMT_TRANSPOSE); + } } } } @@ -116,6 +150,10 @@ namespace NLevenshtein { for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) { res.push_back(ma[i][j].second); switch (ma[i][j].second) { + case EMT_TRANSPOSE: + i -= 2; + j -= 2; + break; case EMT_PRESERVE: case EMT_REPLACE: --i; @@ -139,10 +177,26 @@ namespace NLevenshtein { } } - template <class TStringType> - size_t Distance(const TStringType& str1, const TStringType& str2) { + template <class TStringType, bool Damerau = false, class TWeightType = int> + void GetEditChainGeneric(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr) { + typedef TCharType<TStringType> TArgType; + GetEditChain< + TStringType, TWeightType, + TWeightOneBinaryGetter<TArgType, TWeightType>, + TWeightOneUnaryGetter<TArgType, TWeightType>, + TWeightOneUnaryGetter<TArgType, TWeightType>, + std::conditional_t< + Damerau, + TWeightOneBinaryGetter<TArgType, TWeightType>, + TWeightInfBinaryGetter<TArgType, TWeightType> + > + >(str1, str2, res, weight); + } + + template <class TStringType, bool Damerau = false> + size_t DistanceImpl(const TStringType& str1, const TStringType& str2) { TEditChain editChain; - GetEditChain(str1, str2, editChain); + GetEditChainGeneric<TStringType, Damerau>(str1, str2, editChain); size_t result = 0; for (auto edit : editChain) { if (IsImportantEditMove(edit)) @@ -151,6 +205,16 @@ namespace NLevenshtein { return result; } + template <class TStringType> + size_t Distance(const TStringType& str1, const TStringType& str2) { + return DistanceImpl<TStringType, false>(str1, str2); + } + + template <class TStringType> + size_t DamerauDistance(const TStringType& str1, const TStringType& str2) { + return DistanceImpl<TStringType, true>(str1, str2); + } + /// Calculates substrings to be replaced for str1->str2 transformation struct TReplacement { int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength; diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp index cf0f78637f..570a7f5c13 100644 --- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp @@ -187,4 +187,24 @@ Y_UNIT_TEST_SUITE(WeightedLevenstein) { UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_REPLACE)); UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f); } + + Y_UNIT_TEST(DamerauLevenshtein) { + int distance; + NLevenshtein::TEditChain chain; + NLevenshtein::GetEditChainGeneric<TString, true>(TString("status"), TString("tsatus"), chain, &distance); + UNIT_ASSERT_VALUES_EQUAL(distance, 1); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_TRANSPOSE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + + NLevenshtein::GetEditChainGeneric<TString, true>(TString("hello"), TString("heh lol"), chain, &distance); + UNIT_ASSERT_VALUES_EQUAL(distance, 3); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[2]), static_cast<int>(NLevenshtein::EMT_INSERT)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_INSERT)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_TRANSPOSE)); + } } |