diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/levenshtein_diff | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils/levenshtein_diff')
5 files changed, 405 insertions, 0 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp new file mode 100644 index 0000000000..8883d7df07 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp @@ -0,0 +1 @@ +#include "levenshtein_diff.h" diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h new file mode 100644 index 0000000000..8a240bfed8 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h @@ -0,0 +1,192 @@ +#pragma once + +#include <util/draft/matrix.h> +#include <util/generic/algorithm.h> +#include <util/generic/vector.h> +#include <util/system/yassert.h> + +#include <type_traits> +#include <utility> + +namespace NLevenshtein { + enum EEditMoveType { + EMT_SPECIAL, + EMT_PRESERVE, + EMT_REPLACE, + EMT_DELETE, + EMT_INSERT + }; + + inline bool IsImportantEditMove(EEditMoveType p) { + return (p != EMT_SPECIAL && p != EMT_PRESERVE); + } + + inline void MakeMove(EEditMoveType t, int& p1, int& p2) { + switch (t) { + case EMT_PRESERVE: + case EMT_REPLACE: + p1++; + p2++; + break; + case EMT_DELETE: + p1++; + break; + case EMT_INSERT: + p2++; + break; + default: + break; + } + } + + using TEditChain = TVector<EEditMoveType>; + + template <typename TArgType> + struct TWeightOneUnaryGetter { + int operator()(const TArgType&) const { + return 1; + } + }; + + template <typename TArgType> + struct TWeightOneBinaryGetter { + int operator()(const TArgType&, const TArgType&) const { + return 1; + } + }; + + template <typename TStringType> + using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>; + + /// Finds sequence of "edit moves" for two strings + template <class TStringType, class TWeightType = int, + class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>, + class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>, + class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>> + > + void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr, + const TReplaceWeigher& replaceWeigher = TReplaceWeigher(), + const TDeleteWeigher& deleteWeigher = TDeleteWeigher(), + const TInsertWeigher& insertWeigher = TInsertWeigher()) + { + int l1 = (int)str1.size(); + int l2 = (int)str2.size(); + + TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1]) + ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point + for (int i = 1; i <= l1; i++) { + ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE); + } + for (int i = 1; i <= l2; i++) { + ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT); + } + // Here goes basic Levestein's algorithm + for (int i = 1; i <= l1; i++) { + for (int j = 1; j <= l2; j++) { + if (str1[i - 1] == str2[j - 1]) { + ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE); + } else { + const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]); + Y_ASSERT(replaceWeight >= 0); + ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE); + } + + if (ma[i][j].first > ma[i - 1][j].first) { + const TWeightType deleteWeight = deleteWeigher(str1[i - 1]); + Y_ASSERT(deleteWeight >= 0); + const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight; + if (deletePathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE); + } + } + + if (ma[i][j].first > ma[i][j - 1].first) { + const TWeightType insertWeight = insertWeigher(str2[j - 1]); + Y_ASSERT(insertWeight >= 0); + const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight; + if (insertPathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT); + } + } + } + } + // Tracing the path from final point + res.clear(); + res.reserve(Max<size_t>(l1, l2)); + for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) { + res.push_back(ma[i][j].second); + switch (ma[i][j].second) { + case EMT_PRESERVE: + case EMT_REPLACE: + --i; + --j; + break; + case EMT_DELETE: + --i; + break; + case EMT_INSERT: + --j; + break; + default: + // TODO: throw exception + break; + } + } + std::reverse(res.begin(), res.end()); + + if (weight != nullptr) { + *weight = ma[l1][l2].first; + } + } + + template <class TStringType> + size_t Distance(const TStringType& str1, const TStringType& str2) { + TEditChain editChain; + GetEditChain(str1, str2, editChain); + size_t result = 0; + for (auto edit : editChain) { + if (IsImportantEditMove(edit)) + result++; + } + return result; + } + + /// Calculates substrings to be replaced for str1->str2 transformation + struct TReplacement { + int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength; + TReplacement() + : CorrectOffset(0) + , CorrectLength(0) + , MisspelledOffset(0) + , MisspelledLength(0) + { + } + TReplacement(int correctOffset, int correctLength, int misspelledOffset, int misspelledLength) + : CorrectOffset(correctOffset) + , CorrectLength(correctLength) + , MisspelledOffset(misspelledOffset) + , MisspelledLength(misspelledLength) + { + } + }; + + template <class TStringType> + void GetStringReplacements(const TStringType& str1, const TStringType& str2, TVector<TReplacement>& res) { + TEditChain editChain; + GetEditChain(str1, str2, editChain); + editChain.push_back(EMT_SPECIAL); + int c1 = 0, c2 = 0; + res.clear(); + for (TEditChain::const_iterator it = editChain.begin(); it != editChain.end(); it++) { + if (IsImportantEditMove(*it)) { + int sc1 = c1, sc2 = c2; + do { + MakeMove(*it, c1, c2); + ++it; + } while (IsImportantEditMove(*it)); + res.push_back(TReplacement(sc1, c1 - sc1, sc2, c2 - sc2)); + } + MakeMove(*it, c1, c2); + } + } +} diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp new file mode 100644 index 0000000000..cf0f78637f --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp @@ -0,0 +1,190 @@ +#include "levenshtein_diff.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/string.h> + +namespace { + + float unaryZeroWeigher(const char&) { + return 0.0f; + }; + + float unaryMaxWeigher(const char&) { + return 1.0f; + }; + + float binaryZeroWeigher(const char&, const char&) { + return 0.0f; + }; + + float binaryMaxWeigher(const char&, const char&) { + return 1.0f; + }; + +} + +Y_UNIT_TEST_SUITE(Levenstein) { + Y_UNIT_TEST(Distance) { + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3); + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2); + } +} + +Y_UNIT_TEST_SUITE(WeightedLevenstein) { + Y_UNIT_TEST(EqualStrings) { + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("12345"), TString("12345"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(EmptyStrings) { + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString(""), TString(""), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 0); + } + + Y_UNIT_TEST(InsertsOnly) { + auto unaryWeigher = [](const char&) { + return 2.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString(""), TString("12345"), chain, &distance, binaryZeroWeigher, unaryZeroWeigher, unaryWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 10.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(DeletionsOnly) { + auto unaryWeigher = [](const char&) { + return 3.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("54321"), TString(""), chain, &distance, binaryZeroWeigher, unaryWeigher, unaryZeroWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 15.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(SymmetryCheck) { + const TString str1 = "123x5"; + const TString str2 = "x2345"; + const float trgDistance = 2.0f; + const size_t trgChainLen = 5; + + NLevenshtein::TEditChain chainLeftRight; + float distanceLeftRight = 0.0f; + NLevenshtein::GetEditChain(str1, str2, chainLeftRight, &distanceLeftRight, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distanceLeftRight, trgDistance); + UNIT_ASSERT_VALUES_EQUAL(chainLeftRight.size(), trgChainLen); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[0]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[2]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + + NLevenshtein::TEditChain chainRightLeft; + float distanceRightLeft = 0.0f; + NLevenshtein::GetEditChain(str2, str1, chainRightLeft, &distanceRightLeft, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distanceRightLeft, trgDistance); + UNIT_ASSERT_VALUES_EQUAL(chainRightLeft.size(), trgChainLen); + UNIT_ASSERT(chainRightLeft == chainLeftRight); + } + + Y_UNIT_TEST(PreferReplacements) { + auto binaryWeigher = [](const char&, const char&) { + return 0.0625f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("54321"), TString("43210"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 0.3125f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(PreferInsertDeletions) { + auto unaryWeigher = [](const char&) { + return 0.0625f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("54321"), TString("98765"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 0.5f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 9); + } + + Y_UNIT_TEST(NoXDeletions) { + auto unaryWeigher = [](const char& c) { + return c == 'x' ? 100.0f : 1.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("543x1"), TString("5431"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE)); + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoXInsertions) { + auto unaryWeigher = [](const char& c) { + return c == 'x' ? 100.0f : 1.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("5431"), TString("543x1"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_INSERT)); + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoReplacementsOfX) { + auto binaryWeigher = [](const char& l, const char&) { + return l == 'x' ? 100.0f : 1.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("5432x"), TString("5432y"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_INSERT)); + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoReplacementsForX) { + auto binaryWeigher = [](const char&, const char& r) { + return r == 'x' ? 100.0f : 1.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("y4321"), TString("x4321"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_DELETE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_INSERT)); + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(SimilarOperationPriorities) { + auto replaceWeigher = [](const char&, const char&) { + return 0.5f; + }; + auto deleteWeigher = [](const char&) { + return 0.2f; + }; + auto insertWeigher = [](const char&) { + return 0.9f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("y0"), TString("0x"), chain, &distance, replaceWeigher, deleteWeigher, insertWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 2); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f); + } +} diff --git a/library/cpp/string_utils/levenshtein_diff/ut/ya.make b/library/cpp/string_utils/levenshtein_diff/ut/ya.make new file mode 100644 index 0000000000..a3b9b8fea5 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/string_utils/levenshtein_diff) + +OWNER(myltsev) + +SRCS( + levenshtein_diff_ut.cpp +) + +END() diff --git a/library/cpp/string_utils/levenshtein_diff/ya.make b/library/cpp/string_utils/levenshtein_diff/ya.make new file mode 100644 index 0000000000..bafefe5365 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +OWNER(g:mt) + +SRCS( + levenshtein_diff.cpp +) + +PEERDIR( + util/draft +) + +END() |