aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/string_utils/levenshtein_diff
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/levenshtein_diff
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils/levenshtein_diff')
-rw-r--r--library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp1
-rw-r--r--library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h192
-rw-r--r--library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp190
-rw-r--r--library/cpp/string_utils/levenshtein_diff/ut/ya.make9
-rw-r--r--library/cpp/string_utils/levenshtein_diff/ya.make13
5 files changed, 405 insertions, 0 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp
new file mode 100644
index 0000000000..8883d7df07
--- /dev/null
+++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp
@@ -0,0 +1 @@
+#include "levenshtein_diff.h"
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
new file mode 100644
index 0000000000..8a240bfed8
--- /dev/null
+++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
@@ -0,0 +1,192 @@
+#pragma once
+
+#include <util/draft/matrix.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/vector.h>
+#include <util/system/yassert.h>
+
+#include <type_traits>
+#include <utility>
+
+namespace NLevenshtein {
+ enum EEditMoveType {
+ EMT_SPECIAL,
+ EMT_PRESERVE,
+ EMT_REPLACE,
+ EMT_DELETE,
+ EMT_INSERT
+ };
+
+ inline bool IsImportantEditMove(EEditMoveType p) {
+ return (p != EMT_SPECIAL && p != EMT_PRESERVE);
+ }
+
+ inline void MakeMove(EEditMoveType t, int& p1, int& p2) {
+ switch (t) {
+ case EMT_PRESERVE:
+ case EMT_REPLACE:
+ p1++;
+ p2++;
+ break;
+ case EMT_DELETE:
+ p1++;
+ break;
+ case EMT_INSERT:
+ p2++;
+ break;
+ default:
+ break;
+ }
+ }
+
+ using TEditChain = TVector<EEditMoveType>;
+
+ template <typename TArgType>
+ struct TWeightOneUnaryGetter {
+ int operator()(const TArgType&) const {
+ return 1;
+ }
+ };
+
+ template <typename TArgType>
+ struct TWeightOneBinaryGetter {
+ int operator()(const TArgType&, const TArgType&) const {
+ return 1;
+ }
+ };
+
+ template <typename TStringType>
+ using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>;
+
+ /// Finds sequence of "edit moves" for two strings
+ template <class TStringType, class TWeightType = int,
+ class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>,
+ class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>,
+ class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>
+ >
+ void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr,
+ const TReplaceWeigher& replaceWeigher = TReplaceWeigher(),
+ const TDeleteWeigher& deleteWeigher = TDeleteWeigher(),
+ const TInsertWeigher& insertWeigher = TInsertWeigher())
+ {
+ int l1 = (int)str1.size();
+ int l2 = (int)str2.size();
+
+ TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1])
+ ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point
+ for (int i = 1; i <= l1; i++) {
+ ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE);
+ }
+ for (int i = 1; i <= l2; i++) {
+ ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT);
+ }
+ // Here goes basic Levestein's algorithm
+ for (int i = 1; i <= l1; i++) {
+ for (int j = 1; j <= l2; j++) {
+ if (str1[i - 1] == str2[j - 1]) {
+ ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE);
+ } else {
+ const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]);
+ Y_ASSERT(replaceWeight >= 0);
+ ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE);
+ }
+
+ if (ma[i][j].first > ma[i - 1][j].first) {
+ const TWeightType deleteWeight = deleteWeigher(str1[i - 1]);
+ Y_ASSERT(deleteWeight >= 0);
+ const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight;
+ if (deletePathWeight <= ma[i][j].first) {
+ ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE);
+ }
+ }
+
+ if (ma[i][j].first > ma[i][j - 1].first) {
+ const TWeightType insertWeight = insertWeigher(str2[j - 1]);
+ Y_ASSERT(insertWeight >= 0);
+ const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight;
+ if (insertPathWeight <= ma[i][j].first) {
+ ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT);
+ }
+ }
+ }
+ }
+ // Tracing the path from final point
+ res.clear();
+ res.reserve(Max<size_t>(l1, l2));
+ for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) {
+ res.push_back(ma[i][j].second);
+ switch (ma[i][j].second) {
+ case EMT_PRESERVE:
+ case EMT_REPLACE:
+ --i;
+ --j;
+ break;
+ case EMT_DELETE:
+ --i;
+ break;
+ case EMT_INSERT:
+ --j;
+ break;
+ default:
+ // TODO: throw exception
+ break;
+ }
+ }
+ std::reverse(res.begin(), res.end());
+
+ if (weight != nullptr) {
+ *weight = ma[l1][l2].first;
+ }
+ }
+
+ template <class TStringType>
+ size_t Distance(const TStringType& str1, const TStringType& str2) {
+ TEditChain editChain;
+ GetEditChain(str1, str2, editChain);
+ size_t result = 0;
+ for (auto edit : editChain) {
+ if (IsImportantEditMove(edit))
+ result++;
+ }
+ return result;
+ }
+
+ /// Calculates substrings to be replaced for str1->str2 transformation
+ struct TReplacement {
+ int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength;
+ TReplacement()
+ : CorrectOffset(0)
+ , CorrectLength(0)
+ , MisspelledOffset(0)
+ , MisspelledLength(0)
+ {
+ }
+ TReplacement(int correctOffset, int correctLength, int misspelledOffset, int misspelledLength)
+ : CorrectOffset(correctOffset)
+ , CorrectLength(correctLength)
+ , MisspelledOffset(misspelledOffset)
+ , MisspelledLength(misspelledLength)
+ {
+ }
+ };
+
+ template <class TStringType>
+ void GetStringReplacements(const TStringType& str1, const TStringType& str2, TVector<TReplacement>& res) {
+ TEditChain editChain;
+ GetEditChain(str1, str2, editChain);
+ editChain.push_back(EMT_SPECIAL);
+ int c1 = 0, c2 = 0;
+ res.clear();
+ for (TEditChain::const_iterator it = editChain.begin(); it != editChain.end(); it++) {
+ if (IsImportantEditMove(*it)) {
+ int sc1 = c1, sc2 = c2;
+ do {
+ MakeMove(*it, c1, c2);
+ ++it;
+ } while (IsImportantEditMove(*it));
+ res.push_back(TReplacement(sc1, c1 - sc1, sc2, c2 - sc2));
+ }
+ MakeMove(*it, c1, c2);
+ }
+ }
+}
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp
new file mode 100644
index 0000000000..cf0f78637f
--- /dev/null
+++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp
@@ -0,0 +1,190 @@
+#include "levenshtein_diff.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/string.h>
+
+namespace {
+
+ float unaryZeroWeigher(const char&) {
+ return 0.0f;
+ };
+
+ float unaryMaxWeigher(const char&) {
+ return 1.0f;
+ };
+
+ float binaryZeroWeigher(const char&, const char&) {
+ return 0.0f;
+ };
+
+ float binaryMaxWeigher(const char&, const char&) {
+ return 1.0f;
+ };
+
+}
+
+Y_UNIT_TEST_SUITE(Levenstein) {
+ Y_UNIT_TEST(Distance) {
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3);
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2);
+ }
+}
+
+Y_UNIT_TEST_SUITE(WeightedLevenstein) {
+ Y_UNIT_TEST(EqualStrings) {
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("12345"), TString("12345"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
+ }
+
+ Y_UNIT_TEST(EmptyStrings) {
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString(""), TString(""), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 0);
+ }
+
+ Y_UNIT_TEST(InsertsOnly) {
+ auto unaryWeigher = [](const char&) {
+ return 2.0f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString(""), TString("12345"), chain, &distance, binaryZeroWeigher, unaryZeroWeigher, unaryWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(distance, 10.0f);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
+ }
+
+ Y_UNIT_TEST(DeletionsOnly) {
+ auto unaryWeigher = [](const char&) {
+ return 3.0f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("54321"), TString(""), chain, &distance, binaryZeroWeigher, unaryWeigher, unaryZeroWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(distance, 15.0f);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
+ }
+
+ Y_UNIT_TEST(SymmetryCheck) {
+ const TString str1 = "123x5";
+ const TString str2 = "x2345";
+ const float trgDistance = 2.0f;
+ const size_t trgChainLen = 5;
+
+ NLevenshtein::TEditChain chainLeftRight;
+ float distanceLeftRight = 0.0f;
+ NLevenshtein::GetEditChain(str1, str2, chainLeftRight, &distanceLeftRight, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(distanceLeftRight, trgDistance);
+ UNIT_ASSERT_VALUES_EQUAL(chainLeftRight.size(), trgChainLen);
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[0]), static_cast<int>(NLevenshtein::EMT_REPLACE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[2]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
+
+ NLevenshtein::TEditChain chainRightLeft;
+ float distanceRightLeft = 0.0f;
+ NLevenshtein::GetEditChain(str2, str1, chainRightLeft, &distanceRightLeft, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(distanceRightLeft, trgDistance);
+ UNIT_ASSERT_VALUES_EQUAL(chainRightLeft.size(), trgChainLen);
+ UNIT_ASSERT(chainRightLeft == chainLeftRight);
+ }
+
+ Y_UNIT_TEST(PreferReplacements) {
+ auto binaryWeigher = [](const char&, const char&) {
+ return 0.0625f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("54321"), TString("43210"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(distance, 0.3125f);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
+ }
+
+ Y_UNIT_TEST(PreferInsertDeletions) {
+ auto unaryWeigher = [](const char&) {
+ return 0.0625f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("54321"), TString("98765"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(distance, 0.5f);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 9);
+ }
+
+ Y_UNIT_TEST(NoXDeletions) {
+ auto unaryWeigher = [](const char& c) {
+ return c == 'x' ? 100.0f : 1.0f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("543x1"), TString("5431"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryMaxWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE));
+ UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
+ }
+
+ Y_UNIT_TEST(NoXInsertions) {
+ auto unaryWeigher = [](const char& c) {
+ return c == 'x' ? 100.0f : 1.0f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("5431"), TString("543x1"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_INSERT));
+ UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
+ }
+
+ Y_UNIT_TEST(NoReplacementsOfX) {
+ auto binaryWeigher = [](const char& l, const char&) {
+ return l == 'x' ? 100.0f : 1.0f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("5432x"), TString("5432y"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_INSERT));
+ UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
+ }
+
+ Y_UNIT_TEST(NoReplacementsForX) {
+ auto binaryWeigher = [](const char&, const char& r) {
+ return r == 'x' ? 100.0f : 1.0f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("y4321"), TString("x4321"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_DELETE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_INSERT));
+ UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
+ }
+
+ Y_UNIT_TEST(SimilarOperationPriorities) {
+ auto replaceWeigher = [](const char&, const char&) {
+ return 0.5f;
+ };
+ auto deleteWeigher = [](const char&) {
+ return 0.2f;
+ };
+ auto insertWeigher = [](const char&) {
+ return 0.9f;
+ };
+ NLevenshtein::TEditChain chain;
+ float distance = 0.0f;
+ NLevenshtein::GetEditChain(TString("y0"), TString("0x"), chain, &distance, replaceWeigher, deleteWeigher, insertWeigher);
+ UNIT_ASSERT_VALUES_EQUAL(chain.size(), 2);
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_REPLACE));
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_REPLACE));
+ UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f);
+ }
+}
diff --git a/library/cpp/string_utils/levenshtein_diff/ut/ya.make b/library/cpp/string_utils/levenshtein_diff/ut/ya.make
new file mode 100644
index 0000000000..a3b9b8fea5
--- /dev/null
+++ b/library/cpp/string_utils/levenshtein_diff/ut/ya.make
@@ -0,0 +1,9 @@
+UNITTEST_FOR(library/cpp/string_utils/levenshtein_diff)
+
+OWNER(myltsev)
+
+SRCS(
+ levenshtein_diff_ut.cpp
+)
+
+END()
diff --git a/library/cpp/string_utils/levenshtein_diff/ya.make b/library/cpp/string_utils/levenshtein_diff/ya.make
new file mode 100644
index 0000000000..bafefe5365
--- /dev/null
+++ b/library/cpp/string_utils/levenshtein_diff/ya.make
@@ -0,0 +1,13 @@
+LIBRARY()
+
+OWNER(g:mt)
+
+SRCS(
+ levenshtein_diff.cpp
+)
+
+PEERDIR(
+ util/draft
+)
+
+END()