aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h')
-rw-r--r--library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h192
1 files changed, 192 insertions, 0 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
new file mode 100644
index 0000000000..8a240bfed8
--- /dev/null
+++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
@@ -0,0 +1,192 @@
+#pragma once
+
+#include <util/draft/matrix.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/vector.h>
+#include <util/system/yassert.h>
+
+#include <type_traits>
+#include <utility>
+
+namespace NLevenshtein {
+ enum EEditMoveType {
+ EMT_SPECIAL,
+ EMT_PRESERVE,
+ EMT_REPLACE,
+ EMT_DELETE,
+ EMT_INSERT
+ };
+
+ inline bool IsImportantEditMove(EEditMoveType p) {
+ return (p != EMT_SPECIAL && p != EMT_PRESERVE);
+ }
+
+ inline void MakeMove(EEditMoveType t, int& p1, int& p2) {
+ switch (t) {
+ case EMT_PRESERVE:
+ case EMT_REPLACE:
+ p1++;
+ p2++;
+ break;
+ case EMT_DELETE:
+ p1++;
+ break;
+ case EMT_INSERT:
+ p2++;
+ break;
+ default:
+ break;
+ }
+ }
+
+ using TEditChain = TVector<EEditMoveType>;
+
+ template <typename TArgType>
+ struct TWeightOneUnaryGetter {
+ int operator()(const TArgType&) const {
+ return 1;
+ }
+ };
+
+ template <typename TArgType>
+ struct TWeightOneBinaryGetter {
+ int operator()(const TArgType&, const TArgType&) const {
+ return 1;
+ }
+ };
+
+ template <typename TStringType>
+ using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>;
+
+ /// Finds sequence of "edit moves" for two strings
+ template <class TStringType, class TWeightType = int,
+ class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>,
+ class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>,
+ class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>
+ >
+ void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr,
+ const TReplaceWeigher& replaceWeigher = TReplaceWeigher(),
+ const TDeleteWeigher& deleteWeigher = TDeleteWeigher(),
+ const TInsertWeigher& insertWeigher = TInsertWeigher())
+ {
+ int l1 = (int)str1.size();
+ int l2 = (int)str2.size();
+
+ TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1])
+ ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point
+ for (int i = 1; i <= l1; i++) {
+ ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE);
+ }
+ for (int i = 1; i <= l2; i++) {
+ ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT);
+ }
+ // Here goes basic Levestein's algorithm
+ for (int i = 1; i <= l1; i++) {
+ for (int j = 1; j <= l2; j++) {
+ if (str1[i - 1] == str2[j - 1]) {
+ ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE);
+ } else {
+ const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]);
+ Y_ASSERT(replaceWeight >= 0);
+ ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE);
+ }
+
+ if (ma[i][j].first > ma[i - 1][j].first) {
+ const TWeightType deleteWeight = deleteWeigher(str1[i - 1]);
+ Y_ASSERT(deleteWeight >= 0);
+ const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight;
+ if (deletePathWeight <= ma[i][j].first) {
+ ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE);
+ }
+ }
+
+ if (ma[i][j].first > ma[i][j - 1].first) {
+ const TWeightType insertWeight = insertWeigher(str2[j - 1]);
+ Y_ASSERT(insertWeight >= 0);
+ const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight;
+ if (insertPathWeight <= ma[i][j].first) {
+ ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT);
+ }
+ }
+ }
+ }
+ // Tracing the path from final point
+ res.clear();
+ res.reserve(Max<size_t>(l1, l2));
+ for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) {
+ res.push_back(ma[i][j].second);
+ switch (ma[i][j].second) {
+ case EMT_PRESERVE:
+ case EMT_REPLACE:
+ --i;
+ --j;
+ break;
+ case EMT_DELETE:
+ --i;
+ break;
+ case EMT_INSERT:
+ --j;
+ break;
+ default:
+ // TODO: throw exception
+ break;
+ }
+ }
+ std::reverse(res.begin(), res.end());
+
+ if (weight != nullptr) {
+ *weight = ma[l1][l2].first;
+ }
+ }
+
+ template <class TStringType>
+ size_t Distance(const TStringType& str1, const TStringType& str2) {
+ TEditChain editChain;
+ GetEditChain(str1, str2, editChain);
+ size_t result = 0;
+ for (auto edit : editChain) {
+ if (IsImportantEditMove(edit))
+ result++;
+ }
+ return result;
+ }
+
+ /// Calculates substrings to be replaced for str1->str2 transformation
+ struct TReplacement {
+ int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength;
+ TReplacement()
+ : CorrectOffset(0)
+ , CorrectLength(0)
+ , MisspelledOffset(0)
+ , MisspelledLength(0)
+ {
+ }
+ TReplacement(int correctOffset, int correctLength, int misspelledOffset, int misspelledLength)
+ : CorrectOffset(correctOffset)
+ , CorrectLength(correctLength)
+ , MisspelledOffset(misspelledOffset)
+ , MisspelledLength(misspelledLength)
+ {
+ }
+ };
+
+ template <class TStringType>
+ void GetStringReplacements(const TStringType& str1, const TStringType& str2, TVector<TReplacement>& res) {
+ TEditChain editChain;
+ GetEditChain(str1, str2, editChain);
+ editChain.push_back(EMT_SPECIAL);
+ int c1 = 0, c2 = 0;
+ res.clear();
+ for (TEditChain::const_iterator it = editChain.begin(); it != editChain.end(); it++) {
+ if (IsImportantEditMove(*it)) {
+ int sc1 = c1, sc2 = c2;
+ do {
+ MakeMove(*it, c1, c2);
+ ++it;
+ } while (IsImportantEditMove(*it));
+ res.push_back(TReplacement(sc1, c1 - sc1, sc2, c2 - sc2));
+ }
+ MakeMove(*it, c1, c2);
+ }
+ }
+}