diff options
author | akinshchikov <akinshchikov@yandex-team.com> | 2023-07-03 12:26:39 +0300 |
---|---|---|
committer | akinshchikov <akinshchikov@yandex-team.com> | 2023-07-03 12:26:39 +0300 |
commit | 72c25e282d527a441d02f56820179cd6990010d1 (patch) | |
tree | c6dea4e7c15241f83584fce9c27917054af01e04 /library/cpp/string_utils/levenshtein_diff | |
parent | f12bd19e8cddac19aed1767a20500459bdd2feb5 (diff) | |
download | ydb-72c25e282d527a441d02f56820179cd6990010d1.tar.gz |
Replace the functions for the `Levenshtein` and the `Damerau–Levenshtein` distances with the memory effective version.
YQL UDFs `String::LevensteinDistance` and `Unicode::LevensteinDistance` use `NLevenshtein::Distance` inside.
The previous version of `NLevenshtein::Distance` uses O(nm) space instead of O(min(n, m)).
Diffstat (limited to 'library/cpp/string_utils/levenshtein_diff')
-rw-r--r-- | library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h | 44 | ||||
-rw-r--r-- | library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp | 11 |
2 files changed, 47 insertions, 8 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h index 2184eb85db..7d02fbaaa4 100644 --- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h @@ -95,7 +95,7 @@ namespace NLevenshtein { ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT); } const TWeightType maxWeight = std::numeric_limits<TWeightType>::max(); - // Here goes basic Damerau-Levestein's algorithm + // Here goes basic Damerau-Levenshtein's algorithm for (int i = 1; i <= l1; i++) { for (int j = 1; j <= l2; j++) { if (str1[i - 1] == str2[j - 1]) { @@ -195,14 +195,42 @@ namespace NLevenshtein { template <class TStringType, bool Damerau = false> size_t DistanceImpl(const TStringType& str1, const TStringType& str2) { - TEditChain editChain; - GetEditChainGeneric<TStringType, Damerau>(str1, str2, editChain); - size_t result = 0; - for (auto edit : editChain) { - if (IsImportantEditMove(edit)) - result++; + if (str1.size() > str2.size()) { + return DistanceImpl<TStringType, Damerau>(str2, str1); + } + + size_t size1 = str1.size(); + size_t size2 = str2.size(); + + TVector<size_t> currentRow(size1 + 1); + TVector<size_t> previousRow(size1 + 1); + TVector<size_t> transpositionRow(size1 + 1); + + for (size_t i = 0; i <= size1; ++i) { + previousRow[i] = i; + } + + for (size_t i = 1; i <= size2; ++i) { + currentRow[0] = i; + + for (size_t j = 1; j <= size1; ++j) { + size_t cost = str1[j - 1] == str2[i - 1] ? 0 : 1; + + currentRow[j] = std::min(previousRow[j - 1] + cost, std::min(currentRow[j - 1], previousRow[j]) + 1); + + if (Damerau && i > 1 && j > 1 && str1[j - 2] == str2[i - 1] && str1[j - 1] == str2[i - 2]) { + currentRow[j] = std::min(currentRow[j], transpositionRow[j - 2] + cost); + } + } + + if (Damerau) { + std::swap(transpositionRow, previousRow); + } + + std::swap(previousRow, currentRow); } - return result; + + return previousRow[size1]; } template <class TStringType> diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp index 67c0f1da6f..6256a841c2 100644 --- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp @@ -26,9 +26,20 @@ namespace { Y_UNIT_TEST_SUITE(Levenstein) { Y_UNIT_TEST(Distance) { + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("knight"), TStringBuf("king")), 4); + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("life flies"), TStringBuf("fly lives")), 6); + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("spot trail"), TStringBuf("stop trial")), 4); UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3); UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2); } + + Y_UNIT_TEST(DamerauDistance) { + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("knight"), TStringBuf("king")), 3); + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("life flies"), TStringBuf("fly lives")), 6); + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("spot trail"), TStringBuf("stop trial")), 3); + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("status"), TStringBuf("tsatus")), 1); + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("hello"), TStringBuf("heh lol")), 3); + } } Y_UNIT_TEST_SUITE(WeightedLevenstein) { |