aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/string_utils/levenshtein_diff
diff options
context:
space:
mode:
authorakinshchikov <akinshchikov@yandex-team.com>2023-07-03 12:26:39 +0300
committerakinshchikov <akinshchikov@yandex-team.com>2023-07-03 12:26:39 +0300
commit72c25e282d527a441d02f56820179cd6990010d1 (patch)
treec6dea4e7c15241f83584fce9c27917054af01e04 /library/cpp/string_utils/levenshtein_diff
parentf12bd19e8cddac19aed1767a20500459bdd2feb5 (diff)
downloadydb-72c25e282d527a441d02f56820179cd6990010d1.tar.gz
Replace the functions for the `Levenshtein` and the `Damerau–Levenshtein` distances with the memory effective version.
YQL UDFs `String::LevensteinDistance` and `Unicode::LevensteinDistance` use `NLevenshtein::Distance` inside. The previous version of `NLevenshtein::Distance` uses O(nm) space instead of O(min(n, m)).
Diffstat (limited to 'library/cpp/string_utils/levenshtein_diff')
-rw-r--r--library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h44
-rw-r--r--library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp11
2 files changed, 47 insertions, 8 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
index 2184eb85db..7d02fbaaa4 100644
--- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
+++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
@@ -95,7 +95,7 @@ namespace NLevenshtein {
ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT);
}
const TWeightType maxWeight = std::numeric_limits<TWeightType>::max();
- // Here goes basic Damerau-Levestein's algorithm
+ // Here goes basic Damerau-Levenshtein's algorithm
for (int i = 1; i <= l1; i++) {
for (int j = 1; j <= l2; j++) {
if (str1[i - 1] == str2[j - 1]) {
@@ -195,14 +195,42 @@ namespace NLevenshtein {
template <class TStringType, bool Damerau = false>
size_t DistanceImpl(const TStringType& str1, const TStringType& str2) {
- TEditChain editChain;
- GetEditChainGeneric<TStringType, Damerau>(str1, str2, editChain);
- size_t result = 0;
- for (auto edit : editChain) {
- if (IsImportantEditMove(edit))
- result++;
+ if (str1.size() > str2.size()) {
+ return DistanceImpl<TStringType, Damerau>(str2, str1);
+ }
+
+ size_t size1 = str1.size();
+ size_t size2 = str2.size();
+
+ TVector<size_t> currentRow(size1 + 1);
+ TVector<size_t> previousRow(size1 + 1);
+ TVector<size_t> transpositionRow(size1 + 1);
+
+ for (size_t i = 0; i <= size1; ++i) {
+ previousRow[i] = i;
+ }
+
+ for (size_t i = 1; i <= size2; ++i) {
+ currentRow[0] = i;
+
+ for (size_t j = 1; j <= size1; ++j) {
+ size_t cost = str1[j - 1] == str2[i - 1] ? 0 : 1;
+
+ currentRow[j] = std::min(previousRow[j - 1] + cost, std::min(currentRow[j - 1], previousRow[j]) + 1);
+
+ if (Damerau && i > 1 && j > 1 && str1[j - 2] == str2[i - 1] && str1[j - 1] == str2[i - 2]) {
+ currentRow[j] = std::min(currentRow[j], transpositionRow[j - 2] + cost);
+ }
+ }
+
+ if (Damerau) {
+ std::swap(transpositionRow, previousRow);
+ }
+
+ std::swap(previousRow, currentRow);
}
- return result;
+
+ return previousRow[size1];
}
template <class TStringType>
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp
index 67c0f1da6f..6256a841c2 100644
--- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp
+++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp
@@ -26,9 +26,20 @@ namespace {
Y_UNIT_TEST_SUITE(Levenstein) {
Y_UNIT_TEST(Distance) {
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("knight"), TStringBuf("king")), 4);
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("spot trail"), TStringBuf("stop trial")), 4);
UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3);
UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2);
}
+
+ Y_UNIT_TEST(DamerauDistance) {
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("knight"), TStringBuf("king")), 3);
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("spot trail"), TStringBuf("stop trial")), 3);
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("status"), TStringBuf("tsatus")), 1);
+ UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("hello"), TStringBuf("heh lol")), 3);
+ }
}
Y_UNIT_TEST_SUITE(WeightedLevenstein) {