aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
diff options
context:
space:
mode:
authorakinshchikov <akinshchikov@yandex-team.com>2023-07-03 12:26:39 +0300
committerakinshchikov <akinshchikov@yandex-team.com>2023-07-03 12:26:39 +0300
commit72c25e282d527a441d02f56820179cd6990010d1 (patch)
treec6dea4e7c15241f83584fce9c27917054af01e04 /library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
parentf12bd19e8cddac19aed1767a20500459bdd2feb5 (diff)
downloadydb-72c25e282d527a441d02f56820179cd6990010d1.tar.gz
Replace the functions for the `Levenshtein` and the `Damerau–Levenshtein` distances with the memory effective version.
YQL UDFs `String::LevensteinDistance` and `Unicode::LevensteinDistance` use `NLevenshtein::Distance` inside. The previous version of `NLevenshtein::Distance` uses O(nm) space instead of O(min(n, m)).
Diffstat (limited to 'library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h')
-rw-r--r--library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h44
1 files changed, 36 insertions, 8 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
index 2184eb85db..7d02fbaaa4 100644
--- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
+++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h
@@ -95,7 +95,7 @@ namespace NLevenshtein {
ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT);
}
const TWeightType maxWeight = std::numeric_limits<TWeightType>::max();
- // Here goes basic Damerau-Levestein's algorithm
+ // Here goes basic Damerau-Levenshtein's algorithm
for (int i = 1; i <= l1; i++) {
for (int j = 1; j <= l2; j++) {
if (str1[i - 1] == str2[j - 1]) {
@@ -195,14 +195,42 @@ namespace NLevenshtein {
template <class TStringType, bool Damerau = false>
size_t DistanceImpl(const TStringType& str1, const TStringType& str2) {
- TEditChain editChain;
- GetEditChainGeneric<TStringType, Damerau>(str1, str2, editChain);
- size_t result = 0;
- for (auto edit : editChain) {
- if (IsImportantEditMove(edit))
- result++;
+ if (str1.size() > str2.size()) {
+ return DistanceImpl<TStringType, Damerau>(str2, str1);
+ }
+
+ size_t size1 = str1.size();
+ size_t size2 = str2.size();
+
+ TVector<size_t> currentRow(size1 + 1);
+ TVector<size_t> previousRow(size1 + 1);
+ TVector<size_t> transpositionRow(size1 + 1);
+
+ for (size_t i = 0; i <= size1; ++i) {
+ previousRow[i] = i;
+ }
+
+ for (size_t i = 1; i <= size2; ++i) {
+ currentRow[0] = i;
+
+ for (size_t j = 1; j <= size1; ++j) {
+ size_t cost = str1[j - 1] == str2[i - 1] ? 0 : 1;
+
+ currentRow[j] = std::min(previousRow[j - 1] + cost, std::min(currentRow[j - 1], previousRow[j]) + 1);
+
+ if (Damerau && i > 1 && j > 1 && str1[j - 2] == str2[i - 1] && str1[j - 1] == str2[i - 2]) {
+ currentRow[j] = std::min(currentRow[j], transpositionRow[j - 2] + cost);
+ }
+ }
+
+ if (Damerau) {
+ std::swap(transpositionRow, previousRow);
+ }
+
+ std::swap(previousRow, currentRow);
}
- return result;
+
+ return previousRow[size1];
}
template <class TStringType>