diff options
author | agroshev <agroshev@yandex-team.ru> | 2022-02-10 16:49:54 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:49:54 +0300 |
commit | 4421e61d3257602a578678b61193f648cd9da816 (patch) | |
tree | 0bd5c8ab13fef7a3bea73c895cf0565b744104cd | |
parent | 085a76cb8fd1fd01e9483d056938f022970cb4f0 (diff) | |
download | ydb-4421e61d3257602a578678b61193f648cd9da816.tar.gz |
Restoring authorship annotation for <agroshev@yandex-team.ru>. Commit 1 of 2.
-rw-r--r-- | library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h | 126 | ||||
-rw-r--r-- | library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp | 278 |
2 files changed, 202 insertions, 202 deletions
diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h index 8a240bfed8..3f1f8e6c51 100644 --- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h @@ -1,11 +1,11 @@ #pragma once -#include <util/draft/matrix.h> +#include <util/draft/matrix.h> #include <util/generic/algorithm.h> #include <util/generic/vector.h> -#include <util/system/yassert.h> - -#include <type_traits> +#include <util/system/yassert.h> + +#include <type_traits> #include <utility> namespace NLevenshtein { @@ -41,73 +41,73 @@ namespace NLevenshtein { using TEditChain = TVector<EEditMoveType>; - template <typename TArgType> - struct TWeightOneUnaryGetter { - int operator()(const TArgType&) const { - return 1; - } - }; - - template <typename TArgType> - struct TWeightOneBinaryGetter { - int operator()(const TArgType&, const TArgType&) const { - return 1; - } - }; - - template <typename TStringType> - using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>; - + template <typename TArgType> + struct TWeightOneUnaryGetter { + int operator()(const TArgType&) const { + return 1; + } + }; + + template <typename TArgType> + struct TWeightOneBinaryGetter { + int operator()(const TArgType&, const TArgType&) const { + return 1; + } + }; + + template <typename TStringType> + using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>; + /// Finds sequence of "edit moves" for two strings - template <class TStringType, class TWeightType = int, - class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>, - class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>, - class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>> - > - void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr, - const TReplaceWeigher& replaceWeigher = TReplaceWeigher(), - const TDeleteWeigher& deleteWeigher = TDeleteWeigher(), - const TInsertWeigher& insertWeigher = TInsertWeigher()) - { + template <class TStringType, class TWeightType = int, + class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>, + class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>, + class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>> + > + void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr, + const TReplaceWeigher& replaceWeigher = TReplaceWeigher(), + const TDeleteWeigher& deleteWeigher = TDeleteWeigher(), + const TInsertWeigher& insertWeigher = TInsertWeigher()) + { int l1 = (int)str1.size(); int l2 = (int)str2.size(); - - TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1]) + + TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1]) ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point for (int i = 1; i <= l1; i++) { - ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE); + ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE); } for (int i = 1; i <= l2; i++) { - ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT); + ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT); } // Here goes basic Levestein's algorithm for (int i = 1; i <= l1; i++) { for (int j = 1; j <= l2; j++) { - if (str1[i - 1] == str2[j - 1]) { + if (str1[i - 1] == str2[j - 1]) { ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE); - } else { - const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]); - Y_ASSERT(replaceWeight >= 0); - ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE); - } - - if (ma[i][j].first > ma[i - 1][j].first) { - const TWeightType deleteWeight = deleteWeigher(str1[i - 1]); - Y_ASSERT(deleteWeight >= 0); - const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight; - if (deletePathWeight <= ma[i][j].first) { - ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE); - } - } - - if (ma[i][j].first > ma[i][j - 1].first) { - const TWeightType insertWeight = insertWeigher(str2[j - 1]); - Y_ASSERT(insertWeight >= 0); - const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight; - if (insertPathWeight <= ma[i][j].first) { - ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT); - } - } + } else { + const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]); + Y_ASSERT(replaceWeight >= 0); + ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE); + } + + if (ma[i][j].first > ma[i - 1][j].first) { + const TWeightType deleteWeight = deleteWeigher(str1[i - 1]); + Y_ASSERT(deleteWeight >= 0); + const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight; + if (deletePathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE); + } + } + + if (ma[i][j].first > ma[i][j - 1].first) { + const TWeightType insertWeight = insertWeigher(str2[j - 1]); + Y_ASSERT(insertWeight >= 0); + const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight; + if (insertPathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT); + } + } } } // Tracing the path from final point @@ -133,10 +133,10 @@ namespace NLevenshtein { } } std::reverse(res.begin(), res.end()); - - if (weight != nullptr) { - *weight = ma[l1][l2].first; - } + + if (weight != nullptr) { + *weight = ma[l1][l2].first; + } } template <class TStringType> diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp index cf0f78637f..74db352a3f 100644 --- a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp @@ -2,189 +2,189 @@ #include <library/cpp/testing/unittest/registar.h> -#include <util/generic/string.h> - -namespace { - - float unaryZeroWeigher(const char&) { - return 0.0f; - }; - - float unaryMaxWeigher(const char&) { - return 1.0f; - }; - - float binaryZeroWeigher(const char&, const char&) { - return 0.0f; - }; - - float binaryMaxWeigher(const char&, const char&) { - return 1.0f; - }; - -} - +#include <util/generic/string.h> + +namespace { + + float unaryZeroWeigher(const char&) { + return 0.0f; + }; + + float unaryMaxWeigher(const char&) { + return 1.0f; + }; + + float binaryZeroWeigher(const char&, const char&) { + return 0.0f; + }; + + float binaryMaxWeigher(const char&, const char&) { + return 1.0f; + }; + +} + Y_UNIT_TEST_SUITE(Levenstein) { Y_UNIT_TEST(Distance) { UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3); UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2); } } - -Y_UNIT_TEST_SUITE(WeightedLevenstein) { - Y_UNIT_TEST(EqualStrings) { + +Y_UNIT_TEST_SUITE(WeightedLevenstein) { + Y_UNIT_TEST(EqualStrings) { NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("12345"), TString("12345"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); - UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); - } - - Y_UNIT_TEST(EmptyStrings) { + UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(EmptyStrings) { NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString(""), TString(""), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); - UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 0); - } - - Y_UNIT_TEST(InsertsOnly) { - auto unaryWeigher = [](const char&) { - return 2.0f; - }; + UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 0); + } + + Y_UNIT_TEST(InsertsOnly) { + auto unaryWeigher = [](const char&) { + return 2.0f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString(""), TString("12345"), chain, &distance, binaryZeroWeigher, unaryZeroWeigher, unaryWeigher); - UNIT_ASSERT_VALUES_EQUAL(distance, 10.0f); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); - } - - Y_UNIT_TEST(DeletionsOnly) { - auto unaryWeigher = [](const char&) { - return 3.0f; - }; + UNIT_ASSERT_VALUES_EQUAL(distance, 10.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(DeletionsOnly) { + auto unaryWeigher = [](const char&) { + return 3.0f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("54321"), TString(""), chain, &distance, binaryZeroWeigher, unaryWeigher, unaryZeroWeigher); - UNIT_ASSERT_VALUES_EQUAL(distance, 15.0f); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); - } - - Y_UNIT_TEST(SymmetryCheck) { - const TString str1 = "123x5"; - const TString str2 = "x2345"; - const float trgDistance = 2.0f; - const size_t trgChainLen = 5; - + UNIT_ASSERT_VALUES_EQUAL(distance, 15.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(SymmetryCheck) { + const TString str1 = "123x5"; + const TString str2 = "x2345"; + const float trgDistance = 2.0f; + const size_t trgChainLen = 5; + NLevenshtein::TEditChain chainLeftRight; - float distanceLeftRight = 0.0f; + float distanceLeftRight = 0.0f; NLevenshtein::GetEditChain(str1, str2, chainLeftRight, &distanceLeftRight, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); - UNIT_ASSERT_VALUES_EQUAL(distanceLeftRight, trgDistance); - UNIT_ASSERT_VALUES_EQUAL(chainLeftRight.size(), trgChainLen); + UNIT_ASSERT_VALUES_EQUAL(distanceLeftRight, trgDistance); + UNIT_ASSERT_VALUES_EQUAL(chainLeftRight.size(), trgChainLen); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[0]), static_cast<int>(NLevenshtein::EMT_REPLACE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[2]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); - + NLevenshtein::TEditChain chainRightLeft; - float distanceRightLeft = 0.0f; + float distanceRightLeft = 0.0f; NLevenshtein::GetEditChain(str2, str1, chainRightLeft, &distanceRightLeft, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); - UNIT_ASSERT_VALUES_EQUAL(distanceRightLeft, trgDistance); - UNIT_ASSERT_VALUES_EQUAL(chainRightLeft.size(), trgChainLen); - UNIT_ASSERT(chainRightLeft == chainLeftRight); - } - - Y_UNIT_TEST(PreferReplacements) { - auto binaryWeigher = [](const char&, const char&) { - return 0.0625f; - }; + UNIT_ASSERT_VALUES_EQUAL(distanceRightLeft, trgDistance); + UNIT_ASSERT_VALUES_EQUAL(chainRightLeft.size(), trgChainLen); + UNIT_ASSERT(chainRightLeft == chainLeftRight); + } + + Y_UNIT_TEST(PreferReplacements) { + auto binaryWeigher = [](const char&, const char&) { + return 0.0625f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("54321"), TString("43210"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); - UNIT_ASSERT_VALUES_EQUAL(distance, 0.3125f); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); - } - - Y_UNIT_TEST(PreferInsertDeletions) { - auto unaryWeigher = [](const char&) { - return 0.0625f; - }; + UNIT_ASSERT_VALUES_EQUAL(distance, 0.3125f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(PreferInsertDeletions) { + auto unaryWeigher = [](const char&) { + return 0.0625f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("54321"), TString("98765"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryWeigher); - UNIT_ASSERT_VALUES_EQUAL(distance, 0.5f); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 9); - } - - Y_UNIT_TEST(NoXDeletions) { - auto unaryWeigher = [](const char& c) { - return c == 'x' ? 100.0f : 1.0f; - }; + UNIT_ASSERT_VALUES_EQUAL(distance, 0.5f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 9); + } + + Y_UNIT_TEST(NoXDeletions) { + auto unaryWeigher = [](const char& c) { + return c == 'x' ? 100.0f : 1.0f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("543x1"), TString("5431"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryMaxWeigher); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE)); - UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); - } - - Y_UNIT_TEST(NoXInsertions) { - auto unaryWeigher = [](const char& c) { - return c == 'x' ? 100.0f : 1.0f; - }; + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoXInsertions) { + auto unaryWeigher = [](const char& c) { + return c == 'x' ? 100.0f : 1.0f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("5431"), TString("543x1"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryWeigher); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_INSERT)); - UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); - } - - Y_UNIT_TEST(NoReplacementsOfX) { - auto binaryWeigher = [](const char& l, const char&) { - return l == 'x' ? 100.0f : 1.0f; - }; + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoReplacementsOfX) { + auto binaryWeigher = [](const char& l, const char&) { + return l == 'x' ? 100.0f : 1.0f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("5432x"), TString("5432y"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_INSERT)); - UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); - } - - Y_UNIT_TEST(NoReplacementsForX) { - auto binaryWeigher = [](const char&, const char& r) { - return r == 'x' ? 100.0f : 1.0f; - }; + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoReplacementsForX) { + auto binaryWeigher = [](const char&, const char& r) { + return r == 'x' ? 100.0f : 1.0f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("y4321"), TString("x4321"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_DELETE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_INSERT)); - UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); - } - - Y_UNIT_TEST(SimilarOperationPriorities) { - auto replaceWeigher = [](const char&, const char&) { - return 0.5f; - }; - auto deleteWeigher = [](const char&) { - return 0.2f; - }; - auto insertWeigher = [](const char&) { - return 0.9f; - }; + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(SimilarOperationPriorities) { + auto replaceWeigher = [](const char&, const char&) { + return 0.5f; + }; + auto deleteWeigher = [](const char&) { + return 0.2f; + }; + auto insertWeigher = [](const char&) { + return 0.9f; + }; NLevenshtein::TEditChain chain; - float distance = 0.0f; + float distance = 0.0f; NLevenshtein::GetEditChain(TString("y0"), TString("0x"), chain, &distance, replaceWeigher, deleteWeigher, insertWeigher); - UNIT_ASSERT_VALUES_EQUAL(chain.size(), 2); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 2); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_REPLACE)); UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_REPLACE)); - UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f); - } -} + UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f); + } +} |