diff options
author | Ruslan Kovalev <ruslan.a.kovalev@gmail.com> | 2022-02-10 16:46:45 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:45 +0300 |
commit | 9123176b341b6f2658cff5132482b8237c1416c8 (patch) | |
tree | 49e222ea1c5804306084bb3ae065bb702625360f /library/cpp/lcs | |
parent | 59e19371de37995fcb36beb16cd6ec030af960bc (diff) | |
download | ydb-9123176b341b6f2658cff5132482b8237c1416c8.tar.gz |
Restoring authorship annotation for Ruslan Kovalev <ruslan.a.kovalev@gmail.com>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/lcs')
-rw-r--r-- | library/cpp/lcs/README.md | 6 | ||||
-rw-r--r-- | library/cpp/lcs/lcs_via_lis.h | 94 | ||||
-rw-r--r-- | library/cpp/lcs/lcs_via_lis_ut.cpp | 114 | ||||
-rw-r--r-- | library/cpp/lcs/ya.make | 4 |
4 files changed, 109 insertions, 109 deletions
diff --git a/library/cpp/lcs/README.md b/library/cpp/lcs/README.md index d58334e776..7ed3d946d6 100644 --- a/library/cpp/lcs/README.md +++ b/library/cpp/lcs/README.md @@ -1,11 +1,11 @@ A set of algorithms for approximate string matching. ==================================================== - + **Lcs_via_lis.h** Combinatorial algorithm LCS (Longest common subsequence) through LIS (Longest increasing subsequence) for rows S1 and S2 of lengths n and m respectively (we assume that n < m). Complexity is O(r log n) by time and O(r) of additional memory, where r is the number of pairs (i, j) for which S1[i] = S2[j]. Thus for the uniform distribution of letters of an alphabet of s elements the estimate is O(nm / s * log n). - + Effective for large alphabets s = O(n) (like hashes of words in a text). If only the LCS length is needed, the LCS itself will not be built. See Gusfield, Algorithms on Strings, Trees and Sequences, 1997, Chapt.12.5 Or here: http://www.cs.ucf.edu/courses/cap5937/fall2004/Longest%20common%20subsequence.pdf @@ -23,7 +23,7 @@ Denote by: It is easy to prove the theorem that the dimensions of SC and LIS are the same, and it is possible to construct LIS from SC. Next, let's suppose we have 2 strings of S1 and S2. It can be shown that if for each symbol in S1 we take the list of its appearances in S2 in the reverse order, and concatenate all such lists keeping order, then any IS in the resulting list will be equivalent to some common subsequence S1 and S2 of the same length. And, consequently, LIS will be equivalent to LCS. - + The idea of the algorithm for constructing DS: - Going along the original sequence, for the current member x in the list of its DS we find the leftmost, such that its last term is not less than x. - If there is one, then add x to the end. diff --git a/library/cpp/lcs/lcs_via_lis.h b/library/cpp/lcs/lcs_via_lis.h index 43e1c7ccd5..d26733d94e 100644 --- a/library/cpp/lcs/lcs_via_lis.h +++ b/library/cpp/lcs/lcs_via_lis.h @@ -1,34 +1,34 @@ -#pragma once - +#pragma once + #include <library/cpp/containers/paged_vector/paged_vector.h> -#include <util/generic/ptr.h> -#include <util/generic/hash.h> -#include <util/generic/vector.h> +#include <util/generic/ptr.h> +#include <util/generic/hash.h> +#include <util/generic/vector.h> #include <util/generic/algorithm.h> -#include <util/memory/pool.h> - -namespace NLCS { +#include <util/memory/pool.h> + +namespace NLCS { template <typename TVal> struct TLCSCtx { typedef TVector<ui32> TSubsequence; typedef THashMap<TVal, TSubsequence, THash<TVal>, TEqualTo<TVal>, ::TPoolAllocator> TEncounterIndex; typedef TVector<std::pair<ui32, ui32>> TLastIndex; typedef NPagedVector::TPagedVector<TSubsequence, 4096> TCover; - + TMemoryPool Pool; THolder<TEncounterIndex> Encounters; TLastIndex LastIndex; TCover Cover; - + TSubsequence ResultBuffer; - + TLCSCtx() : Pool(16 * 1024 - 64, TMemoryPool::TExpGrow::Instance()) { Reset(); } - + void Reset() { Encounters.Reset(nullptr); Pool.Clear(); @@ -38,17 +38,17 @@ namespace NLCS { ResultBuffer.clear(); } }; - + namespace NPrivate { template <typename TIt, typename TVl> struct TSequence { typedef TIt TIter; typedef TVl TVal; - + const TIter Begin; const TIter End; const size_t Size; - + TSequence(TIter beg, TIter end) : Begin(beg) , End(end) @@ -56,45 +56,45 @@ namespace NLCS { { } }; - + template <typename TVal, typename TSequence, typename TResult> size_t MakeLCS(TSequence s1, TSequence s2, TResult* res = nullptr, TLCSCtx<TVal>* ctx = nullptr) { typedef TLCSCtx<TVal> TCtx; - + THolder<TCtx> ctxhld; - + if (!ctx) { ctxhld.Reset(new TCtx()); ctx = ctxhld.Get(); } else { ctx->Reset(); } - + size_t maxsize = Max(s1.Size, s2.Size); auto& index = *(ctx->Encounters); - + for (auto it = s1.Begin; it != s1.End; ++it) { index[*it]; } - + for (auto it = s2.Begin; it != s2.End; ++it) { auto hit = index.find(*it); - + if (hit != index.end()) { hit->second.push_back(it - s2.Begin); } } - + if (!res) { auto& lastindex = ctx->ResultBuffer; lastindex.reserve(maxsize); for (auto it1 = s1.Begin; it1 != s1.End; ++it1) { const auto& sub2 = index[*it1]; - + for (auto it2 = sub2.rbegin(); it2 != sub2.rend(); ++it2) { ui32 x = *it2; - + auto lit = LowerBound(lastindex.begin(), lastindex.end(), x); if (lit == lastindex.end()) { @@ -104,22 +104,22 @@ namespace NLCS { } } } - + return lastindex.size(); } else { auto& lastindex = ctx->LastIndex; auto& cover = ctx->Cover; lastindex.reserve(maxsize); - + for (auto it1 = s1.Begin; it1 != s1.End; ++it1) { const auto& sub2 = index[*it1]; - + for (auto it2 = sub2.rbegin(); it2 != sub2.rend(); ++it2) { ui32 x = *it2; - + auto lit = LowerBound(lastindex.begin(), lastindex.end(), std::make_pair(x, (ui32)0u)); - + if (lit == lastindex.end()) { lastindex.push_back(std::make_pair(x, cover.size())); cover.emplace_back(); @@ -129,16 +129,16 @@ namespace NLCS { cover[lit->second].push_back(x); } } - } - + } + if (cover.empty()) { return 0; } - + std::reverse(cover.begin(), cover.end()); - + auto& resbuf = ctx->ResultBuffer; - + resbuf.push_back(cover.front().front()); for (auto it = cover.begin() + 1; it != cover.end(); ++it) { @@ -148,9 +148,9 @@ namespace NLCS { resbuf.push_back(*pit); } - + std::reverse(resbuf.begin(), resbuf.end()); - + for (auto it = resbuf.begin(); it != resbuf.end(); ++it) { res->push_back(*(s2.Begin + *it)); } @@ -158,36 +158,36 @@ namespace NLCS { return lastindex.size(); } } - } - + } + template <typename TVal, typename TIter, typename TResult> size_t MakeLCS(TIter beg1, TIter end1, TIter beg2, TIter end2, TResult* res = nullptr, TLCSCtx<TVal>* ctx = nullptr) { typedef NPrivate::TSequence<TIter, TVal> TSeq; - + size_t sz1 = end1 - beg1; size_t sz2 = end2 - beg2; - + if (sz2 > sz1) { DoSwap(beg1, beg2); DoSwap(end1, end2); DoSwap(sz1, sz2); } - + return NPrivate::MakeLCS<TVal>(TSeq(beg1, end1), TSeq(beg2, end2), res, ctx); - } - + } + template <typename TVal, typename TColl, typename TRes> size_t MakeLCS(const TColl& coll1, const TColl& coll2, TRes* res = nullptr, TLCSCtx<TVal>* ctx = nullptr) { return MakeLCS<TVal>(coll1.begin(), coll1.end(), coll2.begin(), coll2.end(), res, ctx); } - + template <typename TVal, typename TIter> size_t MeasureLCS(TIter beg1, TIter end1, TIter beg2, TIter end2, TLCSCtx<TVal>* ctx = nullptr) { return MakeLCS<TVal>(beg1, end1, beg2, end2, (TVector<TVal>*)nullptr, ctx); } - + template <typename TVal, typename TColl> size_t MeasureLCS(const TColl& coll1, const TColl& coll2, TLCSCtx<TVal>* ctx = nullptr) { return MeasureLCS<TVal>(coll1.begin(), coll1.end(), coll2.begin(), coll2.end(), ctx); } -} +} diff --git a/library/cpp/lcs/lcs_via_lis_ut.cpp b/library/cpp/lcs/lcs_via_lis_ut.cpp index 4d90dd400e..f6ad5152b6 100644 --- a/library/cpp/lcs/lcs_via_lis_ut.cpp +++ b/library/cpp/lcs/lcs_via_lis_ut.cpp @@ -1,64 +1,64 @@ -#include <util/generic/utility.h> -#include <util/string/util.h> +#include <util/generic/utility.h> +#include <util/string/util.h> #include <library/cpp/testing/unittest/registar.h> -#include "lcs_via_lis.h" - -class TLCSTest: public TTestBase { +#include "lcs_via_lis.h" + +class TLCSTest: public TTestBase { UNIT_TEST_SUITE(TLCSTest); UNIT_TEST(LCSTest); UNIT_TEST_SUITE_END(); -private: - size_t Length(TStringBuf s1, TStringBuf s2) { +private: + size_t Length(TStringBuf s1, TStringBuf s2) { TVector<TVector<size_t>> c; - c.resize(s1.size() + 1); - - for (size_t i = 0; i < c.size(); ++i) { - c[i].resize(s2.size() + 1); - } - - for (size_t i = 0; i < s1.size(); ++i) { - for (size_t j = 0; j < s2.size(); ++j) { - if (s1[i] == s2[j]) - c[i + 1][j + 1] = c[i][j] + 1; - else - c[i + 1][j + 1] = Max(c[i + 1][j], c[i][j + 1]); - } - } - - return c[s1.size()][s2.size()]; - } - - void CheckLCSLength(TStringBuf s1, TStringBuf s2, size_t size) { - size_t len = NLCS::MeasureLCS<char>(s1, s2); - - UNIT_ASSERT_VALUES_EQUAL(len, Length(s1, s2)); - UNIT_ASSERT_VALUES_EQUAL(len, size); - } - - void CheckLCSString(TStringBuf s1, TStringBuf s2, TStringBuf reflcs) { + c.resize(s1.size() + 1); + + for (size_t i = 0; i < c.size(); ++i) { + c[i].resize(s2.size() + 1); + } + + for (size_t i = 0; i < s1.size(); ++i) { + for (size_t j = 0; j < s2.size(); ++j) { + if (s1[i] == s2[j]) + c[i + 1][j + 1] = c[i][j] + 1; + else + c[i + 1][j + 1] = Max(c[i + 1][j], c[i][j + 1]); + } + } + + return c[s1.size()][s2.size()]; + } + + void CheckLCSLength(TStringBuf s1, TStringBuf s2, size_t size) { + size_t len = NLCS::MeasureLCS<char>(s1, s2); + + UNIT_ASSERT_VALUES_EQUAL(len, Length(s1, s2)); + UNIT_ASSERT_VALUES_EQUAL(len, size); + } + + void CheckLCSString(TStringBuf s1, TStringBuf s2, TStringBuf reflcs) { TString lcs; - size_t len = NLCS::MakeLCS<char>(s1, s2, &lcs); + size_t len = NLCS::MakeLCS<char>(s1, s2, &lcs); const char* comment = Sprintf("%s & %s = %s", s1.data(), s2.data(), reflcs.data()).c_str(); - - UNIT_ASSERT_VALUES_EQUAL_C(Length(s1, s2), len, comment); + + UNIT_ASSERT_VALUES_EQUAL_C(Length(s1, s2), len, comment); UNIT_ASSERT_VALUES_EQUAL_C(lcs.size(), len, comment); - UNIT_ASSERT_VALUES_EQUAL_C(NLCS::MeasureLCS<char>(s1, s2), len, comment); - UNIT_ASSERT_VALUES_EQUAL_C(reflcs, TStringBuf(lcs), comment); - } - - void LCSTest() { - CheckLCSString("abacx", "baabca", "bac"); + UNIT_ASSERT_VALUES_EQUAL_C(NLCS::MeasureLCS<char>(s1, s2), len, comment); + UNIT_ASSERT_VALUES_EQUAL_C(reflcs, TStringBuf(lcs), comment); + } + + void LCSTest() { + CheckLCSString("abacx", "baabca", "bac"); const char* m = "mama_myla_ramu"; const char* n = "papa_lubil_mamu"; const char* s = "aa_l_amu"; - CheckLCSString(m, n, s); - CheckLCSString(n, m, s); - CheckLCSString(m, m, m); - CheckLCSString(m, "", ""); - CheckLCSString("", m, ""); - CheckLCSString("", "", ""); - { + CheckLCSString(m, n, s); + CheckLCSString(n, m, s); + CheckLCSString(m, m, m); + CheckLCSString(m, "", ""); + CheckLCSString("", m, ""); + CheckLCSString("", "", ""); + { const char* s1 = "atmwuaoccmgirveexxtbkkmioclarskvicyesddirlrgflnietcagkswbvsdnxksipfndmusnuee" "tojygyjyobdfiutsbruuspvibmywhokxsarwbyirsqqnxxnbtkmucmdafaogwmobuuhejspgurpe" @@ -83,9 +83,9 @@ private: "ctsnyjleqgttcgpnhlnagxenuknpxiramgeshhjyoesupkcfcvvpwyweuvcwrawsgvfshppijuug" "hdnujdqjtcdissmlnjgibdljjxntxrgytxlbgvsrrusatqelspeoyvndjifjqxqrpduwbyojjbhi" "tmondbbnuuhpkglmfykeheddwkxjyapfniqoic"; - CheckLCSLength(s1, s2, 247); - } - { + CheckLCSLength(s1, s2, 247); + } + { const char* s1 = "ssyrtvllktbhijkyckviokukaodexiykxcvlvninoasblpuujnqnilnhmacsulaulskphkccnfop" "jlhejqhemdpnihasouinuceugkfkaifdepvffntbkfivsddtxyslnlwiyfbbpnrwlkryetncahih" @@ -113,8 +113,8 @@ private: "xmfwokjskummmkaksbeoowfgjwiumpbkdujexectnghqjjuotofwuwvcgtluephymdkrscbfiaeg" "aaypnclkstfqimisanikjxocmhcrotkntprwjbuudswuyuujfawjucwgifhqgepxeidmvcwqsqrv" "karuvpnxhmrvdcocidgtuxasdqkwsdvijmnpmayhfiva"; - CheckLCSLength(s1, s2, 288); - } - } -}; -UNIT_TEST_SUITE_REGISTRATION(TLCSTest) + CheckLCSLength(s1, s2, 288); + } + } +}; +UNIT_TEST_SUITE_REGISTRATION(TLCSTest) diff --git a/library/cpp/lcs/ya.make b/library/cpp/lcs/ya.make index 50455f1243..3a7caafc88 100644 --- a/library/cpp/lcs/ya.make +++ b/library/cpp/lcs/ya.make @@ -1,5 +1,5 @@ -LIBRARY() - +LIBRARY() + OWNER(velavokr) PEERDIR( |