diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/diff | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/diff')
-rw-r--r-- | library/cpp/diff/README.md | 1 | ||||
-rw-r--r-- | library/cpp/diff/diff.cpp | 87 | ||||
-rw-r--r-- | library/cpp/diff/diff.h | 112 | ||||
-rw-r--r-- | library/cpp/diff/diff_ut.cpp | 197 | ||||
-rw-r--r-- | library/cpp/diff/ut/ya.make | 15 | ||||
-rw-r--r-- | library/cpp/diff/ya.make | 14 |
6 files changed, 426 insertions, 0 deletions
diff --git a/library/cpp/diff/README.md b/library/cpp/diff/README.md new file mode 100644 index 0000000000..ff68b10eae --- /dev/null +++ b/library/cpp/diff/README.md @@ -0,0 +1 @@ +Note: underlying algorithm `library/cpp/lcs` has complexity of O(r log n) by time and O(r) of additional memory, where r is the number of pairs (i, j) for which S1[i] = S2[j]. When comparing file with itself (or with little modifications) it becomes quadratic on the number of occurences of the most frequent line. diff --git a/library/cpp/diff/diff.cpp b/library/cpp/diff/diff.cpp new file mode 100644 index 0000000000..be57da7f39 --- /dev/null +++ b/library/cpp/diff/diff.cpp @@ -0,0 +1,87 @@ +#include "diff.h" + +#include <util/generic/hash.h> +#include <util/digest/fnv.h> + +#include <iterator> + +template <typename T> +struct TCollectionImpl { + TVector<TConstArrayRef<T>> Words; + TVector<ui64> Keys; + + inline bool Consume(const T* b, const T* e, const T*) { + if (b < e) { + Words.push_back(TConstArrayRef<T>(b, e)); + Keys.push_back(FnvHash<ui64>((const char*)b, (e - b) * sizeof(T))); + } + return true; + } + + TConstArrayRef<T> Remap(const TConstArrayRef<ui64>& keys) const { + if (keys.empty()) { + return TConstArrayRef<T>(); + } + auto firstWordPos = std::distance(Keys.data(), keys.begin()); + auto lastWordPos = std::distance(Keys.data(), keys.end()) - 1; + Y_ASSERT(firstWordPos >= 0); + Y_ASSERT(lastWordPos >= firstWordPos); + Y_ASSERT(static_cast<size_t>(lastWordPos) < Words.size()); + + return TConstArrayRef<T>(Words[firstWordPos].begin(), Words[lastWordPos].end()); + } + + TConstArrayRef<ui64> GetKeys() const { + return TConstArrayRef<ui64>(Keys); + } +}; + +template <typename T> +struct TCollection { +}; + +template <> +struct TCollection<char>: public TCollectionImpl<char> { + TCollection(const TStringBuf& str, const TString& delims) { + TSetDelimiter<const char> set(delims.data()); + TKeepDelimiters<TCollection<char>> c(this); + SplitString(str.begin(), str.end(), set, c); + } +}; + +template <> +struct TCollection<wchar16>: public TCollectionImpl<wchar16> { + TCollection(const TWtringBuf& str, const TUtf16String& delims) { + TSetDelimiter<const wchar16> set(delims.data()); + TKeepDelimiters<TCollection<wchar16>> c(this); + SplitString(str.begin(), str.end(), set, c); + } +}; + +size_t NDiff::InlineDiff(TVector<TChunk<char>>& chunks, const TStringBuf& left, const TStringBuf& right, const TString& delims) { + if (delims.empty()) { + return InlineDiff<char>(chunks, TConstArrayRef<char>(left.data(), left.size()), TConstArrayRef<char>(right.data(), right.size())); + } + TCollection<char> c1(left, delims); + TCollection<char> c2(right, delims); + TVector<TChunk<ui64>> diff; + const size_t dist = InlineDiff<ui64>(diff, c1.GetKeys(), c2.GetKeys()); + for (const auto& it : diff) { + chunks.push_back(TChunk<char>(c1.Remap(it.Left), c2.Remap(it.Right), c1.Remap(it.Common))); + } + return dist; +} + +size_t NDiff::InlineDiff(TVector<TChunk<wchar16>>& chunks, const TWtringBuf& left, const TWtringBuf& right, const TUtf16String& delims) { + if (delims.empty()) { + return InlineDiff<wchar16>(chunks, TConstArrayRef<wchar16>(left.data(), left.size()), TConstArrayRef<wchar16>(right.data(), right.size())); + } + TCollection<wchar16> c1(left, delims); + TCollection<wchar16> c2(right, delims); + TVector<TChunk<ui64>> diff; + const size_t dist = InlineDiff<ui64>(diff, c1.GetKeys(), c2.GetKeys()); + for (const auto& it : diff) { + chunks.push_back(TChunk<wchar16>(c1.Remap(it.Left), c2.Remap(it.Right), c1.Remap(it.Common))); + } + return dist; +} diff --git a/library/cpp/diff/diff.h b/library/cpp/diff/diff.h new file mode 100644 index 0000000000..94fb00cd0b --- /dev/null +++ b/library/cpp/diff/diff.h @@ -0,0 +1,112 @@ +#pragma once + +#include <library/cpp/lcs/lcs_via_lis.h> + +#include <util/generic/algorithm.h> +#include <util/generic/array_ref.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> +#include <util/stream/output.h> +#include <util/string/split.h> + +namespace NDiff { + template <typename T> + struct TChunk { + TConstArrayRef<T> Left; + TConstArrayRef<T> Right; + TConstArrayRef<T> Common; + + TChunk() = default; + + TChunk(const TConstArrayRef<T>& left, const TConstArrayRef<T>& right, const TConstArrayRef<T>& common) + : Left(left) + , Right(right) + , Common(common) + { + } + }; + + template <typename T> + size_t InlineDiff(TVector<TChunk<T>>& chunks, const TConstArrayRef<T>& left, const TConstArrayRef<T>& right) { + TConstArrayRef<T> s1(left); + TConstArrayRef<T> s2(right); + + bool swapped = false; + if (s1.size() < s2.size()) { + // NLCS will silently swap strings if second string is longer + // So we swap strings here and remember the fact since it is crucial to diff + DoSwap(s1, s2); + swapped = true; + } + + TVector<T> lcs; + NLCS::TLCSCtx<T> ctx; + NLCS::MakeLCS<T>(s1, s2, &lcs, &ctx); + + // Start points of current common and diff parts + const T* c1 = nullptr; + const T* c2 = nullptr; + const T* d1 = s1.begin(); + const T* d2 = s2.begin(); + + // End points of current common parts + const T* e1 = s1.begin(); + const T* e2 = s2.begin(); + + size_t dist = s1.size() - lcs.size(); + + const size_t n = ctx.ResultBuffer.size(); + for (size_t i = 0; i <= n && (e1 != s1.end() || e2 != s2.end());) { + if (i < n) { + // Common character exists + // LCS is marked against positions in s2 + // Save the beginning of common part in s2 + c2 = s2.begin() + ctx.ResultBuffer[i]; + // Find the beginning of common part in s1 + c1 = Find(e1, s1.end(), *c2); + // Follow common substring + for (e1 = c1, e2 = c2; i < n && *e1 == *e2; ++e1, ++e2) { + ++i; + } + } else { + // No common character, common part is empty + c1 = s1.end(); + c2 = s2.end(); + e1 = s1.end(); + e2 = s2.end(); + } + + TChunk<T> chunk(TConstArrayRef<T>(d1, c1), TConstArrayRef<T>(d2, c2), TConstArrayRef<T>(c1, e1)); + if (swapped) { + DoSwap(chunk.Left, chunk.Right); + chunk.Common = TConstArrayRef<T>(c2, e2); + } + chunks.push_back(chunk); + + d1 = e1; + d2 = e2; + } + + return dist; + } + + template <typename TFormatter, typename T> + void PrintChunks(IOutputStream& out, const TFormatter& fmt, const TVector<TChunk<T>>& chunks) { + for (typename TVector<TChunk<T>>::const_iterator chunk = chunks.begin(); chunk != chunks.end(); ++chunk) { + if (!chunk->Left.empty() || !chunk->Right.empty()) { + out << fmt.Special("("); + out << fmt.Left(chunk->Left); + out << fmt.Special("|"); + out << fmt.Right(chunk->Right); + out << fmt.Special(")"); + } + out << fmt.Common(chunk->Common); + } + } + + // Without delimiters calculates character-wise diff + // With delimiters calculates token-wise diff + size_t InlineDiff(TVector<TChunk<char>>& chunks, const TStringBuf& left, const TStringBuf& right, const TString& delims = TString()); + size_t InlineDiff(TVector<TChunk<wchar16>>& chunks, const TWtringBuf& left, const TWtringBuf& right, const TUtf16String& delims = TUtf16String()); + +} diff --git a/library/cpp/diff/diff_ut.cpp b/library/cpp/diff/diff_ut.cpp new file mode 100644 index 0000000000..b82a7b000e --- /dev/null +++ b/library/cpp/diff/diff_ut.cpp @@ -0,0 +1,197 @@ +#include "diff.h" + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NDiff; + +struct TDiffTester { + TStringStream Res; + TVector<TChunk<char>> Chunks; + + TStringBuf Special(const TStringBuf& str) const { + return str; + } + + TStringBuf Common(const TConstArrayRef<const char>& str) const { + return TStringBuf(str.begin(), str.end()); + } + + TStringBuf Left(const TConstArrayRef<const char>& str) const { + return TStringBuf(str.begin(), str.end()); + } + + TStringBuf Right(const TConstArrayRef<const char>& str) const { + return TStringBuf(str.begin(), str.end()); + } + + void Test(const TStringBuf& a, const TStringBuf& b, const TString& delims = " \t\n") { + Chunks.clear(); + InlineDiff(Chunks, a, b, delims); + Res.clear(); + PrintChunks(Res, *this, Chunks); + } + + const TString& Result() const { + return Res.Str(); + } +}; + +Y_UNIT_TEST_SUITE(DiffTokens) { + Y_UNIT_TEST(ReturnValue) { + TVector<TChunk<char>> res; + UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "aaa"), 0); + UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "aa"), 1); + UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "a"), 2); + UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "abc"), 2); + UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "aba"), 1); + UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "", "aba"), 3); + UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "aaaa"), 1); + UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "abc", "xyz"), 3); + } + + Y_UNIT_TEST(EqualStringsOneToken) { + TDiffTester tester; + + tester.Test("aaa", "aaa"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa"); + } + + Y_UNIT_TEST(NonCrossingStringsOneToken) { + TDiffTester tester; + + tester.Test("aaa", "bbb"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|bbb)"); + + tester.Test("aaa", "bbbb"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|bbbb)"); + } + + Y_UNIT_TEST(Simple) { + TDiffTester tester; + + tester.Test("aaa", "abb", ""); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "a(aa|bb)"); + + tester.Test("aac", "abc", ""); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "a(a|b)c"); + + tester.Test("123", "133", ""); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "1(2|3)3"); + + tester.Test("[1, 2, 3]", "[1, 3, 3]", ""); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "[1, (2|3), 3]"); + } + + Y_UNIT_TEST(CommonCharOneToken) { + TDiffTester tester; + + tester.Test("abcde", "accfg"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(abcde|accfg)"); + } + + Y_UNIT_TEST(EqualStringsTwoTokens) { + TDiffTester tester; + + TStringBuf str("aaa bbb"); + tester.Test(str, str); + + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa bbb"); + } + + Y_UNIT_TEST(NonCrossingStringsTwoTokens) { + TDiffTester tester; + + tester.Test("aaa bbb", "ccc ddd"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|ccc) (bbb|ddd)"); + + tester.Test("aaa bbb", "c d"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|c) (bbb|d)"); + } + + Y_UNIT_TEST(SimpleTwoTokens) { + TDiffTester tester; + + tester.Test("aaa ccd", "abb cce"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|abb) (ccd|cce)"); + + tester.Test("aac cbb", "aa bb"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aac|aa) (cbb|bb)"); + } + + Y_UNIT_TEST(MixedTwoTokens) { + TDiffTester tester; + + tester.Test("aaa bbb", "bbb aaa"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(|bbb )aaa( bbb|)"); + + tester.Test("aaa bbb", " bbb aaa"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|) bbb(| aaa)"); + + tester.Test(" aaa bbb ", " bbb aaa "); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(| bbb) aaa (bbb |)"); + + tester.Test("aaa bb", " bbb aa"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|) (bb|bbb aa)"); + } + + Y_UNIT_TEST(TwoTokensInOneString) { + TDiffTester tester; + + tester.Test("aaa bbb", "aaa"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa( bbb|)"); + + tester.Test("aaa bbb", "aaa "); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa (bbb|)"); + + tester.Test("aaa bbb", " bbb"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|) bbb"); + + tester.Test("aaa bbb", "bbb"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa |)bbb"); + } + + Y_UNIT_TEST(Multiline) { + TDiffTester tester; + + tester.Test("aaa\nabc\nbbb", "aaa\nacc\nbbb"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa\n(abc|acc)\nbbb"); + + tester.Test("aaa\nabc\nbbb", "aaa\nac\nbbb"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa\n(abc|ac)\nbbb"); + } + + Y_UNIT_TEST(DifferentDelimiters) { + TDiffTester tester; + + tester.Test("aaa bbb", "aaa\tbbb"); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa( |\t)bbb"); + + tester.Test(" aaa\tbbb\n", "\taaa\nbbb "); + //~ Cerr << tester.Result() << Endl; + UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "( |\t)aaa(\t|\n)bbb(\n| )"); + } +} diff --git a/library/cpp/diff/ut/ya.make b/library/cpp/diff/ut/ya.make new file mode 100644 index 0000000000..360134d186 --- /dev/null +++ b/library/cpp/diff/ut/ya.make @@ -0,0 +1,15 @@ +UNITTEST() + +OWNER(antonovvk) + +SRCDIR(library/cpp/diff) + +PEERDIR( + library/cpp/diff +) + +SRCS( + diff_ut.cpp +) + +END() diff --git a/library/cpp/diff/ya.make b/library/cpp/diff/ya.make new file mode 100644 index 0000000000..a05b7ccbbc --- /dev/null +++ b/library/cpp/diff/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +OWNER(antonovvk) + +PEERDIR( + library/cpp/lcs + library/cpp/containers/stack_array +) + +SRCS( + diff.cpp +) + +END() |