aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/diff
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/diff
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/diff')
-rw-r--r--library/cpp/diff/README.md1
-rw-r--r--library/cpp/diff/diff.cpp87
-rw-r--r--library/cpp/diff/diff.h112
-rw-r--r--library/cpp/diff/diff_ut.cpp197
-rw-r--r--library/cpp/diff/ut/ya.make15
-rw-r--r--library/cpp/diff/ya.make14
6 files changed, 426 insertions, 0 deletions
diff --git a/library/cpp/diff/README.md b/library/cpp/diff/README.md
new file mode 100644
index 0000000000..ff68b10eae
--- /dev/null
+++ b/library/cpp/diff/README.md
@@ -0,0 +1 @@
+Note: underlying algorithm `library/cpp/lcs` has complexity of O(r log n) by time and O(r) of additional memory, where r is the number of pairs (i, j) for which S1[i] = S2[j]. When comparing file with itself (or with little modifications) it becomes quadratic on the number of occurences of the most frequent line.
diff --git a/library/cpp/diff/diff.cpp b/library/cpp/diff/diff.cpp
new file mode 100644
index 0000000000..be57da7f39
--- /dev/null
+++ b/library/cpp/diff/diff.cpp
@@ -0,0 +1,87 @@
+#include "diff.h"
+
+#include <util/generic/hash.h>
+#include <util/digest/fnv.h>
+
+#include <iterator>
+
+template <typename T>
+struct TCollectionImpl {
+ TVector<TConstArrayRef<T>> Words;
+ TVector<ui64> Keys;
+
+ inline bool Consume(const T* b, const T* e, const T*) {
+ if (b < e) {
+ Words.push_back(TConstArrayRef<T>(b, e));
+ Keys.push_back(FnvHash<ui64>((const char*)b, (e - b) * sizeof(T)));
+ }
+ return true;
+ }
+
+ TConstArrayRef<T> Remap(const TConstArrayRef<ui64>& keys) const {
+ if (keys.empty()) {
+ return TConstArrayRef<T>();
+ }
+ auto firstWordPos = std::distance(Keys.data(), keys.begin());
+ auto lastWordPos = std::distance(Keys.data(), keys.end()) - 1;
+ Y_ASSERT(firstWordPos >= 0);
+ Y_ASSERT(lastWordPos >= firstWordPos);
+ Y_ASSERT(static_cast<size_t>(lastWordPos) < Words.size());
+
+ return TConstArrayRef<T>(Words[firstWordPos].begin(), Words[lastWordPos].end());
+ }
+
+ TConstArrayRef<ui64> GetKeys() const {
+ return TConstArrayRef<ui64>(Keys);
+ }
+};
+
+template <typename T>
+struct TCollection {
+};
+
+template <>
+struct TCollection<char>: public TCollectionImpl<char> {
+ TCollection(const TStringBuf& str, const TString& delims) {
+ TSetDelimiter<const char> set(delims.data());
+ TKeepDelimiters<TCollection<char>> c(this);
+ SplitString(str.begin(), str.end(), set, c);
+ }
+};
+
+template <>
+struct TCollection<wchar16>: public TCollectionImpl<wchar16> {
+ TCollection(const TWtringBuf& str, const TUtf16String& delims) {
+ TSetDelimiter<const wchar16> set(delims.data());
+ TKeepDelimiters<TCollection<wchar16>> c(this);
+ SplitString(str.begin(), str.end(), set, c);
+ }
+};
+
+size_t NDiff::InlineDiff(TVector<TChunk<char>>& chunks, const TStringBuf& left, const TStringBuf& right, const TString& delims) {
+ if (delims.empty()) {
+ return InlineDiff<char>(chunks, TConstArrayRef<char>(left.data(), left.size()), TConstArrayRef<char>(right.data(), right.size()));
+ }
+ TCollection<char> c1(left, delims);
+ TCollection<char> c2(right, delims);
+ TVector<TChunk<ui64>> diff;
+ const size_t dist = InlineDiff<ui64>(diff, c1.GetKeys(), c2.GetKeys());
+ for (const auto& it : diff) {
+ chunks.push_back(TChunk<char>(c1.Remap(it.Left), c2.Remap(it.Right), c1.Remap(it.Common)));
+ }
+ return dist;
+}
+
+size_t NDiff::InlineDiff(TVector<TChunk<wchar16>>& chunks, const TWtringBuf& left, const TWtringBuf& right, const TUtf16String& delims) {
+ if (delims.empty()) {
+ return InlineDiff<wchar16>(chunks, TConstArrayRef<wchar16>(left.data(), left.size()), TConstArrayRef<wchar16>(right.data(), right.size()));
+ }
+ TCollection<wchar16> c1(left, delims);
+ TCollection<wchar16> c2(right, delims);
+ TVector<TChunk<ui64>> diff;
+ const size_t dist = InlineDiff<ui64>(diff, c1.GetKeys(), c2.GetKeys());
+ for (const auto& it : diff) {
+ chunks.push_back(TChunk<wchar16>(c1.Remap(it.Left), c2.Remap(it.Right), c1.Remap(it.Common)));
+ }
+ return dist;
+}
diff --git a/library/cpp/diff/diff.h b/library/cpp/diff/diff.h
new file mode 100644
index 0000000000..94fb00cd0b
--- /dev/null
+++ b/library/cpp/diff/diff.h
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <library/cpp/lcs/lcs_via_lis.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/array_ref.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/stream/output.h>
+#include <util/string/split.h>
+
+namespace NDiff {
+ template <typename T>
+ struct TChunk {
+ TConstArrayRef<T> Left;
+ TConstArrayRef<T> Right;
+ TConstArrayRef<T> Common;
+
+ TChunk() = default;
+
+ TChunk(const TConstArrayRef<T>& left, const TConstArrayRef<T>& right, const TConstArrayRef<T>& common)
+ : Left(left)
+ , Right(right)
+ , Common(common)
+ {
+ }
+ };
+
+ template <typename T>
+ size_t InlineDiff(TVector<TChunk<T>>& chunks, const TConstArrayRef<T>& left, const TConstArrayRef<T>& right) {
+ TConstArrayRef<T> s1(left);
+ TConstArrayRef<T> s2(right);
+
+ bool swapped = false;
+ if (s1.size() < s2.size()) {
+ // NLCS will silently swap strings if second string is longer
+ // So we swap strings here and remember the fact since it is crucial to diff
+ DoSwap(s1, s2);
+ swapped = true;
+ }
+
+ TVector<T> lcs;
+ NLCS::TLCSCtx<T> ctx;
+ NLCS::MakeLCS<T>(s1, s2, &lcs, &ctx);
+
+ // Start points of current common and diff parts
+ const T* c1 = nullptr;
+ const T* c2 = nullptr;
+ const T* d1 = s1.begin();
+ const T* d2 = s2.begin();
+
+ // End points of current common parts
+ const T* e1 = s1.begin();
+ const T* e2 = s2.begin();
+
+ size_t dist = s1.size() - lcs.size();
+
+ const size_t n = ctx.ResultBuffer.size();
+ for (size_t i = 0; i <= n && (e1 != s1.end() || e2 != s2.end());) {
+ if (i < n) {
+ // Common character exists
+ // LCS is marked against positions in s2
+ // Save the beginning of common part in s2
+ c2 = s2.begin() + ctx.ResultBuffer[i];
+ // Find the beginning of common part in s1
+ c1 = Find(e1, s1.end(), *c2);
+ // Follow common substring
+ for (e1 = c1, e2 = c2; i < n && *e1 == *e2; ++e1, ++e2) {
+ ++i;
+ }
+ } else {
+ // No common character, common part is empty
+ c1 = s1.end();
+ c2 = s2.end();
+ e1 = s1.end();
+ e2 = s2.end();
+ }
+
+ TChunk<T> chunk(TConstArrayRef<T>(d1, c1), TConstArrayRef<T>(d2, c2), TConstArrayRef<T>(c1, e1));
+ if (swapped) {
+ DoSwap(chunk.Left, chunk.Right);
+ chunk.Common = TConstArrayRef<T>(c2, e2);
+ }
+ chunks.push_back(chunk);
+
+ d1 = e1;
+ d2 = e2;
+ }
+
+ return dist;
+ }
+
+ template <typename TFormatter, typename T>
+ void PrintChunks(IOutputStream& out, const TFormatter& fmt, const TVector<TChunk<T>>& chunks) {
+ for (typename TVector<TChunk<T>>::const_iterator chunk = chunks.begin(); chunk != chunks.end(); ++chunk) {
+ if (!chunk->Left.empty() || !chunk->Right.empty()) {
+ out << fmt.Special("(");
+ out << fmt.Left(chunk->Left);
+ out << fmt.Special("|");
+ out << fmt.Right(chunk->Right);
+ out << fmt.Special(")");
+ }
+ out << fmt.Common(chunk->Common);
+ }
+ }
+
+ // Without delimiters calculates character-wise diff
+ // With delimiters calculates token-wise diff
+ size_t InlineDiff(TVector<TChunk<char>>& chunks, const TStringBuf& left, const TStringBuf& right, const TString& delims = TString());
+ size_t InlineDiff(TVector<TChunk<wchar16>>& chunks, const TWtringBuf& left, const TWtringBuf& right, const TUtf16String& delims = TUtf16String());
+
+}
diff --git a/library/cpp/diff/diff_ut.cpp b/library/cpp/diff/diff_ut.cpp
new file mode 100644
index 0000000000..b82a7b000e
--- /dev/null
+++ b/library/cpp/diff/diff_ut.cpp
@@ -0,0 +1,197 @@
+#include "diff.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NDiff;
+
+struct TDiffTester {
+ TStringStream Res;
+ TVector<TChunk<char>> Chunks;
+
+ TStringBuf Special(const TStringBuf& str) const {
+ return str;
+ }
+
+ TStringBuf Common(const TConstArrayRef<const char>& str) const {
+ return TStringBuf(str.begin(), str.end());
+ }
+
+ TStringBuf Left(const TConstArrayRef<const char>& str) const {
+ return TStringBuf(str.begin(), str.end());
+ }
+
+ TStringBuf Right(const TConstArrayRef<const char>& str) const {
+ return TStringBuf(str.begin(), str.end());
+ }
+
+ void Test(const TStringBuf& a, const TStringBuf& b, const TString& delims = " \t\n") {
+ Chunks.clear();
+ InlineDiff(Chunks, a, b, delims);
+ Res.clear();
+ PrintChunks(Res, *this, Chunks);
+ }
+
+ const TString& Result() const {
+ return Res.Str();
+ }
+};
+
+Y_UNIT_TEST_SUITE(DiffTokens) {
+ Y_UNIT_TEST(ReturnValue) {
+ TVector<TChunk<char>> res;
+ UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "aaa"), 0);
+ UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "aa"), 1);
+ UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "a"), 2);
+ UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "abc"), 2);
+ UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "aba"), 1);
+ UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "", "aba"), 3);
+ UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "aaa", "aaaa"), 1);
+ UNIT_ASSERT_VALUES_EQUAL(InlineDiff(res, "abc", "xyz"), 3);
+ }
+
+ Y_UNIT_TEST(EqualStringsOneToken) {
+ TDiffTester tester;
+
+ tester.Test("aaa", "aaa");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa");
+ }
+
+ Y_UNIT_TEST(NonCrossingStringsOneToken) {
+ TDiffTester tester;
+
+ tester.Test("aaa", "bbb");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|bbb)");
+
+ tester.Test("aaa", "bbbb");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|bbbb)");
+ }
+
+ Y_UNIT_TEST(Simple) {
+ TDiffTester tester;
+
+ tester.Test("aaa", "abb", "");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "a(aa|bb)");
+
+ tester.Test("aac", "abc", "");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "a(a|b)c");
+
+ tester.Test("123", "133", "");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "1(2|3)3");
+
+ tester.Test("[1, 2, 3]", "[1, 3, 3]", "");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "[1, (2|3), 3]");
+ }
+
+ Y_UNIT_TEST(CommonCharOneToken) {
+ TDiffTester tester;
+
+ tester.Test("abcde", "accfg");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(abcde|accfg)");
+ }
+
+ Y_UNIT_TEST(EqualStringsTwoTokens) {
+ TDiffTester tester;
+
+ TStringBuf str("aaa bbb");
+ tester.Test(str, str);
+
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa bbb");
+ }
+
+ Y_UNIT_TEST(NonCrossingStringsTwoTokens) {
+ TDiffTester tester;
+
+ tester.Test("aaa bbb", "ccc ddd");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|ccc) (bbb|ddd)");
+
+ tester.Test("aaa bbb", "c d");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|c) (bbb|d)");
+ }
+
+ Y_UNIT_TEST(SimpleTwoTokens) {
+ TDiffTester tester;
+
+ tester.Test("aaa ccd", "abb cce");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|abb) (ccd|cce)");
+
+ tester.Test("aac cbb", "aa bb");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aac|aa) (cbb|bb)");
+ }
+
+ Y_UNIT_TEST(MixedTwoTokens) {
+ TDiffTester tester;
+
+ tester.Test("aaa bbb", "bbb aaa");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(|bbb )aaa( bbb|)");
+
+ tester.Test("aaa bbb", " bbb aaa");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|) bbb(| aaa)");
+
+ tester.Test(" aaa bbb ", " bbb aaa ");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(| bbb) aaa (bbb |)");
+
+ tester.Test("aaa bb", " bbb aa");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|) (bb|bbb aa)");
+ }
+
+ Y_UNIT_TEST(TwoTokensInOneString) {
+ TDiffTester tester;
+
+ tester.Test("aaa bbb", "aaa");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa( bbb|)");
+
+ tester.Test("aaa bbb", "aaa ");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa (bbb|)");
+
+ tester.Test("aaa bbb", " bbb");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa|) bbb");
+
+ tester.Test("aaa bbb", "bbb");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "(aaa |)bbb");
+ }
+
+ Y_UNIT_TEST(Multiline) {
+ TDiffTester tester;
+
+ tester.Test("aaa\nabc\nbbb", "aaa\nacc\nbbb");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa\n(abc|acc)\nbbb");
+
+ tester.Test("aaa\nabc\nbbb", "aaa\nac\nbbb");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa\n(abc|ac)\nbbb");
+ }
+
+ Y_UNIT_TEST(DifferentDelimiters) {
+ TDiffTester tester;
+
+ tester.Test("aaa bbb", "aaa\tbbb");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "aaa( |\t)bbb");
+
+ tester.Test(" aaa\tbbb\n", "\taaa\nbbb ");
+ //~ Cerr << tester.Result() << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(tester.Result(), "( |\t)aaa(\t|\n)bbb(\n| )");
+ }
+}
diff --git a/library/cpp/diff/ut/ya.make b/library/cpp/diff/ut/ya.make
new file mode 100644
index 0000000000..360134d186
--- /dev/null
+++ b/library/cpp/diff/ut/ya.make
@@ -0,0 +1,15 @@
+UNITTEST()
+
+OWNER(antonovvk)
+
+SRCDIR(library/cpp/diff)
+
+PEERDIR(
+ library/cpp/diff
+)
+
+SRCS(
+ diff_ut.cpp
+)
+
+END()
diff --git a/library/cpp/diff/ya.make b/library/cpp/diff/ya.make
new file mode 100644
index 0000000000..a05b7ccbbc
--- /dev/null
+++ b/library/cpp/diff/ya.make
@@ -0,0 +1,14 @@
+LIBRARY()
+
+OWNER(antonovvk)
+
+PEERDIR(
+ library/cpp/lcs
+ library/cpp/containers/stack_array
+)
+
+SRCS(
+ diff.cpp
+)
+
+END()