summaryrefslogtreecommitdiffstats
path: root/library/cpp/diff/diff.h
diff options
context:
space:
mode:
authorDevtools Arcadia <[email protected]>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <[email protected]>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/diff/diff.h
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/diff/diff.h')
-rw-r--r--library/cpp/diff/diff.h112
1 files changed, 112 insertions, 0 deletions
diff --git a/library/cpp/diff/diff.h b/library/cpp/diff/diff.h
new file mode 100644
index 00000000000..94fb00cd0b3
--- /dev/null
+++ b/library/cpp/diff/diff.h
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <library/cpp/lcs/lcs_via_lis.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/array_ref.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/stream/output.h>
+#include <util/string/split.h>
+
+namespace NDiff {
+ template <typename T>
+ struct TChunk {
+ TConstArrayRef<T> Left;
+ TConstArrayRef<T> Right;
+ TConstArrayRef<T> Common;
+
+ TChunk() = default;
+
+ TChunk(const TConstArrayRef<T>& left, const TConstArrayRef<T>& right, const TConstArrayRef<T>& common)
+ : Left(left)
+ , Right(right)
+ , Common(common)
+ {
+ }
+ };
+
+ template <typename T>
+ size_t InlineDiff(TVector<TChunk<T>>& chunks, const TConstArrayRef<T>& left, const TConstArrayRef<T>& right) {
+ TConstArrayRef<T> s1(left);
+ TConstArrayRef<T> s2(right);
+
+ bool swapped = false;
+ if (s1.size() < s2.size()) {
+ // NLCS will silently swap strings if second string is longer
+ // So we swap strings here and remember the fact since it is crucial to diff
+ DoSwap(s1, s2);
+ swapped = true;
+ }
+
+ TVector<T> lcs;
+ NLCS::TLCSCtx<T> ctx;
+ NLCS::MakeLCS<T>(s1, s2, &lcs, &ctx);
+
+ // Start points of current common and diff parts
+ const T* c1 = nullptr;
+ const T* c2 = nullptr;
+ const T* d1 = s1.begin();
+ const T* d2 = s2.begin();
+
+ // End points of current common parts
+ const T* e1 = s1.begin();
+ const T* e2 = s2.begin();
+
+ size_t dist = s1.size() - lcs.size();
+
+ const size_t n = ctx.ResultBuffer.size();
+ for (size_t i = 0; i <= n && (e1 != s1.end() || e2 != s2.end());) {
+ if (i < n) {
+ // Common character exists
+ // LCS is marked against positions in s2
+ // Save the beginning of common part in s2
+ c2 = s2.begin() + ctx.ResultBuffer[i];
+ // Find the beginning of common part in s1
+ c1 = Find(e1, s1.end(), *c2);
+ // Follow common substring
+ for (e1 = c1, e2 = c2; i < n && *e1 == *e2; ++e1, ++e2) {
+ ++i;
+ }
+ } else {
+ // No common character, common part is empty
+ c1 = s1.end();
+ c2 = s2.end();
+ e1 = s1.end();
+ e2 = s2.end();
+ }
+
+ TChunk<T> chunk(TConstArrayRef<T>(d1, c1), TConstArrayRef<T>(d2, c2), TConstArrayRef<T>(c1, e1));
+ if (swapped) {
+ DoSwap(chunk.Left, chunk.Right);
+ chunk.Common = TConstArrayRef<T>(c2, e2);
+ }
+ chunks.push_back(chunk);
+
+ d1 = e1;
+ d2 = e2;
+ }
+
+ return dist;
+ }
+
+ template <typename TFormatter, typename T>
+ void PrintChunks(IOutputStream& out, const TFormatter& fmt, const TVector<TChunk<T>>& chunks) {
+ for (typename TVector<TChunk<T>>::const_iterator chunk = chunks.begin(); chunk != chunks.end(); ++chunk) {
+ if (!chunk->Left.empty() || !chunk->Right.empty()) {
+ out << fmt.Special("(");
+ out << fmt.Left(chunk->Left);
+ out << fmt.Special("|");
+ out << fmt.Right(chunk->Right);
+ out << fmt.Special(")");
+ }
+ out << fmt.Common(chunk->Common);
+ }
+ }
+
+ // Without delimiters calculates character-wise diff
+ // With delimiters calculates token-wise diff
+ size_t InlineDiff(TVector<TChunk<char>>& chunks, const TStringBuf& left, const TStringBuf& right, const TString& delims = TString());
+ size_t InlineDiff(TVector<TChunk<wchar16>>& chunks, const TWtringBuf& left, const TWtringBuf& right, const TUtf16String& delims = TUtf16String());
+
+}