YQL-20112: Dramatically improve yql/utils/docs

Introduced the `links.json` format to link names to documentation sections. Implemented a general link verification framework. Also fixed two small typos. Extended Description: https://nda.ya.ru/t/zR4voivb7GzD9r. commit_hash:e72db0e202b4ff612374c73fa384f70d029f0ef0
author: vitya-smirnov <[email protected]> 2025-07-30 11:26:26 +0300
committer: vitya-smirnov <[email protected]> 2025-07-30 11:38:37 +0300
commit: cf9f591e5c90bf964bb922c0f6c3716045972b02 (patch)
tree: 36d4eb0816606653836399ac32ea58d2eca08b53 /yql/essentials/utils/docs/link_page.cpp
parent: ada885655c2e21f6b55e2d3d724e57c9a1fdb843 (diff)
1 files changed, 116 insertions, 0 deletions
diff --git a/yql/essentials/utils/docs/link_page.cpp b/yql/essentials/utils/docs/link_page.cpp
new file mode 100644
index 00000000000..eb71979462d
--- /dev/null
+++ b/yql/essentials/utils/docs/link_page.cpp
@@ -0,0 +1,116 @@
+#include "link_page.h"
+
+#include "name.h"
+
+#include <util/generic/hash_set.h>
+#include <util/string/split.h>
+
+namespace NYql::NDocs {
+
+    // Interprets the entire header text as one name.
+    // Returns the result of NormalizedName applied to an owned copy of `header`;
+    // the result is empty when NormalizedName yields no value.
+    TMaybe<TString> MatchSingleFunctionHeader(TStringBuf header) {
+        TString candidate(header);
+        return NormalizedName(std::move(candidate));
+    }
+
+    // Splits every element of `strings` by the substring `delim` and returns
+    // all resulting pieces in one flat vector: pieces of earlier inputs come
+    // first, and pieces of one input keep the order the splitter yields them in.
+    TVector<TString> SplitBy(TStringBuf delim, const TVector<TString>& strings) {
+        TVector<TString> pieces;
+        for (const TString& source : strings) {
+            // AddTo appends to `pieces`, so results accumulate across inputs.
+            StringSplitter(source).SplitByString(delim).AddTo(&pieces);
+        }
+        return pieces;
+    }
+
+    // Breaks a header into fragments by applying SplitBy successively with the
+    // separators " и ", " / " and ", " (in that order). Each pass re-splits the
+    // fragments produced by the previous one.
+    TVector<TString> SplitByPunctuation(TStringBuf header) {
+        const TStringBuf separators[] = {" и ", " / ", ", "};
+
+        TVector<TString> fragments = {TString(header)};
+        for (const TStringBuf separator : separators) {
+            fragments = SplitBy(separator, fragments);
+        }
+        return fragments;
+    }
+
+    // Treats the header as a list of names separated as in SplitByPunctuation.
+    // Returns the NormalizedName result for every fragment, in fragment order.
+    // The match is all-or-nothing: if NormalizedName yields no value for any
+    // fragment, an empty vector is returned.
+    TVector<TString> MatchMultiFunctionHeader(TStringBuf header) {
+        TVector<TString> fragments = SplitByPunctuation(header);
+
+        TVector<TString> result;
+        result.reserve(fragments.size());
+        for (TString& fragment : fragments) {
+            // The fragment is not needed afterwards, so it is handed over by move.
+            TMaybe<TString> normalized = NormalizedName(std::move(fragment));
+            if (!normalized.Defined()) {
+                return {};
+            }
+            result.push_back(std::move(*normalized));
+        }
+
+        return result;
+    }
+
+    // Extracts the normalized names a header stands for.
+    // The whole header is tried as a single name first; only when that fails
+    // is it tried as a list of names. The result is empty if neither matches.
+    TVector<TString> ExtractNormalized(TStringBuf header) {
+        const TMaybe<TString> single = MatchSingleFunctionHeader(header);
+        if (single.Defined()) {
+            return TVector<TString>{*single};
+        }
+
+        // An unsuccessful multi-name match is already an empty vector, which
+        // is exactly the "nothing extracted" result.
+        return MatchMultiFunctionHeader(header);
+    }
+
+    // Registers in `links` every name extracted from the content of `header`,
+    // pointing it at the page `path` and the anchor of this header.
+    // An entry that already exists under the same name is overwritten.
+    void EnrichFromMarkdown(TLinks& links, const TString& path, const TMarkdownHeader& header) {
+        const TVector<TString> names = ExtractNormalized(header.Content);
+        for (const TString& normalizedName : names) {
+            links[normalizedName] = {
+                .RelativePath = path,
+                .Anchor = header.Anchor,
+            };
+        }
+    }
+
+    // Registers in `links` the names found in the header of every section of
+    // `page`; `path` is recorded as the location of the page.
+    void EnrichFromMarkdown(TLinks& links, const TString& path, const TMarkdownPage& page) {
+        // Only the section is needed here; the map key is ignored because the
+        // header overload reads the anchor from the header itself.
+        for (const auto& [_, section] : page.SectionsByAnchor) {
+            EnrichFromMarkdown(links, path, section.Header);
+        }
+    }
+
+    // Registers in `links` the names found on every page of `pages`,
+    // using each page's key in `pages` as its path.
+    void EnrichFromMarkdown(TLinks& links, const TPages& pages) {
+        for (const auto& [pagePath, pageContent] : pages) {
+            EnrichFromMarkdown(links, pagePath, pageContent);
+        }
+    }
+
+    // Builds a fresh TLinks from the section headers of all `pages`.
+    TLinks GetLinksFromPages(const TPages& pages) {
+        TLinks collected;
+        EnrichFromMarkdown(collected, pages);
+        return collected;
+    }
+
+    // Returns `pages` reduced to the content that `links` refer to.
+    //
+    // A page whose path is not the RelativePath of any link is removed
+    // entirely, including a page that has no sections at all. On every
+    // remaining page, each section whose anchor is not the target of a link
+    // into that page is erased. A link without an anchor is tracked as the
+    // empty anchor "".
+    TPages Stripped(TPages&& pages, const TLinks& links) {
+        // Anchors referenced by links, grouped by the page path they point into.
+        THashMap<TString, THashSet<TString>> usedAnchors;
+        for (const auto& [_, link] : links) {
+            TString anchor = link.Anchor.GetOrElse("");
+            usedAnchors[link.RelativePath].emplace(std::move(anchor));
+        }
+
+        // Removals are collected first and applied afterwards, so that neither
+        // `pages` nor a page's sections are modified while being iterated.
+        THashSet<TString> unusedPaths;
+        THashMap<TString, THashSet<TString>> unusedAnchors;
+        for (const auto& [path, page] : pages) {
+            const auto used = usedAnchors.find(path);
+            if (used == usedAnchors.end()) {
+                // Decided once per page rather than per section, so that an
+                // unreferenced page without sections is dropped as well.
+                unusedPaths.emplace(path);
+                continue;
+            }
+
+            for (const auto& [anchor, _] : page.SectionsByAnchor) {
+                if (!used->second.contains(anchor)) {
+                    unusedAnchors[path].emplace(anchor);
+                }
+            }
+        }
+
+        for (const auto& [path, anchors] : unusedAnchors) {
+            for (const auto& anchor : anchors) {
+                pages[path].SectionsByAnchor.erase(anchor);
+            }
+        }
+
+        for (const auto& path : unusedPaths) {
+            pages.erase(path);
+        }
+
+        return pages;
+    }
+
+} // namespace NYql::NDocs
author	vitya-smirnov <[email protected]>	2025-07-30 11:26:26 +0300
committer	vitya-smirnov <[email protected]>	2025-07-30 11:38:37 +0300
commit	cf9f591e5c90bf964bb922c0f6c3716045972b02 (patch)
tree	36d4eb0816606653836399ac32ea58d2eca08b53 /yql/essentials/utils/docs/link_page.cpp
parent	ada885655c2e21f6b55e2d3d724e57c9a1fdb843 (diff)