summaryrefslogtreecommitdiffstats
path: root/yql/essentials/utils/docs/link_page.cpp
diff options
context:
space:
mode:
authorvitya-smirnov <[email protected]>2025-07-30 11:26:26 +0300
committervitya-smirnov <[email protected]>2025-07-30 11:38:37 +0300
commitcf9f591e5c90bf964bb922c0f6c3716045972b02 (patch)
tree36d4eb0816606653836399ac32ea58d2eca08b53 /yql/essentials/utils/docs/link_page.cpp
parentada885655c2e21f6b55e2d3d724e57c9a1fdb843 (diff)
YQL-20112: Improve dramatically yql/utils/docs
Introduced the `links.json` format to link names to documentation sections. Implemented a general link verification framework. Also fixed two small typos. Extended description: https://nda.ya.ru/t/zR4voivb7GzD9r. commit_hash:e72db0e202b4ff612374c73fa384f70d029f0ef0
Diffstat (limited to 'yql/essentials/utils/docs/link_page.cpp')
-rw-r--r--yql/essentials/utils/docs/link_page.cpp116
1 files changed, 116 insertions, 0 deletions
diff --git a/yql/essentials/utils/docs/link_page.cpp b/yql/essentials/utils/docs/link_page.cpp
new file mode 100644
index 00000000000..eb71979462d
--- /dev/null
+++ b/yql/essentials/utils/docs/link_page.cpp
@@ -0,0 +1,116 @@
+#include "link_page.h"
+
+#include "name.h"
+
+#include <util/generic/hash_set.h>
+#include <util/string/split.h>
+
+namespace NYql::NDocs {
+
+    // Treats the entire header text as a single candidate name and hands it to
+    // NormalizedName. The result is whatever NormalizedName produces for that
+    // text (an empty TMaybe presumably means "not a valid name" - see name.h).
+    TMaybe<TString> MatchSingleFunctionHeader(TStringBuf header) {
+        TString candidate(header);
+        return NormalizedName(std::move(candidate));
+    }
+
+    // Splits each element of `strings` on the substring `delim` and gathers the
+    // pieces of all elements into one vector via StringSplitter's AddTo.
+    TVector<TString> SplitBy(TStringBuf delim, const TVector<TString>& strings) {
+        TVector<TString> pieces;
+        for (const auto& source : strings) {
+            StringSplitter(source)
+                .SplitByString(delim)
+                .AddTo(&pieces);
+        }
+        return pieces;
+    }
+
+    // Breaks a header into fragments by applying the separators " и ", " / "
+    // and ", " one after another; each pass re-splits the fragments produced
+    // by the previous one.
+    TVector<TString> SplitByPunctuation(TStringBuf header) {
+        const TStringBuf separators[] = {" и ", " / ", ", "};
+
+        TVector<TString> fragments = {TString(header)};
+        for (const TStringBuf separator : separators) {
+            fragments = SplitBy(separator, fragments);
+        }
+        return fragments;
+    }
+
+    // Interprets a header as a list of names separated by punctuation.
+    //
+    // Every fragment returned by SplitByPunctuation is passed through
+    // NormalizedName and replaced in place by the normalized form. If
+    // NormalizedName yields no value for any fragment, the whole header is
+    // rejected and an empty vector is returned.
+    TVector<TString> MatchMultiFunctionHeader(TStringBuf header) {
+        TVector<TString> candidates = SplitByPunctuation(header);
+
+        for (auto& candidate : candidates) {
+            // The fragment is moved out here and overwritten below, so the
+            // moved-from value is never observed.
+            TMaybe<TString> normalized = NormalizedName(std::move(candidate));
+            if (!normalized) {
+                // A single bad fragment invalidates the entire header.
+                return {};
+            }
+            candidate = std::move(*normalized);
+        }
+
+        return candidates;
+    }
+
+    // Returns the normalized names found in a header.
+    //
+    // The header is first tried as one single name; only when that fails is it
+    // tried as a punctuation-separated list. An empty vector means neither
+    // interpretation matched.
+    TVector<TString> ExtractNormalized(TStringBuf header) {
+        const TMaybe<TString> single = MatchSingleFunctionHeader(header);
+        if (single) {
+            return {*single};
+        }
+
+        // MatchMultiFunctionHeader already reports "no match" as an empty
+        // vector, so its result can be returned as is.
+        return MatchMultiFunctionHeader(header);
+    }
+
+    // Registers a link for every normalized name extracted from `header`.
+    //
+    // Each name is mapped to the page at `path` together with the header's
+    // anchor. An entry already stored under the same name is overwritten.
+    void EnrichFromMarkdown(TLinks& links, const TString& path, const TMarkdownHeader& header) {
+        const TVector<TString> names = ExtractNormalized(header.Content);
+        for (const auto& name : names) {
+            links[name] = {
+                .RelativePath = path,
+                .Anchor = header.Anchor,
+            };
+        }
+    }
+
+    // Registers links for the header of every section of `page`, attributing
+    // them to the page located at `path`.
+    void EnrichFromMarkdown(TLinks& links, const TString& path, const TMarkdownPage& page) {
+        for (const auto& [_, section] : page.SectionsByAnchor) {
+            EnrichFromMarkdown(links, path, section.Header);
+        }
+    }
+
+    // Registers links for every page in `pages`; the key of each entry is used
+    // as the page's path.
+    void EnrichFromMarkdown(TLinks& links, const TPages& pages) {
+        for (const auto& [pagePath, pageContent] : pages) {
+            EnrichFromMarkdown(links, pagePath, pageContent);
+        }
+    }
+
+    // Builds a fresh link table from the section headers of all given pages.
+    TLinks GetLinksFromPages(const TPages& pages) {
+        TLinks collected;
+        for (const auto& [path, page] : pages) {
+            EnrichFromMarkdown(collected, path, page);
+        }
+        return collected;
+    }
+
+    // Returns `pages` reduced to the content that `links` actually points at.
+    //
+    // - A page whose path is not the RelativePath of any link is removed
+    //   entirely, including pages that contain no sections at all.
+    // - Within a referenced page, every section whose anchor is not the target
+    //   of some link to that page is removed.
+    //
+    // A link without an anchor is recorded as referencing the empty anchor "".
+    TPages Stripped(TPages&& pages, const TLinks& links) {
+        // path -> set of anchors referenced by at least one link.
+        THashMap<TString, THashSet<TString>> usedAnchors;
+        for (const auto& [_, link] : links) {
+            TString anchor = link.Anchor.GetOrElse("");
+            usedAnchors[link.RelativePath].emplace(std::move(anchor));
+        }
+
+        // Removals are collected first and applied afterwards, so `pages` is
+        // never modified while it is being iterated.
+        THashSet<TString> unusedPaths;
+        THashMap<TString, THashSet<TString>> unusedAnchors;
+        for (const auto& [path, page] : pages) {
+            // Decide page-level usage once per page, outside the section loop:
+            // this also catches unreferenced pages that have no sections.
+            const auto used = usedAnchors.find(path);
+            if (used == usedAnchors.end()) {
+                unusedPaths.emplace(path);
+                continue;
+            }
+
+            for (const auto& [anchor, _] : page.SectionsByAnchor) {
+                if (!used->second.contains(anchor)) {
+                    unusedAnchors[path].emplace(anchor);
+                }
+            }
+        }
+
+        for (const auto& [path, anchors] : unusedAnchors) {
+            auto& sections = pages[path].SectionsByAnchor;
+            for (const auto& anchor : anchors) {
+                sections.erase(anchor);
+            }
+        }
+
+        for (const auto& path : unusedPaths) {
+            pages.erase(path);
+        }
+
+        return pages;
+    }
+
+} // namespace NYql::NDocs