path: root/library/cpp/robots_txt
author     qrort <qrort@yandex-team.com>  2022-11-30 23:47:12 +0300
committer  qrort <qrort@yandex-team.com>  2022-11-30 23:47:12 +0300
commit     22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree       bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/robots_txt
parent     332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download   ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/robots_txt')
-rw-r--r--  library/cpp/robots_txt/constants.h                        9
-rw-r--r--  library/cpp/robots_txt/prefix_tree.cpp                  172
-rw-r--r--  library/cpp/robots_txt/prefix_tree.h                     47
-rw-r--r--  library/cpp/robots_txt/prefix_tree_rules_handler.cpp    706
-rw-r--r--  library/cpp/robots_txt/robots_txt.h                     605
-rw-r--r--  library/cpp/robots_txt/robots_txt_parser.cpp            116
-rw-r--r--  library/cpp/robots_txt/robots_txt_parser.h               38
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg.h                     3
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp        2
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/bot_id_set.h        132
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp      2
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h       11
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/user_agents.cpp       2
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/user_agents.h       292
-rw-r--r--  library/cpp/robots_txt/rules_handler.cpp                514
15 files changed, 2651 insertions, 0 deletions
diff --git a/library/cpp/robots_txt/constants.h b/library/cpp/robots_txt/constants.h
new file mode 100644
index 0000000000..e5e2a57e18
--- /dev/null
+++ b/library/cpp/robots_txt/constants.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <util/generic/size_literals.h>
+#include <util/system/defaults.h>
+
+
+constexpr auto robots_max = 500_KB;
+constexpr auto max_rules_count = 10'000;
+constexpr auto max_rule_length = 10_KB;
diff --git a/library/cpp/robots_txt/prefix_tree.cpp b/library/cpp/robots_txt/prefix_tree.cpp
new file mode 100644
index 0000000000..f7b1848a43
--- /dev/null
+++ b/library/cpp/robots_txt/prefix_tree.cpp
@@ -0,0 +1,172 @@
+#include <cstring>
+#include <algorithm>
+
+#include "prefix_tree.h"
+
+TPrefixTreeNodeElement::TPrefixTreeNodeElement()
+ : Key(nullptr)
+ , KeyLen(0)
+ , Val(-1)
+ , Index(-1)
+{
+}
+
+TPrefixTreeNodeElement::TPrefixTreeNodeElement(const char* key, i32 keyLen = 0, i32 val = -1, i32 index = -1)
+ : Key(key)
+ , KeyLen(keyLen)
+ , Val(val)
+ , Index(index)
+{
+}
+
+TPrefixTreeNode::TPrefixTreeNode()
+ : Elements()
+{
+}
+
+int TPrefixTreeNode::Find(char ch) const {
+ for (size_t i = 0; i < Elements.size(); ++i)
+ if (ch == *(Elements[i].Key))
+ return i;
+ return -1;
+}
+
+void TPrefixTreeNode::Set(const char* key, i32 keyLen, i32 val, i32 index) {
+ TPrefixTreeNodeElement element(key, keyLen, val, index);
+ int i = Find(*key);
+ if (i < 0)
+ Elements.push_back(element);
+ else
+ Elements[i] = element;
+}
+
+void TPrefixTreeNode::Dump(FILE* logFile) const {
+ if (!logFile)
+ logFile = stderr;
+ fprintf(logFile, "size=%" PRISZT "\n", Elements.size());
+ static char b[1234];
+ for (size_t i = 0; i < Elements.size(); ++i) {
+ strncpy(b, Elements[i].Key, Elements[i].KeyLen);
+ b[Elements[i].KeyLen] = 0;
+ fprintf(logFile, "{key=[%s]:%d, val=%d, index=%d}\n", b, Elements[i].KeyLen, Elements[i].Val, Elements[i].Index);
+ }
+}
+
+void TPrefixTree::Dump(FILE* logFile) const {
+ if (!logFile)
+ logFile = stderr;
+ fprintf(logFile, "%" PRISZT " nodes\n", Nodes.size());
+ for (size_t i = 0; i < Nodes.size(); ++i) {
+ fprintf(logFile, "%" PRISZT ": ", i);
+ Nodes[i].Dump(logFile);
+ fprintf(logFile, "\n");
+ }
+}
+
+TPrefixTree::TPrefixTree(int maxSize) {
+ Init(maxSize);
+}
+
+void TPrefixTree::Init(int maxSize) {
+ Nodes.clear();
+ Nodes.reserve(std::max(maxSize + 1, 1));
+ Nodes.push_back(TPrefixTreeNode());
+}
+
+void TPrefixTree::Clear() {
+ Nodes.clear();
+ Init(0);
+}
+
+void TPrefixTree::Add(const char* s, i32 index) {
+ AddInternal(s, Nodes[0], index);
+}
+
+void TPrefixTree::AddInternal(const char* s, TPrefixTreeNode& node, i32 index) {
+ if (!s || !*s)
+ return;
+
+ int i = node.Find(*s);
+ if (i >= 0) {
+ TPrefixTreeNodeElement& d = node.Elements[i];
+ const char* p = d.Key;
+ while (*s && (p - d.Key) < d.KeyLen && *s == *p)
+ ++s, ++p;
+
+ if (*s) {
+ if ((p - d.Key) < d.KeyLen) {
+ Nodes.push_back(TPrefixTreeNode());
+ Nodes.back().Set(p, d.KeyLen - (p - d.Key), d.Val, d.Index);
+ Nodes.back().Set(s, strlen(s), -1, index);
+
+ d.Val = Nodes.size() - 1;
+ d.KeyLen = p - d.Key;
+ d.Index = INDEX_BOUND;
+ } else {
+ if (d.Val != -1 && index < d.Index)
+ AddInternal(s, Nodes[d.Val], index);
+ }
+ } else {
+ if ((p - d.Key) < d.KeyLen) {
+ Nodes.push_back(TPrefixTreeNode());
+ Nodes.back().Set(p, d.KeyLen - (p - d.Key), d.Val, d.Index);
+ d.Val = Nodes.size() - 1;
+ d.KeyLen = p - d.Key;
+ d.Index = index;
+ } else {
+ d.Index = std::min(d.Index, index);
+ }
+ }
+ } else {
+ node.Set(s, strlen(s), -1, index);
+ }
+}
+
+int TPrefixTree::GetMemorySize() const {
+ int res = Nodes.capacity() * sizeof(TPrefixTreeNode);
+ for (size_t i = 0; i < Nodes.size(); ++i)
+ res += Nodes[i].Elements.capacity() * sizeof(TPrefixTreeNodeElement);
+ return res;
+}
+
+void TPrefixTree::Compress() {
+ Nodes.shrink_to_fit();
+ for (size_t i = 0; i < Nodes.size(); ++i)
+ Nodes[i].Elements.shrink_to_fit();
+}
+
+i32 TPrefixTree::MinPrefixIndex(const char* s) const {
+ if (!*s)
+ return -1;
+ int i = Nodes[0].Find(*s);
+ if (i < 0)
+ return -1;
+ const TPrefixTreeNodeElement* d = &Nodes[0].Elements[i];
+
+ const char* p = d->Key;
+ if (!p || !*p)
+ return -1;
+
+ i32 result = INDEX_BOUND;
+ i32 nodeIndex = 0;
+ while (*s == *p) {
+ if (++p - d->Key >= d->KeyLen)
+ result = std::min(result, d->Index);
+ if (!*++s)
+ break;
+
+ if (p - d->Key >= d->KeyLen) {
+ nodeIndex = d->Val;
+ if (nodeIndex == -1)
+ break;
+ i = Nodes[nodeIndex].Find(*s);
+ if (i < 0)
+ break;
+ d = &Nodes[nodeIndex].Elements[i];
+ p = d->Key;
+ if (!p || !*p)
+ break;
+ }
+ }
+ return result < INDEX_BOUND ? result : -1;
+}
diff --git a/library/cpp/robots_txt/prefix_tree.h b/library/cpp/robots_txt/prefix_tree.h
new file mode 100644
index 0000000000..5feafcb74d
--- /dev/null
+++ b/library/cpp/robots_txt/prefix_tree.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <util/generic/ptr.h>
+#include <util/generic/vector.h>
+#include <cstdio>
+#include <util/generic/noncopyable.h>
+
+struct TPrefixTreeNodeElement {
+ const char* Key;
+ i32 KeyLen;
+ i32 Val;
+ i32 Index;
+
+ TPrefixTreeNodeElement();
+ TPrefixTreeNodeElement(const char*, i32, i32, i32);
+};
+
+class TPrefixTreeNode {
+public:
+ TVector<TPrefixTreeNodeElement> Elements;
+ TPrefixTreeNode();
+
+ int Find(char) const;
+ void Set(const char*, i32, i32, i32);
+ void Dump(FILE*) const;
+};
+
+class TPrefixTree : TNonCopyable {
+private:
+ static const i32 INDEX_BOUND = 1 << 30;
+
+ TVector<TPrefixTreeNode> Nodes;
+
+public:
+ void Init(int);
+ TPrefixTree(int);
+
+ void Add(const char*, i32);
+ i32 MinPrefixIndex(const char*) const;
+ void Clear();
+ void Dump(FILE*) const;
+ int GetMemorySize() const;
+ void Compress();
+
+private:
+ void AddInternal(const char*, TPrefixTreeNode&, i32);
+};
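For orientation, a minimal sketch of driving the prefix tree above on its own (the rule strings, indices and capacity hint are made up; note that the tree stores raw pointers to its keys, so the strings must outlive it):

#include <library/cpp/robots_txt/prefix_tree.h>

#include <cstdio>

int main() {
    // The tree does not copy keys; it keeps const char* into the caller's storage.
    static const char* rules[] = {"/private", "/priv", "/img/"};

    TPrefixTree tree(8); // capacity hint; the node vector still grows on demand
    for (int i = 0; i < 3; ++i) {
        tree.Add(rules[i], i);
    }
    tree.Compress(); // shrink node/element vectors after the last Add

    // MinPrefixIndex returns the smallest index among the stored strings that are
    // prefixes of the argument, or -1 when none match.
    printf("%d\n", tree.MinPrefixIndex("/private/photo.jpg")); // 0: "/private" wins over "/priv"
    printf("%d\n", tree.MinPrefixIndex("/index.html"));        // -1
    return 0;
}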
diff --git a/library/cpp/robots_txt/prefix_tree_rules_handler.cpp b/library/cpp/robots_txt/prefix_tree_rules_handler.cpp
new file mode 100644
index 0000000000..8dd579d060
--- /dev/null
+++ b/library/cpp/robots_txt/prefix_tree_rules_handler.cpp
@@ -0,0 +1,706 @@
+#include "robots_txt.h"
+
+#include <util/digest/fnv.h>
+#include <util/system/tls.h>
+#include <util/generic/buffer.h>
+#include <util/generic/yexception.h>
+
+namespace {
+
+TString NormalizeRule(TStringBuf rule) {
+ TString result;
+ result.reserve(rule.size() + 1);
+
+ // remove consecutive '*'
+ for (auto c : rule) {
+ if (c != '*' || !result.EndsWith('*')) {
+ result.append(c);
+ }
+ }
+
+ if (rule == "*") {
+ result = "/*";
+ return result;
+ }
+
+ // unify suffix
+ if (result.EndsWith('$')) {
+ result.pop_back();
+ } else if (!result.EndsWith('*')) {
+ result.append('*');
+ }
+
+ return result;
+}
+
+// Prefix rules
+bool IsPrefixRule(TStringBuf rule) {
+ return rule.EndsWith('*') && !TStringBuf(rule.begin(), rule.end() - 1).Contains('*');
+}
+
+// Converts rule to internal representation, i.e.
+// For prefix rules: ("/foo", 'D') -> ("/foo", 'D')
+// For generic rules: ("/*foo", 'D') -> ("/*foo*", 'd'), ("/*foo$", 'A') -> ("/*foo", 'a')
+// The distinction is in uppercase/lowercase rule type
+std::pair<TString, char> ConvertRule(TStringBuf rule, char type) {
+ switch (type) {
+ case 'H':
+ case 'S':
+ case 'C':
+ case 'P':
+ return {TString(rule), type};
+ case 'A':
+ case 'D':
+ break;
+ default:
+ return {{}, type};
+ }
+
+ auto result = NormalizeRule(rule);
+ if (IsPrefixRule(result)) {
+ result.pop_back(); // remove extra '*' from the end
+ } else {
+ type = tolower(type);
+ }
+
+ return {std::move(result), type};
+}
+
+} // namespace
+
+TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler(
+ TBotIdSet supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(supportedBotIds, robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler(
+ std::initializer_list<ui32> supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(TBotIdSet(supportedBotIds), robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(supportedBotIds, robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+bool TPrefixTreeRobotsTxtRulesHandler::Empty(const ui32 botId) const {
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ return !botInfo || (botInfo->BufferPosition <= sizeof(botInfo->BufferPosition));
+}
+
+TRobotsTxtRulesIterator TPrefixTreeRobotsTxtRulesHandler::GetRulesIterator(const ui32 botId) const {
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ if (!botInfo) {
+ return {};
+ }
+ return TRobotsTxtRulesIterator(botInfo->Buffer.Get() + sizeof(botInfo->BufferPosition), botInfo->Buffer.Get() + botInfo->BufferPosition);
+}
+
+size_t TPrefixTreeRobotsTxtRulesHandler::GetMemorySize() {
+ size_t allBotsSize = 0;
+ for (const auto& botInfo : BotIdToPrefixTreeBotInfo) {
+ if (!botInfo) {
+ continue;
+ }
+
+ allBotsSize += botInfo->PrefixRules.GetMemorySize()
+ + botInfo->BufferSize * sizeof(char)
+ + botInfo->ComplexRulesSize * sizeof(char**)
+ + botInfo->RulesSize * sizeof(char*) + (1 << 8);
+ }
+ return allBotsSize;
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::ClearInternal(const ui32 botId) {
+ if (botId >= BotIdToPrefixTreeBotInfo.size()) {
+ return;
+ }
+ BotIdToPrefixTreeBotInfo[botId].Reset();
+ TRobotsTxtRulesHandlerBase::ClearInternal(botId);
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::OptimizeSize() {
+ ResetOptimized();
+
+ TMap<ui64, ui32> hashToBotId;
+ for (auto botId : LoadedBotIds) {
+ auto& botInfo = BotIdToPrefixTreeBotInfo[botId];
+ if (botInfo->BufferPosition <= sizeof(ui32)) {
+ botInfo.Reset();
+ LoadedBotIds.remove(botId);
+ continue;
+ }
+
+ ui64 hash = FnvHash<ui64>(botInfo->Buffer.Get(), botInfo->BufferPosition);
+ if (auto p = hashToBotId.FindPtr(hash)) {
+ OptimizedBotIdToStoredBotId[botId] = *p;
+ ClearInternal(botId);
+ botInfo.Reset();
+ } else {
+ hashToBotId[hash] = botId;
+ }
+ }
+
+ if (IsFullTotal()) {
+ DoAllowAll();
+ return false;
+ }
+
+ return true;
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::Clear() {
+ for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId)
+ if (IsBotIdSupported(botId))
+ ClearInternal(botId);
+ TRobotsTxtRulesHandlerBase::Clear();
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::ResizeBuffer(const ui32 botId, int newSize) {
+ auto& botInfo = GetInfo(botId);
+ TArrayHolder<char> newBuffer(new char[newSize]);
+ memcpy(newBuffer.Get(), botInfo.Buffer.Get(), std::min(botInfo.BufferSize, newSize));
+ botInfo.Buffer.Swap(newBuffer);
+ botInfo.BufferSize = newSize;
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::AddRule(const ui32 botId, TStringBuf rule, char type) {
+ if (rule.empty() || rule.Contains('\0')) {
+ return true;
+ }
+
+ auto& botInfo = GetInfo(botId);
+
+ if (IsFull(botId, rule.size())) {
+ DoAllowAll();
+ return false;
+ }
+
+ auto [convertedRule, convertedType] = ConvertRule(rule, type);
+ const auto len = convertedRule.size() + 2; // 1 byte for convertedType and another for '\0'
+
+ if (auto newPos = botInfo.BufferPosition + len; newPos >= size_t(botInfo.BufferSize)) {
+ size_t newSize = botInfo.BufferSize;
+ while (newPos >= newSize)
+ newSize *= 2;
+ ResizeBuffer(botId, newSize);
+ }
+
+ auto out = botInfo.Buffer.Get() + botInfo.BufferPosition;
+ *out++ = convertedType;
+ strcpy(out, convertedRule.data());
+ botInfo.BufferPosition += len;
+
+ if (type == 'A' || type == 'D') {
+ botInfo.RulesPosition++;
+ }
+
+ return true;
+}
+
+const char* TPrefixTreeRobotsTxtRulesHandler::GetRule(const ui32 botId, const char* s, char type) const {
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ if (!botInfo) {
+ return nullptr;
+ }
+
+ int m = botInfo->RulesPosition + 1;
+ int k = botInfo->PrefixRules.MinPrefixIndex(s);
+ if (k >= 0)
+ m = k;
+ char* rule;
+ int j;
+ for (int i = 0; i < botInfo->ComplexRulesPosition; ++i) {
+ rule = *botInfo->ComplexRules.Get()[i];
+ j = botInfo->ComplexRules.Get()[i] - botInfo->Rules.Get();
+ if (j >= m)
+ break;
+ if (CheckRule(s, rule)) {
+ m = j;
+ break;
+ }
+ }
+ if (m >= botInfo->RulesPosition)
+ return nullptr;
+ return toupper(*(botInfo->Rules.Get()[m] - 1)) == type ? botInfo->Rules.Get()[m] : nullptr;
+}
+
+inline bool TPrefixTreeRobotsTxtRulesHandler::IsAllowAll(const ui32 botId) const {
+ const auto id = GetMappedBotId(botId, false);
+ auto& botInfo = BotIdToPrefixTreeBotInfo[id ? *id : robotstxtcfg::id_anybot];
+ return botInfo && botInfo->AllowAll;
+}
+
+inline bool TPrefixTreeRobotsTxtRulesHandler::IsAllowAll() const {
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId)
+ if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsAllowAll(botId)) {
+ return false;
+ }
+
+ return true;
+}
+
+inline bool TPrefixTreeRobotsTxtRulesHandler::IsDisallowAll(const ui32 botId, bool useAny) const {
+ const auto id = GetMappedBotId(botId, false);
+ if (id) {
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[*id];
+ return botInfo && botInfo->DisallowAll;
+ }
+
+ auto& botInfo = BotIdToPrefixTreeBotInfo[robotstxtcfg::id_anybot];
+ return useAny && botInfo && botInfo->DisallowAll;
+}
+
+inline bool TPrefixTreeRobotsTxtRulesHandler::IsDisallowAll() const {
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId)
+ if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsDisallowAll(botId))
+ return false;
+
+ return true;
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::DoAllowAll() {
+ using robotstxtcfg::id_anybot;
+
+ // Drop all bots to default
+ SupportedBotIds.insert(id_anybot);
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (IsBotIdSupported(botId)) {
+ ClearInternal(botId);
+ OptimizedBotIdToStoredBotId[botId] = id_anybot;
+ LoadedBotIds.insert(botId);
+ }
+ }
+
+ // Initialize anybot with "allow all" rule
+ AddRule(id_anybot, "/", 'A');
+ GetInfo(id_anybot).AllowAll = true;
+ SaveRulesToBuffer();
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::DoDisallowAll() {
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (!IsBotIdSupported(botId))
+ continue;
+ ClearInternal(botId);
+ if (botId == robotstxtcfg::id_anybot) {
+ auto& botInfo = GetInfo(botId);
+ AddRule(botId, "/", 'D');
+ botInfo.DisallowAll = true;
+ SaveRulesToBuffer();
+ } else {
+ OptimizedBotIdToStoredBotId[botId] = robotstxtcfg::id_anybot;
+ }
+ LoadedBotIds.insert(botId);
+ }
+}
+
+const char* TPrefixTreeRobotsTxtRulesHandler::IsDisallow(const ui32 botId, const char* s, bool useAny) const {
+ const auto id = GetMappedBotId(botId, useAny);
+ if (!id)
+ return nullptr;
+
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[*id];
+ if (botInfo && IsDisallowAll(*id, useAny)) {
+ int index = (const_cast<TPrefixTreeRobotsTxtRulesHandler*>(this))->FindRuleAll(*botInfo, 'D');
+ if (index < 0) { //o_O
+ return botInfo->Rules.Get()[0];
+ } else {
+ return botInfo->Rules.Get()[index];
+ }
+ }
+
+ return GetRule(*id, s, 'D');
+}
+
+const char* TPrefixTreeRobotsTxtRulesHandler::IsAllow(const ui32 botId, const char* s) const {
+ const auto id = GetMappedBotId(botId, true);
+ if (auto p = GetRule(*id, s, 'A'))
+ return p;
+ return GetRule(*id, s, 'D') ? nullptr : "/";
+}
+
+int TPrefixTreeRobotsTxtRulesHandler::StrLenWithoutStars(const char* s) {
+ int len = 0;
+
+ for (size_t index = 0; s[index]; ++index) {
+ if (s[index] != '*') {
+ ++len;
+ }
+ }
+
+ return len;
+}
+
+int TPrefixTreeRobotsTxtRulesHandler::TraceBuffer(const ui32 botId, int countRules, const TArrayHolder<TRuleInfo>* ruleInfos) {
+ CheckBotIdValidity(botId);
+ auto& prefixBotInfo = GetInfo(botId);
+ TBotInfo& botInfo = BotIdToInfo[botId];
+
+ bool store = countRules >= 0;
+ if (store) {
+ prefixBotInfo.Rules.Reset(new char*[prefixBotInfo.RulesSize = countRules]);
+ }
+
+ int beg = -1, n = 0;
+ *((int*)prefixBotInfo.Buffer.Get()) = prefixBotInfo.BufferSize;
+ for (size_t i = sizeof(prefixBotInfo.BufferPosition); i < prefixBotInfo.BufferPosition; ++i)
+ if (prefixBotInfo.Buffer.Get()[i] == '\n' || prefixBotInfo.Buffer.Get()[i] == 0) {
+ if (beg < 0 || beg + 1 == (int)i)
+ continue;
+
+ char* s = prefixBotInfo.Buffer.Get() + beg;
+ if (store) {
+ switch (*s) {
+ case 'H':
+ HostDirective = s + 1;
+ break;
+ case 'S':
+ SiteMaps.insert(s + 1);
+ break;
+ case 'C':
+ ParseCrawlDelay(s + 1, botInfo.CrawlDelay);
+ break;
+ case 'P':
+ CleanParams.insert(s + 1);
+ break;
+ default:
+ prefixBotInfo.Rules.Get()[n] = s + 1;
+ (*ruleInfos).Get()[n].Len = StrLenWithoutStars(s + 1);
+ (*ruleInfos).Get()[n].Allow = toupper(*s) == 'A';
+
+ prefixBotInfo.HasAllow |= toupper(*s) == 'A';
+ prefixBotInfo.HasDisallow |= toupper(*s) == 'D';
+ break;
+ }
+ }
+ n += (*s != 'H' && *s != 'S' && *s != 'C' && *s != 'P');
+ beg = -1;
+ } else if (beg < 0)
+ beg = i;
+
+ return n;
+}
+
+int TPrefixTreeRobotsTxtRulesHandler::FindRuleAll(const TPrefixTreeBotInfo& prefixBotInfo, const char neededType) {
+ static const char* all[] = {"*", "/", "*/", "/*", "*/*"};
+ for (int ruleNumber = prefixBotInfo.RulesSize - 1; ruleNumber >= 0; --ruleNumber) {
+ const char* curRule = prefixBotInfo.Rules.Get()[ruleNumber];
+ char ruleType = *(curRule - 1);
+
+ if (strlen(curRule) > 3)
+ break;
+ if (neededType != ruleType)
+ continue;
+
+ for (size_t i = 0; i < sizeof(all) / sizeof(char*); ++i)
+ if (strcmp(all[i], curRule) == 0)
+ return ruleNumber;
+ }
+ return -1;
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::HasDisallowRulePrevAllowAll(const TPrefixTreeBotInfo& prefixBotInfo, int ruleAllAllow) {
+ for (int ruleNumber = ruleAllAllow - 1; ruleNumber >= 0; --ruleNumber) {
+ const char* curRule = prefixBotInfo.Rules.Get()[ruleNumber];
+ char ruleType = *(curRule - 1);
+ if (tolower(ruleType) == 'd')
+ return true;
+ }
+ return false;
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::CheckAllowDisallowAll(const ui32 botId, const bool checkDisallow) {
+ CheckBotIdValidity(botId);
+
+ auto& botInfo = GetInfo(botId);
+
+ if (botInfo.RulesSize == 0)
+ return !checkDisallow;
+ if (botInfo.RulesPosition <= 0)
+ return 0;
+
+ if (checkDisallow)
+ return !botInfo.HasAllow && FindRuleAll(botInfo, 'D') >= 0;
+ int ruleAllAllow = FindRuleAll(botInfo, 'A');
+ if (ruleAllAllow == -1)
+ return !botInfo.HasDisallow;
+ return !HasDisallowRulePrevAllowAll(botInfo, ruleAllAllow);
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::SortRules(
+ TPrefixTreeBotInfo& prefixBotInfo,
+ size_t count,
+ const TArrayHolder<TRuleInfo>* ruleInfos) {
+ TVector<size_t> indexes(count);
+ for (size_t index = 0; index < count; ++index)
+ indexes[index] = index;
+
+ TRulesSortFunc sortFunc(ruleInfos);
+ std::sort(indexes.begin(), indexes.end(), sortFunc);
+
+ TArrayHolder<char*> workingCopy;
+ workingCopy.Reset(new char*[count]);
+
+ for (size_t index = 0; index < count; ++index)
+ workingCopy.Get()[index] = prefixBotInfo.Rules.Get()[index];
+ for (size_t index = 0; index < count; ++index)
+ prefixBotInfo.Rules.Get()[index] = workingCopy.Get()[indexes[index]];
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::SaveRulesToBuffer() {
+// sitemaps, clean-params and HostDirective were deleted from the prefix tree, so re-add them
+ for (const auto& sitemap: SiteMaps)
+ AddRule(robotstxtcfg::id_anybot, sitemap, 'S');
+ for (const auto& param : CleanParams)
+ AddRule(robotstxtcfg::id_anybot, param, 'P');
+ if (!HostDirective.empty())
+ AddRule(robotstxtcfg::id_anybot, HostDirective, 'H');
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::SaveRulesFromBuffer(const ui32 botId) {
+ CheckBotIdValidity(botId);
+
+ auto& botInfo = GetInfo(botId);
+
+ TArrayHolder<TRuleInfo> ruleInfos;
+
+ int n = TraceBuffer(botId, -1, nullptr), countPrefix = 0;
+ ruleInfos.Reset(new TRuleInfo[n]);
+ botInfo.RulesPosition = TraceBuffer(botId, n, &ruleInfos);
+ assert(botInfo.RulesPosition == n);
+
+ SortRules(botInfo, n, &ruleInfos);
+
+ botInfo.DisallowAll = CheckAllowDisallowAll(botId, true);
+ botInfo.AllowAll = CheckAllowDisallowAll(botId, false);
+
+ for (int i = 0; i < n; ++i)
+ countPrefix += !!isupper(*(botInfo.Rules.Get()[i] - 1));
+
+ botInfo.PrefixRules.Init(countPrefix);
+ botInfo.ComplexRules.Reset(new char**[botInfo.ComplexRulesSize = n - countPrefix]);
+ botInfo.ComplexRulesPosition = 0;
+
+ for (int i = 0; i < n; ++i) {
+ char* s = botInfo.Rules.Get()[i];
+ if (isupper(*(s - 1)))
+ botInfo.PrefixRules.Add(s, i);
+ else
+ botInfo.ComplexRules.Get()[botInfo.ComplexRulesPosition++] = &botInfo.Rules.Get()[i];
+ }
+ botInfo.PrefixRules.Compress();
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::AfterParse(const ui32 botId) {
+ CheckBotIdValidity(botId);
+
+ auto& botInfo = GetInfo(botId);
+
+ ResizeBuffer(botId, botInfo.BufferPosition);
+ SaveRulesFromBuffer(botId);
+
+ if (botInfo.RulesPosition == 0) {
+ AddRule(botId, "/", 'A');
+ }
+}
+
+TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeBotInfo& TPrefixTreeRobotsTxtRulesHandler::GetInfo(ui32 botId) {
+ Y_ENSURE(botId < robotstxtcfg::max_botid);
+ auto& res = BotIdToPrefixTreeBotInfo[botId];
+ if (!res) {
+ res = MakeHolder<TPrefixTreeBotInfo>();
+ }
+ return *res;
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::CheckRule(const char* s, const char* rule) {
+ const char* r = rule;
+ const char* s_end = s + strlen(s);
+ const char* r_end = r + strlen(r);
+ // assert( r && !strstr(r, "**") );
+ for (; *s; ++s) {
+ if ((s_end - s + 1) * 2 < (r_end - r))
+ return 0;
+ while (*r == '*')
+ ++r;
+
+ if (*s == *r) {
+ ++r;
+ } else {
+ while (r != rule && *r != '*')
+ --r;
+
+ if (*r != '*')
+ return 0;
+ if (*r == '*')
+ ++r;
+ if (*r == *s)
+ ++r;
+ }
+ }
+ return !*r || (!*(r + 1) && *r == '*');
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::IsFull(ui32 botId, size_t length) const {
+ Y_ENSURE(botId < robotstxtcfg::max_botid);
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[botId];
+ if (!botInfo) {
+ return false;
+ }
+
+ return (size_t(botInfo->RulesPosition) >= MaxRulesNumber) || (botInfo->BufferPosition + length + 300 > size_t(RobotsMaxSize));
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::IsFullTotal() const {
+ size_t allBotsRulesCount = 0;
+ size_t allBotsBufferSize = 0;
+
+ for (const auto& botInfo : BotIdToPrefixTreeBotInfo) {
+ if (botInfo) {
+ allBotsRulesCount += botInfo->RulesPosition;
+ allBotsBufferSize += botInfo->BufferPosition;
+ }
+ }
+
+ return (allBotsRulesCount >= MaxRulesNumber) || (allBotsBufferSize + 300 > size_t(RobotsMaxSize));
+}
+
+size_t TPrefixTreeRobotsTxtRulesHandler::GetPacked(const char*& data) const {
+ Y_STATIC_THREAD(TBuffer)
+ packedRepresentation;
+
+ // calculate the size needed for the packed data
+ size_t totalPackedSize = sizeof(ui32); // num of botids
+ ui32 numOfSupportedBots = 0;
+
+ for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (!IsBotIdSupported(botId)) {
+ continue;
+ }
+
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ // botId + packedDataSize + packedData
+ totalPackedSize += sizeof(ui32) + (botInfo ? botInfo->BufferPosition : sizeof(ui32));
+ ++numOfSupportedBots;
+ }
+
+ ((TBuffer&)packedRepresentation).Reserve(totalPackedSize);
+
+ // fill packed data
+ char* packedPtr = ((TBuffer&)packedRepresentation).Data();
+
+ *((ui32*)packedPtr) = numOfSupportedBots;
+ packedPtr += sizeof(ui32);
+
+ for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (!IsBotIdSupported(botId)) {
+ continue;
+ }
+
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ memcpy(packedPtr, &botId, sizeof(ui32));
+ packedPtr += sizeof(ui32);
+
+ if (botInfo) {
+ *((ui32*)botInfo->Buffer.Get()) = botInfo->BufferPosition;
+ memcpy(packedPtr, botInfo->Buffer.Get(), botInfo->BufferPosition);
+ packedPtr += botInfo->BufferPosition;
+ } else {
+ // In the absence of bot info we serialize only the size of its buffer, i.e. a single ui32 (4 bytes)
+ ui32 emptyBufferPosition = sizeof(ui32);
+ memcpy(packedPtr, &emptyBufferPosition, sizeof(ui32));
+ packedPtr += sizeof(ui32);
+ }
+ }
+
+ data = ((TBuffer&)packedRepresentation).Data();
+ return totalPackedSize;
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::LoadPacked(const char* botsData, const char* botsDataEnd) {
+ Clear();
+
+ if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ ui32 numOfBots = *((ui32*)botsData);
+ botsData += sizeof(ui32);
+
+ for (ui32 botIndex = 0; botIndex < numOfBots; ++botIndex) {
+ if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ ui32 botId = 0;
+ memcpy(&botId, botsData, sizeof(ui32));
+ botsData += sizeof(ui32);
+
+ // skip bot ids that are not supported for now
+ if (botId >= robotstxtcfg::max_botid || !IsBotIdSupported(botId)) {
+ if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ ui32 oneBotPackedSize = 0;
+ memcpy(&oneBotPackedSize, botsData, sizeof(ui32));
+ botsData += oneBotPackedSize;
+
+ continue;
+ }
+
+ //SupportedBotIds.insert(botId);
+
+ auto& botInfo = GetInfo(botId);
+
+ if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ static_assert(sizeof(botInfo.BufferSize) == sizeof(ui32), "BufferSize must be 4 bytes");
+ static_assert(sizeof(botInfo.BufferPosition) == sizeof(ui32), "BufferPosition must be 4 bytes");
+
+ memcpy(&botInfo.BufferSize, botsData, sizeof(ui32));
+ memcpy(&botInfo.BufferPosition, botsData, sizeof(ui32));
+
+ if (Y_UNLIKELY(botsDataEnd != nullptr && (botsData + botInfo.BufferSize) > botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ botInfo.Buffer.Reset(new char[botInfo.BufferSize]);
+ memcpy(botInfo.Buffer.Get(), botsData, botInfo.BufferSize);
+ SaveRulesFromBuffer(botId);
+
+ if (botInfo.BufferSize > (int)sizeof(ui32)) { // empty data means that there is no section for this bot
+ LoadedBotIds.insert(botId);
+ }
+
+ botsData += botInfo.BufferSize;
+ }
+
+ OptimizeSize();
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::Dump(const ui32 botId, FILE* dumpFile) {
+ if (!dumpFile)
+ dumpFile = stderr;
+ fprintf(dumpFile, "User-Agent: %s\n", robotstxtcfg::GetFullName(botId).data());
+ for (TRobotsTxtRulesIterator it = GetRulesIterator(botId); it.HasRule(); it.Next())
+ fprintf(dumpFile, "%s: %s\n", DirTypeToName(it.GetRuleType()), it.GetInitialRule().data());
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::Dump(const ui32 botId, IOutputStream& out) {
+ out << "User-Agent: " << robotstxtcfg::GetFullName(botId) << Endl;
+ for (TRobotsTxtRulesIterator it = GetRulesIterator(botId); it.HasRule(); it.Next())
+ out << DirTypeToName(it.GetRuleType()) << ": " << it.GetInitialRule() << Endl;
+}
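A sketch of the GetPacked/LoadPacked round trip implemented above; the function name and the use of DoDisallowAll to obtain a non-empty handler are illustrative only:

#include <library/cpp/robots_txt/robots_txt.h>

void PackedRoundTrip() {
    TPrefixTreeRobotsTxtRulesHandler src;
    src.DoDisallowAll(); // every supported bot now shares the anybot "Disallow: /" section

    const char* data = nullptr;
    const size_t size = src.GetPacked(data); // data points into a thread-local buffer reused between calls

    TPrefixTreeRobotsTxtRulesHandler dst;
    dst.LoadPacked(data, data + size); // rebuilds per-bot rule buffers and re-runs OptimizeSize()
}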
diff --git a/library/cpp/robots_txt/robots_txt.h b/library/cpp/robots_txt/robots_txt.h
new file mode 100644
index 0000000000..5ee48fb14f
--- /dev/null
+++ b/library/cpp/robots_txt/robots_txt.h
@@ -0,0 +1,605 @@
+#pragma once
+
+#include "constants.h"
+#include "robots_txt_parser.h"
+#include "prefix_tree.h"
+#include "robotstxtcfg.h"
+
+#include <util/generic/noncopyable.h>
+#include <util/generic/map.h>
+#include <util/generic/maybe.h>
+#include <util/generic/ptr.h>
+#include <util/generic/set.h>
+
+#include <array>
+#include <utility>
+
+
+enum EDirectiveType {
+ USER_AGENT = 1,
+ DISALLOW = 2,
+ ALLOW = 3,
+ HOST = 4,
+ SITEMAP = 5,
+ CRAWL_DELAY = 6,
+ CLEAN_PARAM = 7,
+ UNKNOWN = 9,
+};
+
+enum EFormatErrorType {
+ ERROR_RULE_NOT_SLASH = 1,
+ ERROR_ASTERISK_MULTI = 2,
+ ERROR_HOST_MULTI = 3,
+ ERROR_ROBOTS_HUGE = 4,
+ ERROR_RULE_BEFORE_USER_AGENT = 5,
+ ERROR_RULE_HUGE = 6,
+ ERROR_HOST_FORMAT = 7,
+ ERROR_TRASH = 8,
+ ERROR_SITEMAP_FORMAT = 9,
+ ERROR_CRAWL_DELAY_FORMAT = 10,
+ ERROR_CRAWL_DELAY_MULTI = 11,
+ ERROR_CLEAN_PARAM_FORMAT = 12,
+
+ WARNING_EMPTY_RULE = 30,
+ WARNING_SUSPECT_SYMBOL = 31,
+ WARNING_UNKNOWN_FIELD = 33,
+ WARNING_UPPER_REGISTER = 34,
+ WARNING_SITEMAP = 35,
+};
+
+class TRobotsTxtRulesIterator {
+private:
+ const char* Begin = nullptr;
+ const char* End = nullptr;
+
+public:
+ TRobotsTxtRulesIterator() = default;
+ TRobotsTxtRulesIterator(const char* begin, const char* end);
+ void Next();
+ bool HasRule() const;
+ const char* GetRule() const;
+ TString GetInitialRule() const; // unlike GetRule(), it neither omits trailing '$' nor adds redundant '*'
+ EDirectiveType GetRuleType() const;
+
+ static EDirectiveType CharToDirType(char ch);
+};
+
+class TRobotsTxtRulesHandlerBase {
+public:
+ typedef TVector<std::pair<EFormatErrorType, int>> TErrorVector;
+
+ TRobotsTxtRulesHandlerBase(
+ TBotIdSet supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot);
+
+ TRobotsTxtRulesHandlerBase(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot);
+
+ virtual ~TRobotsTxtRulesHandlerBase();
+
+ int GetCrawlDelay(ui32 botId, bool* realInfo = nullptr) const;
+ int GetMinCrawlDelay(int defaultCrawlDelay = -1) const;
+ bool IsHandlingErrors() const;
+ const TString& GetHostDirective() const;
+ const TVector<TString> GetSiteMaps() const;
+ const TVector<TString> GetCleanParams() const;
+ const TErrorVector& GetErrors() const;
+ TVector<int> GetAcceptedLines(ui32 botId = robotstxtcfg::id_yandexbot) const;
+
+ template <class THostHandler>
+ static int ParseRules(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, THostHandler* hostHandler, const char* host = nullptr);
+ static inline void ClearAllExceptCrossSection(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, ui32 botId);
+ static int CheckHost(const char* host);
+ static int CheckSitemapUrl(const char* url, const char* host, TString& modifiedUrl);
+ static int CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler);
+ static int CheckAndNormCleanParam(TString& s);
+ static int ParseCrawlDelay(const char* value, int& crawlDelay);
+ static EDirectiveType NameToDirType(const char* d);
+ static const char* DirTypeToName(EDirectiveType t);
+
+ void SetErrorsHandling(bool handleErrors);
+ void SetHostDirective(const char* hostDirective);
+ void SetCrawlDelay(ui32 botId, int crawlDelay);
+ void AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection);
+ void AddSiteMap(const char* sitemap);
+ void AddCleanParam(const char* cleanParam);
+ bool AddRuleWithErrorCheck(ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser);
+ int OnHost(ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler);
+
+ virtual void Clear();
+ virtual bool IsAllowAll(ui32 botId) const = 0;
+ virtual bool IsAllowAll() const = 0;
+ virtual bool IsDisallowAll(ui32 botId, bool useAny = true) const = 0;
+ virtual bool IsDisallowAll() const = 0;
+ virtual const char* IsDisallow(ui32 botId, const char* s, bool useAny = true) const = 0;
+ virtual const char* IsAllow(ui32 botId, const char* s) const = 0;
+ virtual TRobotsTxtRulesIterator GetRulesIterator(ui32 botId) const = 0;
+ virtual void Dump(ui32 botId, FILE* logFile) = 0;
+ virtual void Dump(ui32 botId, IOutputStream& out) = 0;
+ virtual bool Empty(ui32 botId) const = 0;
+ virtual void LoadPacked(const char* botsData, const char* botsDataEnd = nullptr) = 0;
+ virtual size_t GetPacked(const char*& data) const = 0;
+ virtual void AfterParse(ui32 botId) = 0;
+ virtual void DoAllowAll() = 0;
+ virtual void DoDisallowAll() = 0;
+ bool IsBotIdLoaded(ui32 botId) const;
+ bool IsBotIdSupported(ui32 botId) const;
+ ui32 GetNotOptimizedBotId(ui32 botId) const;
+ TMaybe<ui32> GetMappedBotId(ui32 botId, bool useAny = true) const;
+
+protected:
+ void CheckBotIdValidity(ui32 botId) const;
+ virtual bool OptimizeSize() = 0;
+
+private:
+ bool HandleErrors;
+
+protected:
+ struct TBotInfo {
+ int CrawlDelay;
+
+ TBotInfo()
+ : CrawlDelay(-1)
+ {
+ }
+ };
+
+ TBotIdSet LoadedBotIds;
+ TSet<TString> SiteMaps;
+ TSet<TString> CleanParams;
+ TString HostDirective;
+ TErrorVector Errors;
+ typedef std::pair<ui32, ui32> TBotIdAcceptedLine;
+ TVector<TBotIdAcceptedLine> AcceptedLines;
+ TVector<ui32> CrossSectionAcceptedLines;
+
+ TVector<TBotInfo> BotIdToInfo;
+ int CrawlDelay;
+ size_t RobotsMaxSize;
+ size_t MaxRulesNumber;
+ bool SaveDataForAnyBot;
+
+ TBotIdSet SupportedBotIds;
+ std::array<ui8, robotstxtcfg::max_botid> OptimizedBotIdToStoredBotId;
+
+ virtual bool IsFull(ui32 botId, size_t length) const = 0;
+ virtual bool IsFullTotal() const = 0;
+ virtual bool AddRule(ui32 botId, TStringBuf rule, char type) = 0;
+ //parts of ParseRules
+ inline static void CheckRobotsLines(TRobotsTxtRulesHandlerBase* rulesHandler, TVector<int>& nonRobotsLines);
+ inline static void CheckAsterisk(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber, bool& wasAsterisk);
+ inline static bool CheckWasUserAgent(TRobotsTxtRulesHandlerBase* rulesHandler, bool wasUserAgent, bool& ruleBeforeUserAgent, bool& wasRule, ui32 lineNumber);
+ inline static bool CheckRuleNotSlash(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber);
+ inline static bool CheckSupportedBots(const TBotIdSet& currentBotIds, TBotIdSet& wasRuleForBot, const TBotIdSet& isSupportedBot);
+ inline static bool CheckEmptyRule(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, EDirectiveType& type, ui32 lineNumber);
+ inline static bool ProcessSitemap(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, const char* value, const char* host);
+ inline static bool ProcessCleanParam(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, TString& value);
+ inline static bool AddRules(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const char* value,
+ char type,
+ const TBotIdSet& currentBotIds,
+ const TBotIdSet& isSupportedBot);
+
+ inline static bool ProcessCrawlDelay(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const TBotIdSet& currentBotIds,
+ const TBotIdSet& isSupportedBot,
+ const char* value);
+
+ inline static void ProcessUserAgent(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const TBotIdSet& currentBotIds,
+ TBotIdSet& wasRuleForBot,
+ TBotIdSet& isSupportedBot,
+ TVector<ui32>& botIdToMaxAppropriateUserAgentNameLength,
+ const char* value);
+
+ bool CheckRobot(
+ const char* userAgent,
+ TBotIdSet& botIds,
+ const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength = nullptr) const;
+
+ virtual void ClearInternal(ui32 botId);
+
+ void AddError(EFormatErrorType type, int line);
+
+ void ResetOptimized() noexcept;
+};
+
+class TPrefixTreeRobotsTxtRulesHandler: public TRobotsTxtRulesHandlerBase, TNonCopyable {
+private:
+ static const int INIT_BUFFER_SIZE = 1 << 6;
+
+ struct TRuleInfo {
+ size_t Len;
+ bool Allow;
+ };
+
+ bool IsFull(ui32 botId, size_t length) const override;
+ bool IsFullTotal() const override;
+ bool AddRule(ui32 botId, TStringBuf rule, char type) override;
+ const char* GetRule(ui32 botId, const char* s, char type) const;
+ void ResizeBuffer(ui32 botId, int newSize);
+ void SaveRulesFromBuffer(ui32 botId);
+ int TraceBuffer(ui32 botId, int countRules, const TArrayHolder<TRuleInfo>* ruleInfos);
+ bool CheckAllowDisallowAll(ui32 botId, bool checkDisallow);
+ void SaveRulesToBuffer();
+ int StrLenWithoutStars(const char* s);
+
+protected:
+ class TRulesSortFunc {
+ private:
+ const TArrayHolder<TRuleInfo>* RuleInfos;
+
+ public:
+ TRulesSortFunc(const TArrayHolder<TRuleInfo>* ruleInfos)
+ : RuleInfos(ruleInfos)
+ {
+ }
+ bool operator()(const size_t& lhs, const size_t& rhs) {
+ const TRuleInfo& left = (*RuleInfos).Get()[lhs];
+ const TRuleInfo& right = (*RuleInfos).Get()[rhs];
+ return (left.Len == right.Len) ? left.Allow && !right.Allow : left.Len > right.Len;
+ }
+ };
+
+ struct TPrefixTreeBotInfo {
+ bool DisallowAll = false;
+ bool AllowAll = false;
+ bool HasDisallow = false;
+ bool HasAllow = false;
+
+ TArrayHolder<char> Buffer{new char[INIT_BUFFER_SIZE]};
+ ui32 BufferPosition = sizeof(BufferPosition);
+ int BufferSize = INIT_BUFFER_SIZE;
+
+ TArrayHolder<char*> Rules = nullptr;
+ int RulesPosition = 0;
+ int RulesSize = 0;
+
+ TArrayHolder<char**> ComplexRules = nullptr;
+ int ComplexRulesPosition = 0;
+ int ComplexRulesSize = 0;
+
+ TPrefixTree PrefixRules {0};
+ };
+
+ std::array<THolder<TPrefixTreeBotInfo>, robotstxtcfg::max_botid> BotIdToPrefixTreeBotInfo;
+
+ TPrefixTreeBotInfo& GetInfo(ui32 botId);
+ static bool CheckRule(const char* s, const char* rule);
+ void ClearInternal(ui32 botId) override;
+ bool OptimizeSize() override;
+
+private:
+ void SortRules(TPrefixTreeBotInfo& prefixBotInfo, size_t count, const TArrayHolder<TRuleInfo>* ruleInfos);
+ bool HasDisallowRulePrevAllowAll(const TPrefixTreeBotInfo& prefixBotInfo, int ruleAllAllow);
+ int FindRuleAll(const TPrefixTreeBotInfo& prefixBotInfo, char neededType);
+
+public:
+ TPrefixTreeRobotsTxtRulesHandler(
+ TBotIdSet supportedBotIds = robotstxtcfg::defaultSupportedBotIds,
+ int robotsMaxSize = robots_max,
+ int maxRulesCount = -1,
+ bool saveDataForAnyBot = true);
+
+ TPrefixTreeRobotsTxtRulesHandler(
+ std::initializer_list<ui32> supportedBotIds,
+ int robotsMaxSize = robots_max,
+ int maxRulesCount = -1,
+ bool saveDataForAnyBot = true);
+
+ TPrefixTreeRobotsTxtRulesHandler(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize = robots_max,
+ int maxRulesCount = -1,
+ bool saveDataForAnyBot = true);
+
+ void Clear() override;
+ void AfterParse(ui32 botId) override;
+ bool IsAllowAll(ui32 botId) const override;
+ bool IsAllowAll() const override;
+ bool IsDisallowAll(ui32 botId, bool useAny = true) const override;
+ bool IsDisallowAll() const override;
+ const char* IsDisallow(ui32 botId, const char* s, bool useAny = true) const override;
+ const char* IsAllow(ui32 botId, const char* s) const override;
+ TRobotsTxtRulesIterator GetRulesIterator(ui32 botId) const override;
+ void DoAllowAll() override;
+ void DoDisallowAll() override;
+ bool Empty(ui32 botId) const override;
+
+ void LoadPacked(const char* botsData, const char* botsDataEnd = nullptr) override;
+ size_t GetPacked(const char*& data) const override;
+ void Dump(ui32 botId, FILE* logFile) override;
+ void Dump(ui32 botId, IOutputStream& out) override;
+ size_t GetMemorySize();
+};
+
+using TRobotsTxt = TPrefixTreeRobotsTxtRulesHandler;
+
+void TRobotsTxtRulesHandlerBase::ClearAllExceptCrossSection(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, ui32 botId) {
+ rulesHandler->ClearInternal(botId);
+ if (botId == robotstxtcfg::id_anybot) {
+ // sitemaps, clean-params and HostDirective were deleted from the prefix tree, so re-add them
+ for (const auto& sitemap : rulesHandler->SiteMaps) {
+ rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, sitemap, 'S', parser);
+ }
+ for (const auto& param : rulesHandler->CleanParams) {
+ rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, param, 'P', parser);
+ }
+ if (!rulesHandler->HostDirective.empty()) {
+ rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, rulesHandler->HostDirective, 'H', parser);
+ }
+ }
+}
+
+void TRobotsTxtRulesHandlerBase::CheckRobotsLines(TRobotsTxtRulesHandlerBase* rulesHandler, TVector<int>& nonRobotsLines) {
+ if (rulesHandler->IsHandlingErrors()) {
+ for (size_t i = 0; i < nonRobotsLines.size(); ++i)
+ rulesHandler->AddError(ERROR_TRASH, nonRobotsLines[i]);
+ nonRobotsLines.clear();
+ }
+}
+
+void TRobotsTxtRulesHandlerBase::CheckAsterisk(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber, bool& wasAsterisk) {
+ if (strcmp(value, "*") == 0) {
+ if (wasAsterisk)
+ rulesHandler->AddError(ERROR_ASTERISK_MULTI, lineNumber);
+ wasAsterisk = true;
+ }
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckWasUserAgent(TRobotsTxtRulesHandlerBase* rulesHandler, bool wasUserAgent, bool& ruleBeforeUserAgent, bool& wasRule, ui32 lineNumber) {
+ if (wasUserAgent) {
+ wasRule = true;
+ return false;
+ }
+ if (!ruleBeforeUserAgent) {
+ ruleBeforeUserAgent = true;
+ rulesHandler->AddError(ERROR_RULE_BEFORE_USER_AGENT, lineNumber);
+ }
+ return true;
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckRuleNotSlash(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber) {
+ if (*value && *value != '/' && *value != '*') {
+ rulesHandler->AddError(ERROR_RULE_NOT_SLASH, lineNumber);
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckSupportedBots(
+ const TBotIdSet& currentBotIds,
+ TBotIdSet& wasRuleForBot,
+ const TBotIdSet& isSupportedBot)
+{
+ bool hasAtLeastOneSupportedBot = false;
+ for (ui32 currentBotId : currentBotIds) {
+ wasRuleForBot.insert(currentBotId);
+ hasAtLeastOneSupportedBot = hasAtLeastOneSupportedBot || isSupportedBot.contains(currentBotId);
+ }
+ return hasAtLeastOneSupportedBot;
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckEmptyRule(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, EDirectiveType& type, ui32 lineNumber) {
+ if (value && strlen(value) == 0) {
+ rulesHandler->AddError(WARNING_EMPTY_RULE, lineNumber);
+ type = type == ALLOW ? DISALLOW : ALLOW;
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::AddRules(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const char* value,
+ char type,
+ const TBotIdSet& currentBotIds,
+ const TBotIdSet& isSupportedBot)
+{
+ for (ui32 currentBotId : currentBotIds) {
+ if (!isSupportedBot.contains(currentBotId))
+ continue;
+ if (!rulesHandler->AddRuleWithErrorCheck(currentBotId, value, type, parser))
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::ProcessSitemap(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, const char* value, const char* host) {
+ TString modifiedUrl;
+ if (!CheckSitemapUrl(value, host, modifiedUrl))
+ rulesHandler->AddError(ERROR_SITEMAP_FORMAT, parser.GetLineNumber());
+ else {
+ rulesHandler->AddSiteMap(modifiedUrl.data());
+ if (!rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, modifiedUrl.data(), 'S', parser))
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::ProcessCleanParam(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, TString& value) {
+ if (!CheckAndNormCleanParam(value))
+ rulesHandler->AddError(ERROR_CLEAN_PARAM_FORMAT, parser.GetLineNumber());
+ else {
+ rulesHandler->AddCleanParam(value.data());
+ if (!rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, value.data(), 'P', parser))
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::ProcessCrawlDelay(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const TBotIdSet& currentBotIds,
+ const TBotIdSet& isSupportedBot,
+ const char* value) {
+ for (ui32 currentBotId : currentBotIds) {
+ if (!isSupportedBot.contains(currentBotId))
+ continue;
+ if (rulesHandler->BotIdToInfo[currentBotId].CrawlDelay >= 0) {
+ rulesHandler->AddError(ERROR_CRAWL_DELAY_MULTI, parser.GetLineNumber());
+ break;
+ }
+ int crawlDelay = -1;
+ if (!ParseCrawlDelay(value, crawlDelay))
+ rulesHandler->AddError(ERROR_CRAWL_DELAY_FORMAT, parser.GetLineNumber());
+ else {
+ rulesHandler->SetCrawlDelay(currentBotId, crawlDelay);
+ if (!rulesHandler->AddRuleWithErrorCheck(currentBotId, value, 'C', parser))
+ return true;
+ }
+ }
+ return false;
+}
+
+void TRobotsTxtRulesHandlerBase::ProcessUserAgent(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const TBotIdSet& currentBotIds,
+ TBotIdSet& wasSupportedBot,
+ TBotIdSet& isSupportedBot,
+ TVector<ui32>& botIdToMaxAppropriateUserAgentNameLength,
+ const char* value)
+{
+ ui32 userAgentNameLength = (ui32)strlen(value);
+
+ for (ui32 currentBotId : currentBotIds) {
+ bool userAgentNameLonger = userAgentNameLength > botIdToMaxAppropriateUserAgentNameLength[currentBotId];
+ bool userAgentNameSame = userAgentNameLength == botIdToMaxAppropriateUserAgentNameLength[currentBotId];
+
+ if (!wasSupportedBot.contains(currentBotId) || userAgentNameLonger)
+ ClearAllExceptCrossSection(parser, rulesHandler, currentBotId);
+
+ wasSupportedBot.insert(currentBotId);
+ if (userAgentNameLonger || userAgentNameSame) {
+ isSupportedBot.insert(currentBotId); // Allow multiple blocks for the same user agent
+ }
+ botIdToMaxAppropriateUserAgentNameLength[currentBotId] = Max(userAgentNameLength, botIdToMaxAppropriateUserAgentNameLength[currentBotId]);
+ }
+}
+
+template <class THostHandler>
+int TRobotsTxtRulesHandlerBase::ParseRules(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, THostHandler* hostHandler, const char* host) {
+ rulesHandler->Clear();
+
+ TBotIdSet wasSupportedBot;
+ TBotIdSet wasRuleForBot;
+ bool wasAsterisk = false;
+ TVector<int> nonRobotsLines;
+ TVector<ui32> botIdToMaxAppropriateUserAgentNameLength(robotstxtcfg::max_botid, 0);
+ static char all[] = "/";
+ EDirectiveType prevType = USER_AGENT;
+ while (parser.HasRecord()) {
+ TRobotsTxtRulesRecord record = parser.NextRecord();
+ bool wasUserAgent = false;
+ bool isRobotsRecordUseful = false;
+ TBotIdSet isSupportedBot;
+ TBotIdSet currentBotIds;
+ TString field;
+ TString value;
+ bool ruleBeforeUserAgent = false;
+ int ret = 0;
+ bool wasRule = false;
+ bool wasBlank = false;
+ while (record.NextPair(field, value, isRobotsRecordUseful && rulesHandler->IsHandlingErrors(), nonRobotsLines, &wasBlank)) {
+ CheckRobotsLines(rulesHandler, nonRobotsLines);
+ EDirectiveType type = NameToDirType(field.data());
+ EDirectiveType typeBeforeChange = type;
+
+ if ((prevType != type || wasBlank) && type == USER_AGENT) {
+ currentBotIds.clear();
+ }
+ prevType = type;
+
+ switch (type) {
+ case USER_AGENT:
+ if (wasUserAgent && wasRule) {
+ wasRule = false;
+ currentBotIds.clear();
+ isSupportedBot.clear();
+ }
+ wasUserAgent = true;
+ value.to_lower();
+ CheckAsterisk(rulesHandler, value.data(), parser.GetLineNumber(), wasAsterisk);
+ isRobotsRecordUseful = rulesHandler->CheckRobot(value.data(), currentBotIds, &botIdToMaxAppropriateUserAgentNameLength);
+ if (isRobotsRecordUseful)
+ ProcessUserAgent(rulesHandler, parser, currentBotIds, wasSupportedBot, isSupportedBot, botIdToMaxAppropriateUserAgentNameLength, value.data());
+ break;
+
+ case DISALLOW:
+ case ALLOW:
+ if (CheckWasUserAgent(rulesHandler, wasUserAgent, ruleBeforeUserAgent, wasRule, parser.GetLineNumber()))
+ break;
+ if (CheckRuleNotSlash(rulesHandler, value.data(), parser.GetLineNumber()))
+ break;
+ CheckRule(value.data(), parser.GetLineNumber(), rulesHandler);
+ if (!CheckSupportedBots(currentBotIds, wasRuleForBot, isSupportedBot)) {
+ break;
+ }
+ if (CheckEmptyRule(rulesHandler, value.data(), type, parser.GetLineNumber())) {
+ value = all;
+ if (typeBeforeChange == ALLOW)
+ continue;
+ }
+
+ if (AddRules(rulesHandler, parser, value.data(), type == ALLOW ? 'A' : 'D', currentBotIds, isSupportedBot))
+ return 2;
+ break;
+
+ case HOST:
+ value.to_lower();
+ ret = hostHandler->OnHost(robotstxtcfg::id_anybot, parser, value.data(), rulesHandler);
+ if (ret)
+ return ret;
+ break;
+
+ case SITEMAP:
+ if (ProcessSitemap(rulesHandler, parser, value.data(), host))
+ return 2;
+ break;
+
+ case CLEAN_PARAM:
+ if (ProcessCleanParam(rulesHandler, parser, value))
+ return 2;
+ break;
+
+ case CRAWL_DELAY:
+ if (ProcessCrawlDelay(rulesHandler, parser, currentBotIds, isSupportedBot, value.data()))
+ return 2;
+ break;
+
+ default:
+ rulesHandler->AddError(WARNING_UNKNOWN_FIELD, parser.GetLineNumber());
+ break;
+ }
+ bool isCrossSection = type == SITEMAP || type == HOST || type == CLEAN_PARAM;
+ if (rulesHandler->IsHandlingErrors() && (isRobotsRecordUseful || isCrossSection))
+ rulesHandler->AddAcceptedLine(parser.GetLineNumber(), currentBotIds, isCrossSection);
+ }
+ }
+
+ for (auto botId : wasSupportedBot) {
+ rulesHandler->LoadedBotIds.insert(botId);
+ if (rulesHandler->IsBotIdSupported(botId))
+ rulesHandler->AfterParse(botId);
+ }
+
+ if (!rulesHandler->OptimizeSize()) {
+ return 2;
+ }
+
+ return 1;
+}
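ParseRules above is the usual entry point; a hedged sketch of feeding it a robots.txt body (the helper name and sample arguments are illustrative; the handler is passed as its own host handler since the base class provides OnHost):

#include <library/cpp/robots_txt/robots_txt.h>

#include <util/stream/str.h>

bool IsDisallowedForYandex(const TString& robotsTxtBody, const char* path) {
    TStringInput in(robotsTxtBody);
    TRobotsTxtParser parser(in);

    TRobotsTxt rules; // alias for TPrefixTreeRobotsTxtRulesHandler, defaults to the main Yandex bot
    // Returns 2 when internal size limits were exceeded, 1 otherwise.
    TRobotsTxtRulesHandlerBase::ParseRules(parser, &rules, &rules);

    // IsDisallow returns the matching rule or nullptr; useAny = true falls back to the "*" section.
    return rules.IsDisallow(robotstxtcfg::id_yandexbot, path, /*useAny=*/true) != nullptr;
}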
diff --git a/library/cpp/robots_txt/robots_txt_parser.cpp b/library/cpp/robots_txt/robots_txt_parser.cpp
new file mode 100644
index 0000000000..8e2fe6073d
--- /dev/null
+++ b/library/cpp/robots_txt/robots_txt_parser.cpp
@@ -0,0 +1,116 @@
+#include "robots_txt_parser.h"
+#include <util/generic/string.h>
+#include <util/stream/output.h>
+
+TRobotsTxtParser::TRobotsTxtParser(IInputStream& inputStream)
+ : InputStream(inputStream)
+ , LineNumber(0)
+ , IsLastSymbolCR(false)
+{
+}
+
+int TRobotsTxtParser::GetLineNumber() {
+ return LineNumber;
+}
+
+const char* TRobotsTxtParser::ReadLine() {
+ Line = "";
+ char c;
+
+ if (IsLastSymbolCR) {
+ if (!InputStream.ReadChar(c))
+ return nullptr;
+ if (c != '\n')
+ Line.append(c);
+ }
+
+ bool hasMoreSymbols;
+ while (hasMoreSymbols = InputStream.ReadChar(c)) {
+ if (c == '\r') {
+ IsLastSymbolCR = true;
+ break;
+ } else {
+ IsLastSymbolCR = false;
+ if (c == '\n')
+ break;
+ Line.append(c);
+ }
+ }
+ if (!hasMoreSymbols && Line.empty())
+ return nullptr;
+
+ // BOM UTF-8: EF BB BF
+ if (0 == LineNumber && Line.size() >= 3 && Line[0] == '\xEF' && Line[1] == '\xBB' && Line[2] == '\xBF')
+ Line = Line.substr(3, Line.size() - 3);
+
+ ++LineNumber;
+ int i = Line.find('#');
+ if (i == 0)
+ Line = "";
+ else if (i > 0)
+ Line = Line.substr(0, i);
+ return Line.data();
+}
+
+bool TRobotsTxtParser::IsBlankLine(const char* s) {
+ for (const char* p = s; *p; ++p)
+ if (!isspace(*p))
+ return 0;
+ return 1;
+}
+
+char* TRobotsTxtParser::Trim(char* s) {
+ while (isspace(*s))
+ ++s;
+ char* p = s + strlen(s) - 1;
+ while (s < p && isspace(*p))
+ --p;
+ *(p + 1) = 0;
+ return s;
+}
+
+inline bool TRobotsTxtParser::IsRobotsLine(const char* s) {
+ return strchr(s, ':');
+}
+
+bool TRobotsTxtParser::HasRecord() {
+ while (!IsRobotsLine(Line.data()))
+ if (!ReadLine())
+ return 0;
+ return 1;
+}
+
+TRobotsTxtRulesRecord TRobotsTxtParser::NextRecord() {
+ return TRobotsTxtRulesRecord(*this);
+}
+
+TRobotsTxtRulesRecord::TRobotsTxtRulesRecord(TRobotsTxtParser& parser)
+ : Parser(parser)
+{
+}
+
+bool TRobotsTxtRulesRecord::NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank) {
+ if (wasBlank) {
+ *wasBlank = false;
+ }
+ while (!Parser.IsRobotsLine(Parser.Line.data())) {
+ if (!Parser.ReadLine())
+ return 0;
+ if (Parser.IsBlankLine(Parser.Line.data())) {
+ if (wasBlank) {
+ *wasBlank = true;
+ }
+ continue;
+ }
+ if (handleErrors && !Parser.IsRobotsLine(Parser.Line.data()))
+ nonRobotsLines.push_back(Parser.GetLineNumber());
+ }
+
+ char* s = strchr(Parser.Line.begin(), ':');
+ *s = 0;
+ char* p = s + 1;
+
+ field = TRobotsTxtParser::Trim(strlwr(Parser.Line.begin()));
+ value = TRobotsTxtParser::Trim(p);
+ return 1;
+}
diff --git a/library/cpp/robots_txt/robots_txt_parser.h b/library/cpp/robots_txt/robots_txt_parser.h
new file mode 100644
index 0000000000..8032d0d20b
--- /dev/null
+++ b/library/cpp/robots_txt/robots_txt_parser.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <algorithm>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/stream/input.h>
+
+class TRobotsTxtParser;
+
+class TRobotsTxtRulesRecord {
+private:
+ TRobotsTxtParser& Parser;
+
+public:
+ TRobotsTxtRulesRecord(TRobotsTxtParser& parser);
+ bool NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank = nullptr);
+};
+
+class TRobotsTxtParser {
+ friend class TRobotsTxtRulesRecord;
+
+private:
+ IInputStream& InputStream;
+ TString Line;
+ int LineNumber;
+ bool IsLastSymbolCR;
+
+ const char* ReadLine();
+ static bool IsBlankLine(const char*);
+ static bool IsRobotsLine(const char*);
+
+public:
+ static char* Trim(char*);
+ TRobotsTxtParser(IInputStream& inputStream);
+ bool HasRecord();
+ TRobotsTxtRulesRecord NextRecord();
+ int GetLineNumber();
+};
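The parser can also be driven directly; a sketch of the HasRecord/NextPair loop, mirroring the structure used by ParseRules (the dump function itself is illustrative):

#include <library/cpp/robots_txt/robots_txt_parser.h>

#include <util/stream/output.h>
#include <util/stream/str.h>

void DumpRobotsTxtPairs(const TString& robotsTxtBody) {
    TStringInput in(robotsTxtBody);
    TRobotsTxtParser parser(in);

    TString field, value;
    TVector<int> nonRobotsLines; // filled with offending line numbers when handleErrors is true
    while (parser.HasRecord()) {
        TRobotsTxtRulesRecord record = parser.NextRecord();
        bool wasBlank = false;
        // NextPair lowercases the field name and trims whitespace around the ':'
        while (record.NextPair(field, value, /*handleErrors=*/true, nonRobotsLines, &wasBlank)) {
            Cout << parser.GetLineNumber() << ": " << field << " = " << value << Endl;
        }
    }
}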
diff --git a/library/cpp/robots_txt/robotstxtcfg.h b/library/cpp/robots_txt/robotstxtcfg.h
new file mode 100644
index 0000000000..5ca1682a0c
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#include <library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h>
diff --git a/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp
new file mode 100644
index 0000000000..aec668582c
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp
@@ -0,0 +1,2 @@
+#include "bot_id_set.h"
+// header compile test
diff --git a/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h
new file mode 100644
index 0000000000..08aaa68a50
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include "user_agents.h"
+
+#include <bitset>
+
+
+/// Simple bitset-based set for bot ids, meant to optimize memory and lookups
+class TBotIdSet
+{
+public:
+ using TData = std::bitset<robotstxtcfg::max_botid>;
+
+ constexpr TBotIdSet() noexcept = default;
+ constexpr TBotIdSet(const TBotIdSet&) noexcept = default;
+ constexpr TBotIdSet(TBotIdSet&&) noexcept = default;
+ constexpr TBotIdSet& operator = (const TBotIdSet&) noexcept = default;
+ constexpr TBotIdSet& operator = (TBotIdSet&&) noexcept = default;
+
+ TBotIdSet(std::initializer_list<ui32> botIds) {
+ for (auto id : botIds) {
+ insert(id);
+ }
+ }
+
+ static TBotIdSet All() noexcept {
+ TBotIdSet res;
+ res.Bots.set();
+ return res;
+ }
+
+ constexpr bool contains(ui32 botId) const noexcept {
+ return (botId < Bots.size()) && Bots[botId];
+ }
+
+ bool insert(ui32 botId) noexcept {
+ if (botId >= Bots.size() || Bots[botId]) {
+ return false;
+ }
+ Bots[botId] = true;
+ return true;
+ }
+
+ bool remove(ui32 botId) noexcept {
+ if (botId >= Bots.size() || !Bots[botId]) {
+ return false;
+ }
+ Bots[botId] = false;
+ return true;
+ }
+
+ void clear() noexcept {
+ Bots.reset();
+ }
+
+ size_t size() const noexcept {
+ return Bots.count();
+ }
+
+ bool empty() const noexcept {
+ return Bots.none();
+ }
+
+ bool operator==(const TBotIdSet& rhs) const noexcept = default;
+
+ TBotIdSet operator&(TBotIdSet rhs) const noexcept {
+ rhs.Bots &= Bots;
+ return rhs;
+ }
+
+ TBotIdSet operator|(TBotIdSet rhs) const noexcept {
+ rhs.Bots |= Bots;
+ return rhs;
+ }
+
+ TBotIdSet operator~() const noexcept {
+ TBotIdSet result;
+ result.Bots = ~Bots;
+ return result;
+ }
+
+ class iterator
+ {
+ public:
+ auto operator * () const noexcept {
+ return BotId;
+ }
+
+ iterator& operator ++ () noexcept {
+ // advance to the next set bit without indexing past the end of the bitset
+ ++BotId;
+ while (BotId < Bots.size() && !Bots[BotId]) {
+ ++BotId;
+ }
+ return *this;
+ }
+
+ bool operator == (const iterator& rhs) const noexcept {
+ return (&Bots == &rhs.Bots) && (BotId == rhs.BotId);
+ }
+
+ bool operator != (const iterator& rhs) const noexcept {
+ return !(*this == rhs);
+ }
+
+ private:
+ friend class TBotIdSet;
+ iterator(const TData& bots, ui32 botId)
+ : Bots(bots)
+ , BotId(botId)
+ {
+ while (BotId < Bots.size() && !Bots[BotId]) {
+ ++BotId;
+ }
+ }
+
+ private:
+ const TData& Bots;
+ ui32 BotId;
+ };
+
+ iterator begin() const noexcept {
+ return {Bots, robotstxtcfg::id_anybot};
+ }
+
+ iterator end() const noexcept {
+ return {Bots, robotstxtcfg::max_botid};
+ }
+
+private:
+ TData Bots {};
+};
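A small sketch of TBotIdSet used in isolation (the chosen bot ids are arbitrary):

#include <library/cpp/robots_txt/robotstxtcfg/bot_id_set.h>

#include <util/stream/output.h>

void BotIdSetExample() {
    TBotIdSet bots = {robotstxtcfg::id_yandexbot, robotstxtcfg::id_googlebot};
    bots.insert(robotstxtcfg::id_msnbot);
    bots.remove(robotstxtcfg::id_googlebot);

    // Storage is a single std::bitset<max_botid>; the operators return new sets.
    Cout << "selected: " << bots.size() << ", complement: " << (~bots).size() << Endl;

    for (auto botId : bots) { // visits set bits in increasing id order
        Cout << botId << Endl;
    }
}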
diff --git a/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp
new file mode 100644
index 0000000000..c5652b81c5
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp
@@ -0,0 +1,2 @@
+#include "robotstxtcfg.h"
+// header compile test
diff --git a/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h
new file mode 100644
index 0000000000..2cf9430d7c
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "bot_id_set.h"
+
+
+namespace robotstxtcfg {
+
+static const TBotIdSet defaultSupportedBotIds = {id_defbot};
+static const TBotIdSet allSupportedBotIds = TBotIdSet::All();
+
+} // namespace robotstxtcfg
diff --git a/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp b/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp
new file mode 100644
index 0000000000..60b353a427
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp
@@ -0,0 +1,2 @@
+#include "user_agents.h"
+// header compile test
diff --git a/library/cpp/robots_txt/robotstxtcfg/user_agents.h b/library/cpp/robots_txt/robotstxtcfg/user_agents.h
new file mode 100644
index 0000000000..a56e5b66f4
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/user_agents.h
@@ -0,0 +1,292 @@
+#pragma once
+
+#include <library/cpp/case_insensitive_string/case_insensitive_string.h>
+
+
+namespace robotstxtcfg {
+ // robots.txt agents and identifiers
+
+ enum EBots : ui32 {
+ id_anybot = 0,
+ id_yandexbot = 1,
+ id_yandexmediabot = 2,
+ id_yandeximagesbot = 3,
+ id_googlebot = 4,
+ id_yandexbotmirr = 5,
+ id_yahooslurp = 6,
+ id_msnbot = 7,
+ id_yandexcatalogbot = 8,
+ id_yandexdirectbot = 9,
+ id_yandexblogsbot = 10,
+ id_yandexnewsbot = 11,
+ id_yandexpagechk = 12,
+ id_yandexmetrikabot = 13,
+ id_yandexbrowser = 14,
+ id_yandexmarketbot = 15,
+ id_yandexcalendarbot = 16,
+ id_yandexwebmasterbot = 17,
+ id_yandexvideobot = 18,
+ id_yandeximageresizerbot = 19,
+ id_yandexadnetbot = 20,
+ id_yandexpartnerbot = 21,
+ id_yandexdirectdbot = 22,
+ id_yandextravelbot = 23,
+ id_yandexmobilebot = 24,
+ id_yandexrcabot = 25,
+ id_yandexdirectdynbot = 26,
+ id_yandexmobilebot_ed = 27,
+ id_yandexaccessibilitybot = 28,
+ id_baidubot = 29,
+ id_yandexscreenshotbot = 30,
+ id_yandexmetrikayabs = 31,
+ id_yandexvideoparserbot = 32,
+ id_yandexnewsbot4 = 33,
+ id_yandexmarketbot2 = 34,
+ id_yandexmedianabot = 35,
+ id_yandexsearchshopbot = 36,
+ id_yandexontodbbot = 37,
+ id_yandexontodbapibot = 38,
+ id_yandexampbot = 39,
+ id_yandexvideohosting = 40,
+ id_yandexmediaselling = 41,
+ id_yandexverticals = 42,
+ id_yandexturbobot = 43,
+ id_yandexzenbot = 44,
+ id_yandextrackerbot = 45,
+ id_yandexmetrikabot4 = 46,
+ id_yandexmobilescreenshotbot = 47,
+ id_yandexfaviconsbot = 48,
+ max_botid
+ };
+
+ static const ui32 id_defbot = id_yandexbot;
+
+ struct TBotInfo {
+ TCaseInsensitiveStringBuf ReqPrefix;
+ TCaseInsensitiveStringBuf FullName;
+ TStringBuf FromField = {};
+ TStringBuf UserAgent = {};
+ TStringBuf RotorUserAgent = {};
+ bool ExplicitDisallow = false;
+ };
+
+ static constexpr TStringBuf UserAgentFrom("support@search.yandex.ru");
+
+ static constexpr TBotInfo BotInfoArr[] = {
+ {"*", "*"},
+ {"Yandex", "YandexBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexMedia/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexImages/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Google", "GoogleBot"},
+ {"Yandex", "YandexBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Slurp", "Slurp"},
+ {"msn", "msnbot"},
+ {"Yandex", "YandexCatalog/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"YaDirectFetcher", "YaDirectFetcher/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+
+ {"Yandex", "YandexBlogs/0.99", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexNews/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexNews/3.0; robot; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexNews/3.0; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexPagechecker/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexPagechecker/2.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexPagechecker/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexMetrika/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexBrowser/1.0", UserAgentFrom,
+ "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5402 Chrome/19.0.1084.5409 Safari/536.5",
+ "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5402 Chrome/19.0.1084.5409 Safari/536.5",
+ false},
+ {"Yandex", "YandexMarket/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"YandexCalendar", "YandexCalendar/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexCalendar/1.0 +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexCalendar/1.0 +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"Yandex", "YandexWebmaster/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexVideo/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexImageResizer/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+
+ {"YandexDirect", "YandexDirect/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexPartner", "YandexPartner/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexPartner/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexPartner/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YaDirectFetcher", "YaDirectFetcher/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"Yandex", "YandexTravel/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexTravel/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexTravel/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"Yandex", "YandexBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ false},
+ {"YandexRCA", "YandexRCA/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexRCA/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexRCA/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexDirectDyn", "YandexDirectDyn/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexMobileBot", "YandexMobileBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)",
+ true},
+ {"YandexAccessibilityBot", "YandexAccessibilityBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"Baidu", "Baiduspider"},
+
+ {"YandexScreenshotBot", "YandexScreenshotBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexScreenshotBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexScreenshotBot/3.0; +http://yandex.com/bots)",
+ true},
+ {"YandexMetrika", "YandexMetrika/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots yabs01)",
+ "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots yabs01) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexVideoParser", "YandexVideoParser/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexVideoParser/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexVideoParser/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"Yandex", "YandexNews/4.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexNews/4.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexNews/4.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexMarket", "YandexMarket/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMarket/2.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMarket/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexMedianaBot", "YandexMedianaBot/1.0", UserAgentFrom,
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexMedianaBot/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexMedianaBot/1.0; +http://yandex.com/bots)",
+ true},
+ {"YandexSearchShop", "YandexSearchShop/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexSearchShop/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexSearchShop/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"Yandex", "YandexOntoDB/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexOntoDB/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexOntoDB/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ false},
+ {"YandexOntoDBAPI", "YandexOntoDBAPI/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexOntoDBAPI/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexOntoDBAPI/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"Yandex-AMPHTML", "Yandex-AMPHTML", UserAgentFrom,
+ "Mozilla/5.0 (compatible; Yandex-AMPHTML; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; Yandex-AMPHTML; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+
+ {"YandexVideoHosting", "YandexVideoHosting/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexVideoHosting/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexVideoHosting/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexMediaSelling", "YandexMediaSelling/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMediaSelling/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMediaSelling/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexVerticals", "YandexVerticals/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexVerticals/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexVerticals/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexTurbo", "YandexTurbo/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexTurbo/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexTurbo/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexZenRss", "YandexZenRss/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexZenRss/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexZenRss/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexTracker", "YandexTracker/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexTracker/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexTracker/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexMetrika", "YandexMetrika/4.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMetrika/4.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMetrika/4.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true},
+ {"YandexMobileScreenShotBot", "YandexMobileScreenShotBot/1.0", UserAgentFrom,
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/11.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/11.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ true},
+ {"YandexFavicons", "YandexFavicons/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268",
+ true}};
+
+ static_assert(std::size(BotInfoArr) == max_botid);
+
+ constexpr auto GetReqPrefix(ui32 botId) {
+ return BotInfoArr[botId].ReqPrefix;
+ }
+
+ constexpr auto GetFullName(ui32 botId) {
+ return BotInfoArr[botId].FullName;
+ }
+
+ constexpr auto GetFromField(ui32 botId) {
+ return BotInfoArr[botId].FromField;
+ }
+
+ constexpr auto GetUserAgent(ui32 botId) {
+ return BotInfoArr[botId].UserAgent;
+ }
+
+ constexpr auto GetRotorUserAgent(ui32 botId) {
+ return BotInfoArr[botId].RotorUserAgent;
+ }
+
+ constexpr bool IsExplicitDisallow(ui32 botId) {
+ return BotInfoArr[botId].ExplicitDisallow;
+ }
+
+ constexpr bool IsYandexBotId(ui32 botId) {
+ return !BotInfoArr[botId].UserAgent.empty();
+ }
+
+} // namespace robotstxtcfg
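A short sketch of how the accessors above are consulted; everything used here is defined in this header or in util (Cout and Endl come from util/stream/output.h):

    #include <library/cpp/robots_txt/robotstxtcfg/user_agents.h>

    #include <util/stream/output.h>

    int main() {
        using namespace robotstxtcfg;
        // The accessors are thin constexpr reads of BotInfoArr, indexed by the EBots values.
        Cout << GetUserAgent(id_yandexbot) << Endl;  // UA string presented while crawling
        Cout << GetFromField(id_yandexbot) << Endl;  // support@search.yandex.ru
        // Entries without a UserAgent (e.g. GoogleBot, Slurp) are exactly the non-Yandex bots.
        return IsYandexBotId(id_googlebot) ? 1 : 0;
    }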
diff --git a/library/cpp/robots_txt/rules_handler.cpp b/library/cpp/robots_txt/rules_handler.cpp
new file mode 100644
index 0000000000..14f6810948
--- /dev/null
+++ b/library/cpp/robots_txt/rules_handler.cpp
@@ -0,0 +1,514 @@
+#include "robots_txt.h"
+#include "constants.h"
+
+#include <library/cpp/uri/http_url.h>
+#include <library/cpp/charset/ci_string.h>
+#include <library/cpp/string_utils/url/url.h>
+#include <util/system/maxlen.h>
+#include <util/generic/yexception.h>
+#include <util/generic/algorithm.h>
+
+
+namespace {
+
+TBotIdSet ConvertBotIdSet(const TSet<ui32>& botIds) noexcept {
+ TBotIdSet result;
+ for (auto id : botIds) {
+ result.insert(id);
+ }
+ return result;
+}
+
+} // namespace
+
+TRobotsTxtRulesIterator::TRobotsTxtRulesIterator(const char* begin, const char* end)
+ : Begin(begin)
+ , End(end)
+{
+}
+
+void TRobotsTxtRulesIterator::Next() {
+ while (Begin < End && *Begin)
+ ++Begin;
+ while (Begin < End && !isalpha(*Begin))
+ ++Begin;
+}
+
+bool TRobotsTxtRulesIterator::HasRule() const {
+ return Begin < End;
+}
+
+const char* TRobotsTxtRulesIterator::GetRule() const {
+ return Begin + 1;
+}
+
+TString TRobotsTxtRulesIterator::GetInitialRule() const {
+ auto begin = Begin + 1;
+ TStringBuf rule(begin, strlen(begin));
+
+ switch (*Begin) {
+ case 'a':
+ case 'd':
+ return rule.EndsWith('*') ? TString(rule.Chop(1)) : TString::Join(rule, '$');
+ default:
+ return TString(rule);
+ }
+}
+
+EDirectiveType TRobotsTxtRulesIterator::GetRuleType() const {
+ return CharToDirType(*Begin);
+}
+
+EDirectiveType TRobotsTxtRulesIterator::CharToDirType(char ch) {
+ switch (toupper(ch)) {
+ case 'A':
+ return ALLOW;
+ case 'C':
+ return CRAWL_DELAY;
+ case 'D':
+ return DISALLOW;
+ case 'H':
+ return HOST;
+ case 'P':
+ return CLEAN_PARAM;
+ case 'S':
+ return SITEMAP;
+ }
+ return UNKNOWN;
+}
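The iterator above walks a packed buffer where each entry is one type character ('a', 'd', 'h', 's', 'p', 'c') followed by the rule text. A consumer sketch; obtaining the iterator is the job of the rules handler declared in robots_txt.h, which is assumed here:

    #include <library/cpp/robots_txt/robots_txt.h>

    #include <util/generic/string.h>
    #include <util/generic/vector.h>

    // Collects disallow rules from an already-constructed iterator.
    TVector<TString> CollectDisallows(TRobotsTxtRulesIterator it) {
        TVector<TString> result;
        for (; it.HasRule(); it.Next()) {
            if (it.GetRuleType() == DISALLOW) {
                // GetInitialRule() restores the source form: "/private" -> "/private$",
                // "/tmp/*" -> "/tmp/".
                result.push_back(it.GetInitialRule());
            }
        }
        return result;
    }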
+
+TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase(
+ TBotIdSet supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : HandleErrors(false)
+ , SiteMaps()
+ , CleanParams()
+ , HostDirective("")
+ , Errors()
+ , AcceptedLines()
+ , CrossSectionAcceptedLines()
+ , BotIdToInfo(robotstxtcfg::max_botid)
+ , RobotsMaxSize(robotsMaxSize)
+ , MaxRulesNumber(maxRulesNumber)
+ , SaveDataForAnyBot(saveDataForAnyBot)
+ , SupportedBotIds(supportedBotIds)
+{
+ Y_ENSURE(!supportedBotIds.empty());
+
+ if (RobotsMaxSize <= 0)
+ RobotsMaxSize = robots_max;
+ if (MaxRulesNumber <= 0)
+ MaxRulesNumber = max_rules_count;
+
+ ResetOptimized();
+}
+
+TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(ConvertBotIdSet(supportedBotIds), robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+TRobotsTxtRulesHandlerBase::~TRobotsTxtRulesHandlerBase() = default;
+
+void TRobotsTxtRulesHandlerBase::CheckBotIdValidity(const ui32 botId) const {
+ if (botId >= robotstxtcfg::max_botid || !IsBotIdSupported(botId)) {
+ ythrow yexception() << "robots.txt parser requested for invalid or unsupported botId = " << botId << Endl;
+ }
+}
+
+int TRobotsTxtRulesHandlerBase::GetCrawlDelay(const ui32 botId, bool* realInfo) const {
+ const auto id = GetMappedBotId(botId, false);
+ if (realInfo)
+ *realInfo = bool(id);
+ return BotIdToInfo[id.GetOrElse(robotstxtcfg::id_anybot)].CrawlDelay;
+}
+
+int TRobotsTxtRulesHandlerBase::GetMinCrawlDelay(int defaultCrawlDelay) const {
+ int res = INT_MAX;
+ bool useDefault = false;
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsDisallowAll(botId)) {
+ bool realInfo;
+ int curCrawlDelay = GetCrawlDelay(botId, &realInfo);
+ if (realInfo) {
+ if (curCrawlDelay == -1) {
+ useDefault = true;
+ } else {
+ res = Min(res, curCrawlDelay);
+ }
+ }
+ }
+ }
+
+ if (useDefault && defaultCrawlDelay < res) {
+ return -1;
+ }
+
+ if (res == INT_MAX) {
+ res = GetCrawlDelay(robotstxtcfg::id_anybot);
+ }
+
+ return res;
+}
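A hedged usage sketch of the convention above; rulesHandler stands for any concrete handler built on TRobotsTxtRulesHandlerBase. A result of -1 tells the caller to keep its own default, anything else is the smallest Crawl-delay stored for the supported Yandex bots (in the units produced by ParseCrawlDelay below, i.e. milliseconds):

    int delay = rulesHandler.GetMinCrawlDelay(/*defaultCrawlDelay=*/2000);
    if (delay == -1) {
        // Some bot section was loaded without a Crawl-delay, and no explicit value
        // beats the caller's default, so the default stays in force.
        delay = 2000;
    }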
+
+void TRobotsTxtRulesHandlerBase::SetCrawlDelay(const ui32 botId, int crawlDelay) {
+ CheckBotIdValidity(botId);
+ BotIdToInfo[botId].CrawlDelay = crawlDelay;
+}
+
+const TVector<TString> TRobotsTxtRulesHandlerBase::GetSiteMaps() const {
+ return TVector<TString>(SiteMaps.begin(), SiteMaps.end());
+}
+
+void TRobotsTxtRulesHandlerBase::AddSiteMap(const char* sitemap) {
+ SiteMaps.insert(sitemap);
+}
+
+const TVector<TString> TRobotsTxtRulesHandlerBase::GetCleanParams() const {
+ return TVector<TString>(CleanParams.begin(), CleanParams.end());
+}
+
+void TRobotsTxtRulesHandlerBase::AddCleanParam(const char* cleanParam) {
+ CleanParams.insert(cleanParam);
+}
+
+const TString& TRobotsTxtRulesHandlerBase::GetHostDirective() const {
+ return HostDirective;
+}
+
+void TRobotsTxtRulesHandlerBase::SetHostDirective(const char* hostDirective) {
+ HostDirective = hostDirective;
+}
+
+const TRobotsTxtRulesHandlerBase::TErrorVector& TRobotsTxtRulesHandlerBase::GetErrors() const {
+ return Errors;
+}
+
+TVector<int> TRobotsTxtRulesHandlerBase::GetAcceptedLines(const ui32 botId) const {
+ TVector<int> ret;
+ for (size_t i = 0; i < CrossSectionAcceptedLines.size(); ++i)
+ ret.push_back(CrossSectionAcceptedLines[i]);
+
+ bool hasLinesForBotId = false;
+ for (size_t i = 0; i < AcceptedLines.size(); ++i) {
+ if (AcceptedLines[i].first == botId) {
+ hasLinesForBotId = true;
+ break;
+ }
+ }
+
+ for (size_t i = 0; i < AcceptedLines.size(); ++i) {
+ if (hasLinesForBotId && AcceptedLines[i].first == botId) {
+ ret.push_back(AcceptedLines[i].second);
+ } else if (!hasLinesForBotId && AcceptedLines[i].first == robotstxtcfg::id_anybot) {
+ ret.push_back(AcceptedLines[i].second);
+ }
+ }
+
+ Sort(ret.begin(), ret.end());
+
+ return ret;
+}
+
+void TRobotsTxtRulesHandlerBase::AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection) {
+ if (isCrossSection) {
+ CrossSectionAcceptedLines.push_back(line);
+ return;
+ }
+
+ for (auto botId : botIds) {
+ AcceptedLines.push_back(TBotIdAcceptedLine(botId, line));
+ }
+}
+
+void TRobotsTxtRulesHandlerBase::SetErrorsHandling(bool handleErrors) {
+ HandleErrors = handleErrors;
+}
+
+bool TRobotsTxtRulesHandlerBase::IsHandlingErrors() const {
+ return HandleErrors;
+}
+
+EDirectiveType TRobotsTxtRulesHandlerBase::NameToDirType(const char* d) {
+ if (!strcmp("disallow", d))
+ return DISALLOW;
+ if (!strcmp("allow", d))
+ return ALLOW;
+ if (!strcmp("user-agent", d))
+ return USER_AGENT;
+ if (!strcmp("host", d))
+ return HOST;
+ if (!strcmp("sitemap", d))
+ return SITEMAP;
+ if (!strcmp("clean-param", d))
+ return CLEAN_PARAM;
+ if (!strcmp("crawl-delay", d))
+ return CRAWL_DELAY;
+ return UNKNOWN;
+}
+
+const char* TRobotsTxtRulesHandlerBase::DirTypeToName(EDirectiveType t) {
+ static const char* name[] = {"Allow", "Crawl-Delay", "Disallow", "Host", "Clean-Param", "Sitemap", "User-Agent", "Unknown"};
+ switch (t) {
+ case ALLOW:
+ return name[0];
+ case CRAWL_DELAY:
+ return name[1];
+ case DISALLOW:
+ return name[2];
+ case HOST:
+ return name[3];
+ case CLEAN_PARAM:
+ return name[4];
+ case SITEMAP:
+ return name[5];
+ case USER_AGENT:
+ return name[6];
+ case UNKNOWN:
+ return name[7];
+ }
+ return name[7];
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckRobot(
+ const char* userAgent,
+ TBotIdSet& botIds,
+ const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength) const
+{
+ TCaseInsensitiveStringBuf agent(userAgent);
+
+ for (size_t botIndex = 0; botIndex < robotstxtcfg::max_botid; ++botIndex) {
+ if (!IsBotIdSupported(botIndex))
+ continue;
+
+ bool hasRequiredAgentNamePrefix = agent.StartsWith(robotstxtcfg::GetReqPrefix(botIndex));
+ bool isContainedInFullName = robotstxtcfg::GetFullName(botIndex).StartsWith(agent);
+ bool wasMoreImportantAgent = false;
+ if (botIdToMaxAppropriateUserAgentNameLength)
+ wasMoreImportantAgent = agent.size() < (*botIdToMaxAppropriateUserAgentNameLength)[botIndex];
+
+ if (hasRequiredAgentNamePrefix && isContainedInFullName && !wasMoreImportantAgent) {
+ botIds.insert(botIndex);
+ }
+ }
+
+ return !botIds.empty();
+}
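A standalone restatement of the two checks above, useful as a worked example. The real code additionally prefers the longest matching agent name via botIdToMaxAppropriateUserAgentNameLength, so a specific section overrides the generic "Yandex" one:

    #include <library/cpp/robots_txt/robotstxtcfg/user_agents.h>

    bool AgentMatchesBot(TCaseInsensitiveStringBuf agent, ui32 botId) {
        return agent.StartsWith(robotstxtcfg::GetReqPrefix(botId))
            && robotstxtcfg::GetFullName(botId).StartsWith(agent);
    }

    // AgentMatchesBot("YandexImages", robotstxtcfg::id_yandeximagesbot) -> true
    // AgentMatchesBot("YandexImages", robotstxtcfg::id_yandexbot)       -> false
    //   ("YandexBot/3.0" does not start with the agent)
    // AgentMatchesBot("Yandex",       robotstxtcfg::id_yandexbot)       -> true, as for every
    //   other bot whose required prefix is "Yandex"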
+
+int TRobotsTxtRulesHandlerBase::CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler) {
+ if (!rulesHandler->IsHandlingErrors())
+ return 0;
+
+ if (auto len = strlen(value); len > max_rule_length) {
+ rulesHandler->AddError(ERROR_RULE_HUGE, line);
+ }
+
+ bool upper = false, suspect = false;
+ for (const char* r = value; *r; ++r) {
+ if (!upper && isupper(*r))
+ upper = true;
+ if (!suspect && !isalnum(*r) && !strchr("/_?=.-*%&~[]:;@", *r) && (*(r + 1) || *r != '$'))
+ suspect = true;
+ }
+ if (suspect)
+ rulesHandler->AddError(WARNING_SUSPECT_SYMBOL, line);
+ if (upper)
+ rulesHandler->AddError(WARNING_UPPER_REGISTER, line);
+ return suspect || upper;
+}
+
+void TRobotsTxtRulesHandlerBase::AddError(EFormatErrorType type, int line) {
+ if (!HandleErrors)
+ return;
+ Errors.push_back(std::make_pair(type, line));
+}
+
+void TRobotsTxtRulesHandlerBase::ResetOptimized() noexcept {
+ for (ui32 i = 0; i < OptimizedBotIdToStoredBotId.size(); ++i) {
+ OptimizedBotIdToStoredBotId[i] = i; // by default, every bot maps to itself
+ }
+}
+
+void TRobotsTxtRulesHandlerBase::Clear() {
+ SiteMaps.clear();
+ CleanParams.clear();
+ HostDirective = "";
+ if (HandleErrors) {
+ AcceptedLines.clear();
+ CrossSectionAcceptedLines.clear();
+ Errors.clear();
+ }
+
+ for (size_t botId = 0; botId < BotIdToInfo.size(); ++botId) {
+ BotIdToInfo[botId].CrawlDelay = -1;
+ }
+
+ LoadedBotIds.clear();
+}
+
+void TRobotsTxtRulesHandlerBase::ClearInternal(const ui32 botId) {
+ CheckBotIdValidity(botId);
+ BotIdToInfo[botId].CrawlDelay = -1;
+
+ TVector<TBotIdAcceptedLine> newAcceptedLines;
+ for (size_t i = 0; i < AcceptedLines.size(); ++i)
+ if (AcceptedLines[i].first != botId)
+ newAcceptedLines.push_back(AcceptedLines[i]);
+
+ AcceptedLines.swap(newAcceptedLines);
+}
+
+int TRobotsTxtRulesHandlerBase::CheckHost(const char* host) {
+ THttpURL parsed;
+ TString copyHost = host;
+
+ if (GetHttpPrefixSize(copyHost) == 0) {
+ copyHost = TString("http://") + copyHost;
+ }
+
+ return parsed.Parse(copyHost.data(), THttpURL::FeaturesRobot) == THttpURL::ParsedOK && parsed.GetField(THttpURL::FieldHost) != TString("");
+}
+
+int TRobotsTxtRulesHandlerBase::CheckSitemapUrl(const char* url, const char* host, TString& modifiedUrl) {
+ if (host != nullptr && strlen(url) > 0 && url[0] == '/') {
+ modifiedUrl = TString(host) + url;
+ } else {
+ modifiedUrl = url;
+ }
+
+ url = modifiedUrl.data();
+
+ if (strlen(url) >= URL_MAX - 8)
+ return 0;
+ THttpURL parsed;
+ if (parsed.Parse(url, THttpURL::FeaturesRobot) || !parsed.IsValidAbs())
+ return 0;
+ if (parsed.GetScheme() != THttpURL::SchemeHTTP && parsed.GetScheme() != THttpURL::SchemeHTTPS)
+ return 0;
+ return CheckHost(parsed.PrintS(THttpURL::FlagHostPort).data());
+}
+
+// value is a space-separated pair: clean-params (joined by '&') and a path prefix
+int TRobotsTxtRulesHandlerBase::CheckAndNormCleanParam(TString& value) {
+ if (value.find(' ') == TString::npos) {
+ value.push_back(' ');
+ }
+
+ const char* s = value.data();
+ if (!s || !*s || strlen(s) > URL_MAX / 2 - 9)
+ return 0;
+ const char* p = s;
+ while (*p && !isspace(*p))
+ ++p;
+ for (; s != p; ++s) {
+ // only the following non-alphanumeric symbols are allowed
+ if (!isalnum(*s) && !strchr("+-=_&%[]{}():.", *s))
+ return 0;
+ // clean-params for a prefix may be listed with '&' as a separator; "&&" is not valid syntax
+ if (*s == '&' && *(s + 1) == '&')
+ return 0;
+ }
+ const char* pathPrefix = p + 1;
+ while (isspace(*p))
+ ++p;
+ char r[URL_MAX];
+ char* pr = r;
+ for (; *p; ++p) {
+ if (!isalnum(*p) && !strchr(".-/*_,;:%", *p))
+ return 0;
+ if (*p == '*')
+ *pr++ = '.';
+ if (*p == '.')
+ *pr++ = '\\';
+ *pr++ = *p;
+ }
+ *pr++ = '.';
+ *pr++ = '*';
+ *pr = 0;
+ TString params = value.substr(0, pathPrefix - value.data());
+ value = params + r;
+ return 1;
+}
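A worked example of the normalisation above. The helper uses no object state, but whether it can be called like this from outside the handler depends on its declaration in robots_txt.h, so treat the call form as an assumption:

    TString v = "sid&ref /forum/*.php";
    if (TRobotsTxtRulesHandlerBase::CheckAndNormCleanParam(v)) {
        // v == "sid&ref /forum/.*\.php.*": the parameter list is kept verbatim, the path
        // prefix becomes a regular expression ('*' -> ".*", '.' escaped, ".*" appended).
    }
    // A value without a path prefix first gets a trailing space: "sid" -> "sid .*".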
+
+int TRobotsTxtRulesHandlerBase::ParseCrawlDelay(const char* value, int& crawlDelay) {
+ static const int MAX_CRAWL_DELAY = 1 << 10;
+ int val = 0;
+ const char* p = value;
+ for (; isdigit(*p); ++p) {
+ val = val * 10 + *p - '0';
+ if (val > MAX_CRAWL_DELAY)
+ return 0;
+ }
+ if (*p) {
+ if (*p++ != '.')
+ return 0;
+ if (strspn(p, "1234567890") != strlen(p))
+ return 0;
+ }
+ for (const char* s = p; s - p < 3; ++s)
+ val = val * 10 + (s < p + strlen(p) ? *s - '0' : 0);
+ crawlDelay = val;
+ return 1;
+}
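Quick worked examples of the conversion above: the value is read as seconds with up to three fractional digits and stored as milliseconds. As with the previous helper, the call form assumes the function is reachable from the calling context:

    int delayMs = 0;
    TRobotsTxtRulesHandlerBase::ParseCrawlDelay("2.5", delayMs);   // delayMs == 2500
    TRobotsTxtRulesHandlerBase::ParseCrawlDelay("1", delayMs);     // delayMs == 1000
    TRobotsTxtRulesHandlerBase::ParseCrawlDelay("0.25", delayMs);  // delayMs == 250
    TRobotsTxtRulesHandlerBase::ParseCrawlDelay("5000", delayMs);  // returns 0 (above the
                                                                   // 1 << 10 cap); delayMs unchanged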
+
+bool TRobotsTxtRulesHandlerBase::AddRuleWithErrorCheck(const ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser) {
+ if (!IsBotIdSupported(botId))
+ return true;
+
+ if (!AddRule(botId, rule, type)) {
+ AddError(ERROR_ROBOTS_HUGE, parser.GetLineNumber());
+ AfterParse(botId);
+ return false;
+ }
+ return true;
+}
+
+int TRobotsTxtRulesHandlerBase::OnHost(const ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler) {
+ // Temporary hack to repack robots.txt correctly from the new format to the old one.
+ // Remove it once robot-stable-2010-10-17 is deployed in production.
+ if (!IsBotIdSupported(botId))
+ return 0;
+ // end of hack
+
+ if (rulesHandler->HostDirective != "")
+ rulesHandler->AddError(ERROR_HOST_MULTI, parser.GetLineNumber());
+ else {
+ if (!CheckHost(value))
+ rulesHandler->AddError(ERROR_HOST_FORMAT, parser.GetLineNumber());
+ else {
+ rulesHandler->SetHostDirective(value);
+ if (!rulesHandler->AddRuleWithErrorCheck(botId, value, 'H', parser))
+ return 2;
+ }
+ }
+ return 0;
+}
+
+bool TRobotsTxtRulesHandlerBase::IsBotIdLoaded(const ui32 botId) const {
+ return LoadedBotIds.contains(botId);
+}
+
+bool TRobotsTxtRulesHandlerBase::IsBotIdSupported(const ui32 botId) const {
+ return (SaveDataForAnyBot && botId == robotstxtcfg::id_anybot) || SupportedBotIds.contains(botId);
+}
+
+ui32 TRobotsTxtRulesHandlerBase::GetNotOptimizedBotId(const ui32 botId) const {
+ return (botId < OptimizedBotIdToStoredBotId.size())
+ ? OptimizedBotIdToStoredBotId[botId]
+ : botId;
+}
+
+TMaybe<ui32> TRobotsTxtRulesHandlerBase::GetMappedBotId(ui32 botId, bool useAny) const {
+ botId = GetNotOptimizedBotId(botId);
+ CheckBotIdValidity(botId);
+ if (IsBotIdLoaded(botId))
+ return botId;
+ if (useAny)
+ return robotstxtcfg::id_anybot;
+ return {};
+}