diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/robots_txt | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/robots_txt')
-rw-r--r-- | library/cpp/robots_txt/constants.h | 9 | ||||
-rw-r--r-- | library/cpp/robots_txt/prefix_tree.cpp | 172 | ||||
-rw-r--r-- | library/cpp/robots_txt/prefix_tree.h | 47 | ||||
-rw-r--r-- | library/cpp/robots_txt/prefix_tree_rules_handler.cpp | 706 | ||||
-rw-r--r-- | library/cpp/robots_txt/robots_txt.h | 605 | ||||
-rw-r--r-- | library/cpp/robots_txt/robots_txt_parser.cpp | 116 | ||||
-rw-r--r-- | library/cpp/robots_txt/robots_txt_parser.h | 38 | ||||
-rw-r--r-- | library/cpp/robots_txt/robotstxtcfg.h | 3 | ||||
-rw-r--r-- | library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp | 2 | ||||
-rw-r--r-- | library/cpp/robots_txt/robotstxtcfg/bot_id_set.h | 132 | ||||
-rw-r--r-- | library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp | 2 | ||||
-rw-r--r-- | library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h | 11 | ||||
-rw-r--r-- | library/cpp/robots_txt/robotstxtcfg/user_agents.cpp | 2 | ||||
-rw-r--r-- | library/cpp/robots_txt/robotstxtcfg/user_agents.h | 292 | ||||
-rw-r--r-- | library/cpp/robots_txt/rules_handler.cpp | 514 |
15 files changed, 2651 insertions, 0 deletions
diff --git a/library/cpp/robots_txt/constants.h b/library/cpp/robots_txt/constants.h new file mode 100644 index 0000000000..e5e2a57e18 --- /dev/null +++ b/library/cpp/robots_txt/constants.h @@ -0,0 +1,9 @@ +#pragma once + +#include <util/generic/size_literals.h> +#include <util/system/defaults.h> + + +constexpr auto robots_max = 500_KB; +constexpr auto max_rules_count = 10'000; +constexpr auto max_rule_length = 10_KB; diff --git a/library/cpp/robots_txt/prefix_tree.cpp b/library/cpp/robots_txt/prefix_tree.cpp new file mode 100644 index 0000000000..f7b1848a43 --- /dev/null +++ b/library/cpp/robots_txt/prefix_tree.cpp @@ -0,0 +1,172 @@ +#include <cstring> +#include <algorithm> + +#include "prefix_tree.h" + +TPrefixTreeNodeElement::TPrefixTreeNodeElement() + : Key(nullptr) + , KeyLen(0) + , Val(-1) + , Index(-1) +{ +} + +TPrefixTreeNodeElement::TPrefixTreeNodeElement(const char* key, i32 keyLen = 0, i32 val = -1, i32 index = -1) + : Key(key) + , KeyLen(keyLen) + , Val(val) + , Index(index) +{ +} + +TPrefixTreeNode::TPrefixTreeNode() + : Elements() +{ +} + +int TPrefixTreeNode::Find(char ch) const { + for (size_t i = 0; i < Elements.size(); ++i) + if (ch == *(Elements[i].Key)) + return i; + return -1; +} + +void TPrefixTreeNode::Set(const char* key, i32 keyLen, i32 val, i32 index) { + TPrefixTreeNodeElement element(key, keyLen, val, index); + int i = Find(*key); + if (i < 0) + Elements.push_back(element); + else + Elements[i] = element; +} + +void TPrefixTreeNode::Dump(FILE* logFile) const { + if (!logFile) + logFile = stderr; + fprintf(logFile, "size=%" PRISZT "\n", Elements.size()); + static char b[1234]; + for (size_t i = 0; i < Elements.size(); ++i) { + strncpy(b, Elements[i].Key, Elements[i].KeyLen); + b[Elements[i].KeyLen] = 0; + fprintf(logFile, "{key=[%s]:%d, val=%d, index=%d}\n", b, Elements[i].KeyLen, Elements[i].Val, Elements[i].Index); + } +} + +void TPrefixTree::Dump(FILE* logFile) const { + if (!logFile) + logFile = stderr; + fprintf(logFile, "%" PRISZT " 
nodes\n", Nodes.size()); + for (size_t i = 0; i < Nodes.size(); ++i) { + fprintf(logFile, "%" PRISZT ": ", i); + Nodes[i].Dump(logFile); + fprintf(logFile, "\n"); + } +} + +TPrefixTree::TPrefixTree(int maxSize) { + Init(maxSize); +} + +void TPrefixTree::Init(int maxSize) { + Nodes.clear(); + Nodes.reserve(std::max(maxSize + 1, 1)); + Nodes.push_back(TPrefixTreeNode()); +} + +void TPrefixTree::Clear() { + Nodes.clear(); + Init(0); +} + +void TPrefixTree::Add(const char* s, i32 index) { + AddInternal(s, Nodes[0], index); +} + +void TPrefixTree::AddInternal(const char* s, TPrefixTreeNode& node, i32 index) { + if (!s || !*s) + return; + + int i = node.Find(*s); + if (i >= 0) { + TPrefixTreeNodeElement& d = node.Elements[i]; + const char* p = d.Key; + while (*s && (p - d.Key) < d.KeyLen && *s == *p) + ++s, ++p; + + if (*s) { + if ((p - d.Key) < d.KeyLen) { + Nodes.push_back(TPrefixTreeNode()); + Nodes.back().Set(p, d.KeyLen - (p - d.Key), d.Val, d.Index); + Nodes.back().Set(s, strlen(s), -1, index); + + d.Val = Nodes.size() - 1; + d.KeyLen = p - d.Key; + d.Index = INDEX_BOUND; + } else { + if (d.Val != -1 && index < d.Index) + AddInternal(s, Nodes[d.Val], index); + } + } else { + if ((p - d.Key) < d.KeyLen) { + Nodes.push_back(TPrefixTreeNode()); + Nodes.back().Set(p, d.KeyLen - (p - d.Key), d.Val, d.Index); + d.Val = Nodes.size() - 1; + d.KeyLen = p - d.Key; + d.Index = index; + } else { + d.Index = std::min(d.Index, index); + } + } + } else { + node.Set(s, strlen(s), -1, index); + } +} + +int TPrefixTree::GetMemorySize() const { + int res = Nodes.capacity() * sizeof(TPrefixTreeNode); + for (size_t i = 0; i < Nodes.size(); ++i) + res += Nodes[i].Elements.capacity() * sizeof(TPrefixTreeNodeElement); + return res; +} + +void TPrefixTree::Compress() { + Nodes.shrink_to_fit(); + for (size_t i = 0; i < Nodes.size(); ++i) + Nodes[i].Elements.shrink_to_fit(); +} + +i32 TPrefixTree::MinPrefixIndex(const char* s) const { + if (!*s) + return -1; + int i = Nodes[0].Find(*s); + if 
(i < 0) + return -1; + const TPrefixTreeNodeElement* d = &Nodes[0].Elements[i]; + + const char* p = d->Key; + if (!p || !*p) + return -1; + + i32 result = INDEX_BOUND; + i32 nodeIndex = 0; + while (*s == *p) { + if (++p - d->Key >= d->KeyLen) + result = std::min(result, d->Index); + if (!*++s) + break; + + if (p - d->Key >= d->KeyLen) { + nodeIndex = d->Val; + if (nodeIndex == -1) + break; + i = Nodes[nodeIndex].Find(*s); + if (i < 0) + break; + d = &Nodes[nodeIndex].Elements[i]; + p = d->Key; + if (!p || !*p) + break; + } + } + return result < INDEX_BOUND ? result : -1; +} diff --git a/library/cpp/robots_txt/prefix_tree.h b/library/cpp/robots_txt/prefix_tree.h new file mode 100644 index 0000000000..5feafcb74d --- /dev/null +++ b/library/cpp/robots_txt/prefix_tree.h @@ -0,0 +1,47 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> +#include <cstdio> +#include <util/generic/noncopyable.h> + +struct TPrefixTreeNodeElement { + const char* Key; + i32 KeyLen; + i32 Val; + i32 Index; + + TPrefixTreeNodeElement(); + TPrefixTreeNodeElement(const char*, i32, i32, i32); +}; + +class TPrefixTreeNode { +public: + TVector<TPrefixTreeNodeElement> Elements; + TPrefixTreeNode(); + + int Find(char) const; + void Set(const char*, i32, i32, i32); + void Dump(FILE*) const; +}; + +class TPrefixTree : TNonCopyable { +private: + static const i32 INDEX_BOUND = 1 << 30; + + TVector<TPrefixTreeNode> Nodes; + +public: + void Init(int); + TPrefixTree(int); + + void Add(const char*, i32); + i32 MinPrefixIndex(const char*) const; + void Clear(); + void Dump(FILE*) const; + int GetMemorySize() const; + void Compress(); + +private: + void AddInternal(const char*, TPrefixTreeNode&, i32); +}; diff --git a/library/cpp/robots_txt/prefix_tree_rules_handler.cpp b/library/cpp/robots_txt/prefix_tree_rules_handler.cpp new file mode 100644 index 0000000000..8dd579d060 --- /dev/null +++ b/library/cpp/robots_txt/prefix_tree_rules_handler.cpp @@ -0,0 +1,706 @@ +#include 
"robots_txt.h" + +#include <util/digest/fnv.h> +#include <util/system/tls.h> +#include <util/generic/buffer.h> +#include <util/generic/yexception.h> + +namespace { + +TString NormalizeRule(TStringBuf rule) { + TString result; + result.reserve(rule.size() + 1); + + // remove consecutive '*' + for (auto c : rule) { + if (c != '*' || !result.EndsWith('*')) { + result.append(c); + } + } + + if (rule == "*") { + result = "/*"; + return result; + } + + // unify suffix + if (result.EndsWith('$')) { + result.pop_back(); + } else if (!result.EndsWith('*')) { + result.append('*'); + } + + return result; +} + +// Prefix rules +bool IsPrefixRule(TStringBuf rule) { + return rule.EndsWith('*') && !TStringBuf(rule.begin(), rule.end() - 1).Contains('*'); +} + +// Converts rule to internal representation, i.e. +// For prefix rules: "/foo", 'D' -> 'D', "/foo" +// For generic rules: "/*foo", 'D' -> ("/*/*foo*", 'd') or ("/*foo$", 'A') -> ("/*foo", 'a') +// The distinction is in uppercase/lowercase rule type +std::pair<TString, char> ConvertRule(TStringBuf rule, char type) { + switch (type) { + case 'H': + case 'S': + case 'C': + case 'P': + return {TString(rule), type}; + case 'A': + case 'D': + break; + default: + return {{}, type}; + } + + auto result = NormalizeRule(rule); + if (IsPrefixRule(result)) { + result.pop_back(); // remove extra '*' from the end + } else { + type = tolower(type); + } + + return {std::move(result), type}; +} + +} // namespace + +TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler( + TBotIdSet supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : TRobotsTxtRulesHandlerBase(supportedBotIds, robotsMaxSize, maxRulesNumber, saveDataForAnyBot) +{} + +TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler( + std::initializer_list<ui32> supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : TRobotsTxtRulesHandlerBase(TBotIdSet(supportedBotIds), robotsMaxSize, 
maxRulesNumber, saveDataForAnyBot) +{} + +TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler( + const TSet<ui32>& supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : TRobotsTxtRulesHandlerBase(supportedBotIds, robotsMaxSize, maxRulesNumber, saveDataForAnyBot) +{} + +bool TPrefixTreeRobotsTxtRulesHandler::Empty(const ui32 botId) const { + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + return !botInfo || (botInfo->BufferPosition <= sizeof(botInfo->BufferPosition)); +} + +TRobotsTxtRulesIterator TPrefixTreeRobotsTxtRulesHandler::GetRulesIterator(const ui32 botId) const { + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + if (!botInfo) { + return {}; + } + return TRobotsTxtRulesIterator(botInfo->Buffer.Get() + sizeof(botInfo->BufferPosition), botInfo->Buffer.Get() + botInfo->BufferPosition); +} + +size_t TPrefixTreeRobotsTxtRulesHandler::GetMemorySize() { + size_t allBotsSize = 0; + for (const auto& botInfo : BotIdToPrefixTreeBotInfo) { + if (!botInfo) { + continue; + } + + allBotsSize += botInfo->PrefixRules.GetMemorySize() + + botInfo->BufferSize * sizeof(char) + + botInfo->ComplexRulesSize * sizeof(char**) + + botInfo->RulesSize * sizeof(char*) + (1 << 8); + } + return allBotsSize; +} + +void TPrefixTreeRobotsTxtRulesHandler::ClearInternal(const ui32 botId) { + if (botId >= BotIdToPrefixTreeBotInfo.size()) { + return; + } + BotIdToPrefixTreeBotInfo[botId].Reset(); + TRobotsTxtRulesHandlerBase::ClearInternal(botId); +} + +bool TPrefixTreeRobotsTxtRulesHandler::OptimizeSize() { + ResetOptimized(); + + TMap<ui64, ui32> hashToBotId; + for (auto botId : LoadedBotIds) { + auto& botInfo = BotIdToPrefixTreeBotInfo[botId]; + if (botInfo->BufferPosition <= sizeof(ui32)) { + botInfo.Reset(); + LoadedBotIds.remove(botId); + continue; + } + + ui64 hash = FnvHash<ui64>(botInfo->Buffer.Get(), botInfo->BufferPosition); + if (auto p = hashToBotId.FindPtr(hash)) { + 
OptimizedBotIdToStoredBotId[botId] = *p; + ClearInternal(botId); + botInfo.Reset(); + } else { + hashToBotId[hash] = botId; + } + } + + if (IsFullTotal()) { + DoAllowAll(); + return false; + } + + return true; +} + +void TPrefixTreeRobotsTxtRulesHandler::Clear() { + for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) + if (IsBotIdSupported(botId)) + ClearInternal(botId); + TRobotsTxtRulesHandlerBase::Clear(); +} + +void TPrefixTreeRobotsTxtRulesHandler::ResizeBuffer(const ui32 botId, int newSize) { + auto& botInfo = GetInfo(botId); + TArrayHolder<char> newBuffer(new char[newSize]); + memcpy(newBuffer.Get(), botInfo.Buffer.Get(), std::min(botInfo.BufferSize, newSize)); + botInfo.Buffer.Swap(newBuffer); + botInfo.BufferSize = newSize; +} + +bool TPrefixTreeRobotsTxtRulesHandler::AddRule(const ui32 botId, TStringBuf rule, char type) { + if (rule.empty() || rule.Contains('\0')) { + return true; + } + + auto& botInfo = GetInfo(botId); + + if (IsFull(botId, rule.size())) { + DoAllowAll(); + return false; + } + + auto [convertedRule, convertedType] = ConvertRule(rule, type); + const auto len = convertedRule.size() + 2; // 1 byte for convertedType and another for '\0' + + if (auto newPos = botInfo.BufferPosition + len; newPos >= size_t(botInfo.BufferSize)) { + size_t newSize = botInfo.BufferSize; + while (newPos >= newSize) + newSize *= 2; + ResizeBuffer(botId, newSize); + } + + auto out = botInfo.Buffer.Get() + botInfo.BufferPosition; + *out++ = convertedType; + strcpy(out, convertedRule.data()); + botInfo.BufferPosition += len; + + if (type == 'A' || type == 'D') { + botInfo.RulesPosition++; + } + + return true; +} + +const char* TPrefixTreeRobotsTxtRulesHandler::GetRule(const ui32 botId, const char* s, char type) const { + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + if (!botInfo) { + return nullptr; + } + + int m = botInfo->RulesPosition + 1; + int k = botInfo->PrefixRules.MinPrefixIndex(s); + if (k >= 0) + m = k; + char* 
rule; + int j; + for (int i = 0; i < botInfo->ComplexRulesPosition; ++i) { + rule = *botInfo->ComplexRules.Get()[i]; + j = botInfo->ComplexRules.Get()[i] - botInfo->Rules.Get(); + if (j >= m) + break; + if (CheckRule(s, rule)) { + m = j; + break; + } + } + if (m >= botInfo->RulesPosition) + return nullptr; + return toupper(*(botInfo->Rules.Get()[m] - 1)) == type ? botInfo->Rules.Get()[m] : nullptr; +} + +inline bool TPrefixTreeRobotsTxtRulesHandler::IsAllowAll(const ui32 botId) const { + const auto id = GetMappedBotId(botId, false); + auto& botInfo = BotIdToPrefixTreeBotInfo[id ? *id : robotstxtcfg::id_anybot]; + return botInfo && botInfo->AllowAll; +} + +inline bool TPrefixTreeRobotsTxtRulesHandler::IsAllowAll() const { + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) + if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsAllowAll(botId)) { + return false; + } + + return true; +} + +inline bool TPrefixTreeRobotsTxtRulesHandler::IsDisallowAll(const ui32 botId, bool useAny) const { + const auto id = GetMappedBotId(botId, false); + if (id) { + const auto& botInfo = BotIdToPrefixTreeBotInfo[*id]; + return botInfo && botInfo->DisallowAll; + } + + auto& botInfo = BotIdToPrefixTreeBotInfo[robotstxtcfg::id_anybot]; + return useAny && botInfo && botInfo->DisallowAll; +} + +inline bool TPrefixTreeRobotsTxtRulesHandler::IsDisallowAll() const { + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) + if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsDisallowAll(botId)) + return false; + + return true; +} + +void TPrefixTreeRobotsTxtRulesHandler::DoAllowAll() { + using robotstxtcfg::id_anybot; + + // Drop all bots to default + SupportedBotIds.insert(id_anybot); + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (IsBotIdSupported(botId)) { + ClearInternal(botId); + OptimizedBotIdToStoredBotId[botId] = id_anybot; + LoadedBotIds.insert(botId); + } + } + + // Initialize anybot with "allow all" 
rule + AddRule(id_anybot, "/", 'A'); + GetInfo(id_anybot).AllowAll = true; + SaveRulesToBuffer(); +} + +void TPrefixTreeRobotsTxtRulesHandler::DoDisallowAll() { + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (!IsBotIdSupported(botId)) + continue; + ClearInternal(botId); + if (botId == robotstxtcfg::id_anybot) { + auto& botInfo = GetInfo(botId); + AddRule(botId, "/", 'D'); + botInfo.DisallowAll = true; + SaveRulesToBuffer(); + } else { + OptimizedBotIdToStoredBotId[botId] = robotstxtcfg::id_anybot; + } + LoadedBotIds.insert(botId); + } +} + +const char* TPrefixTreeRobotsTxtRulesHandler::IsDisallow(const ui32 botId, const char* s, bool useAny) const { + const auto id = GetMappedBotId(botId, useAny); + if (!id) + return nullptr; + + const auto& botInfo = BotIdToPrefixTreeBotInfo[*id]; + if (botInfo && IsDisallowAll(*id, useAny)) { + int index = (const_cast<TPrefixTreeRobotsTxtRulesHandler*>(this))->FindRuleAll(*botInfo, 'D'); + if (index < 0) { //o_O + return botInfo->Rules.Get()[0]; + } else { + return botInfo->Rules.Get()[index]; + } + } + + return GetRule(*id, s, 'D'); +} + +const char* TPrefixTreeRobotsTxtRulesHandler::IsAllow(const ui32 botId, const char* s) const { + const auto id = GetMappedBotId(botId, true); + if (auto p = GetRule(*id, s, 'A')) + return p; + return GetRule(*id, s, 'D') ? 
nullptr : "/"; +} + +int TPrefixTreeRobotsTxtRulesHandler::StrLenWithoutStars(const char* s) { + int len = 0; + + for (size_t index = 0; s[index]; ++index) { + if (s[index] != '*') { + ++len; + } + } + + return len; +} + +int TPrefixTreeRobotsTxtRulesHandler::TraceBuffer(const ui32 botId, int countRules, const TArrayHolder<TRuleInfo>* ruleInfos) { + CheckBotIdValidity(botId); + auto& prefixBotInfo = GetInfo(botId); + TBotInfo& botInfo = BotIdToInfo[botId]; + + bool store = countRules >= 0; + if (store) { + prefixBotInfo.Rules.Reset(new char*[prefixBotInfo.RulesSize = countRules]); + } + + int beg = -1, n = 0; + *((int*)prefixBotInfo.Buffer.Get()) = prefixBotInfo.BufferSize; + for (size_t i = sizeof(prefixBotInfo.BufferPosition); i < prefixBotInfo.BufferPosition; ++i) + if (prefixBotInfo.Buffer.Get()[i] == '\n' || prefixBotInfo.Buffer.Get()[i] == 0) { + if (beg < 0 || beg + 1 == (int)i) + continue; + + char* s = prefixBotInfo.Buffer.Get() + beg; + if (store) { + switch (*s) { + case 'H': + HostDirective = s + 1; + break; + case 'S': + SiteMaps.insert(s + 1); + break; + case 'C': + ParseCrawlDelay(s + 1, botInfo.CrawlDelay); + break; + case 'P': + CleanParams.insert(s + 1); + break; + default: + prefixBotInfo.Rules.Get()[n] = s + 1; + (*ruleInfos).Get()[n].Len = StrLenWithoutStars(s + 1); + (*ruleInfos).Get()[n].Allow = toupper(*s) == 'A'; + + prefixBotInfo.HasAllow |= toupper(*s) == 'A'; + prefixBotInfo.HasDisallow |= toupper(*s) == 'D'; + break; + } + } + n += (*s != 'H' && *s != 'S' && *s != 'C' && *s != 'P'); + beg = -1; + } else if (beg < 0) + beg = i; + + return n; +} + +int TPrefixTreeRobotsTxtRulesHandler::FindRuleAll(const TPrefixTreeBotInfo& prefixBotInfo, const char neededType) { + static const char* all[] = {"*", "/", "*/", "/*", "*/*"}; + for (int ruleNumber = prefixBotInfo.RulesSize - 1; ruleNumber >= 0; --ruleNumber) { + const char* curRule = prefixBotInfo.Rules.Get()[ruleNumber]; + char ruleType = *(curRule - 1); + + if (strlen(curRule) > 3) + break; 
+ if (neededType != ruleType) + continue; + + for (size_t i = 0; i < sizeof(all) / sizeof(char*); ++i) + if (strcmp(all[i], curRule) == 0) + return ruleNumber; + } + return -1; +} + +bool TPrefixTreeRobotsTxtRulesHandler::HasDisallowRulePrevAllowAll(const TPrefixTreeBotInfo& prefixBotInfo, int ruleAllAllow) { + for (int ruleNumber = ruleAllAllow - 1; ruleNumber >= 0; --ruleNumber) { + const char* curRule = prefixBotInfo.Rules.Get()[ruleNumber]; + char ruleType = *(curRule - 1); + if (tolower(ruleType) == 'd') + return true; + } + return false; +} + +bool TPrefixTreeRobotsTxtRulesHandler::CheckAllowDisallowAll(const ui32 botId, const bool checkDisallow) { + CheckBotIdValidity(botId); + + auto& botInfo = GetInfo(botId); + + if (botInfo.RulesSize == 0) + return !checkDisallow; + if (botInfo.RulesPosition <= 0) + return 0; + + if (checkDisallow) + return !botInfo.HasAllow && FindRuleAll(botInfo, 'D') >= 0; + int ruleAllAllow = FindRuleAll(botInfo, 'A'); + if (ruleAllAllow == -1) + return !botInfo.HasDisallow; + return !HasDisallowRulePrevAllowAll(botInfo, ruleAllAllow); +} + +void TPrefixTreeRobotsTxtRulesHandler::SortRules( + TPrefixTreeBotInfo& prefixBotInfo, + size_t count, + const TArrayHolder<TRuleInfo>* ruleInfos) { + TVector<size_t> indexes(count); + for (size_t index = 0; index < count; ++index) + indexes[index] = index; + + TRulesSortFunc sortFunc(ruleInfos); + std::sort(indexes.begin(), indexes.end(), sortFunc); + + TArrayHolder<char*> workingCopy; + workingCopy.Reset(new char*[count]); + + for (size_t index = 0; index < count; ++index) + workingCopy.Get()[index] = prefixBotInfo.Rules.Get()[index]; + for (size_t index = 0; index < count; ++index) + prefixBotInfo.Rules.Get()[index] = workingCopy.Get()[indexes[index]]; +} + +void TPrefixTreeRobotsTxtRulesHandler::SaveRulesToBuffer() { + // as sitemaps, clean-params and HostDirective from prefix tree was deleted + for (const auto& sitemap: SiteMaps) + AddRule(robotstxtcfg::id_anybot, sitemap, 'S'); + for (const 
auto& param : CleanParams) + AddRule(robotstxtcfg::id_anybot, param, 'P'); + if (!HostDirective.empty()) + AddRule(robotstxtcfg::id_anybot, HostDirective, 'H'); +} + +void TPrefixTreeRobotsTxtRulesHandler::SaveRulesFromBuffer(const ui32 botId) { + CheckBotIdValidity(botId); + + auto& botInfo = GetInfo(botId); + + TArrayHolder<TRuleInfo> ruleInfos; + + int n = TraceBuffer(botId, -1, nullptr), countPrefix = 0; + ruleInfos.Reset(new TRuleInfo[n]); + botInfo.RulesPosition = TraceBuffer(botId, n, &ruleInfos); + assert(botInfo.RulesPosition == n); + + SortRules(botInfo, n, &ruleInfos); + + botInfo.DisallowAll = CheckAllowDisallowAll(botId, true); + botInfo.AllowAll = CheckAllowDisallowAll(botId, false); + + for (int i = 0; i < n; ++i) + countPrefix += !!isupper(*(botInfo.Rules.Get()[i] - 1)); + + botInfo.PrefixRules.Init(countPrefix); + botInfo.ComplexRules.Reset(new char**[botInfo.ComplexRulesSize = n - countPrefix]); + botInfo.ComplexRulesPosition = 0; + + for (int i = 0; i < n; ++i) { + char* s = botInfo.Rules.Get()[i]; + if (isupper(*(s - 1))) + botInfo.PrefixRules.Add(s, i); + else + botInfo.ComplexRules.Get()[botInfo.ComplexRulesPosition++] = &botInfo.Rules.Get()[i]; + } + botInfo.PrefixRules.Compress(); +} + +void TPrefixTreeRobotsTxtRulesHandler::AfterParse(const ui32 botId) { + CheckBotIdValidity(botId); + + auto& botInfo = GetInfo(botId); + + ResizeBuffer(botId, botInfo.BufferPosition); + SaveRulesFromBuffer(botId); + + if (botInfo.RulesPosition == 0) { + AddRule(botId, "/", 'A'); + } +} + +TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeBotInfo& TPrefixTreeRobotsTxtRulesHandler::GetInfo(ui32 botId) { + Y_ENSURE(botId < robotstxtcfg::max_botid); + auto& res = BotIdToPrefixTreeBotInfo[botId]; + if (!res) { + res = MakeHolder<TPrefixTreeBotInfo>(); + } + return *res; +} + +bool TPrefixTreeRobotsTxtRulesHandler::CheckRule(const char* s, const char* rule) { + const char* r = rule; + const char* s_end = s + strlen(s); + const char* r_end = r + strlen(r); + // assert( r 
&& !strstr(r, "**") ); + for (; *s; ++s) { + if ((s_end - s + 1) * 2 < (r_end - r)) + return 0; + while (*r == '*') + ++r; + + if (*s == *r) { + ++r; + } else { + while (r != rule && *r != '*') + --r; + + if (*r != '*') + return 0; + if (*r == '*') + ++r; + if (*r == *s) + ++r; + } + } + return !*r || (!*(r + 1) && *r == '*'); +} + +bool TPrefixTreeRobotsTxtRulesHandler::IsFull(ui32 botId, size_t length) const { + Y_ENSURE(botId < robotstxtcfg::max_botid); + const auto& botInfo = BotIdToPrefixTreeBotInfo[botId]; + if (!botInfo) { + return false; + } + + return (size_t(botInfo->RulesPosition) >= MaxRulesNumber) || (botInfo->BufferPosition + length + 300 > size_t(RobotsMaxSize)); +} + +bool TPrefixTreeRobotsTxtRulesHandler::IsFullTotal() const { + size_t allBotsRulesCount = 0; + size_t allBotsBufferSize = 0; + + for (const auto& botInfo : BotIdToPrefixTreeBotInfo) { + if (botInfo) { + allBotsRulesCount += botInfo->RulesPosition; + allBotsBufferSize += botInfo->BufferPosition; + } + } + + return (allBotsRulesCount >= MaxRulesNumber) || (allBotsBufferSize + 300 > size_t(RobotsMaxSize)); +} + +size_t TPrefixTreeRobotsTxtRulesHandler::GetPacked(const char*& data) const { + Y_STATIC_THREAD(TBuffer) + packedRepresentation; + + // calculate size, needed for packed data + size_t totalPackedSize = sizeof(ui32); // num of botids + ui32 numOfSupportedBots = 0; + + for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (!IsBotIdSupported(botId)) { + continue; + } + + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + // botId + packedDataSize + packedData + totalPackedSize += sizeof(ui32) + (botInfo ? 
botInfo->BufferPosition : sizeof(ui32)); + ++numOfSupportedBots; + } + + ((TBuffer&)packedRepresentation).Reserve(totalPackedSize); + + // fill packed data + char* packedPtr = ((TBuffer&)packedRepresentation).Data(); + + *((ui32*)packedPtr) = numOfSupportedBots; + packedPtr += sizeof(ui32); + + for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (!IsBotIdSupported(botId)) { + continue; + } + + const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)]; + memcpy(packedPtr, &botId, sizeof(ui32)); + packedPtr += sizeof(ui32); + + if (botInfo) { + *((ui32*)botInfo->Buffer.Get()) = botInfo->BufferPosition; + memcpy(packedPtr, botInfo->Buffer.Get(), botInfo->BufferPosition); + packedPtr += botInfo->BufferPosition; + } else { + // In absense of bot info we serialize only size of its buffer, which is 4 because it takes 4 bytes + ui32 emptyBufferPosition = sizeof(ui32); + memcpy(packedPtr, &emptyBufferPosition, sizeof(ui32)); + packedPtr += sizeof(ui32); + } + } + + data = ((TBuffer&)packedRepresentation).Data(); + return totalPackedSize; +} + +void TPrefixTreeRobotsTxtRulesHandler::LoadPacked(const char* botsData, const char* botsDataEnd) { + Clear(); + + if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + ui32 numOfBots = *((ui32*)botsData); + botsData += sizeof(ui32); + + for (ui32 botIndex = 0; botIndex < numOfBots; ++botIndex) { + if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + ui32 botId = 0; + memcpy(&botId, botsData, sizeof(ui32)); + botsData += sizeof(ui32); + + // skip bot id's, that not supported for now + if (botId >= robotstxtcfg::max_botid || !IsBotIdSupported(botId)) { + if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + ui32 oneBotPackedSize = 0; + memcpy(&oneBotPackedSize, botsData, sizeof(ui32)); + botsData += 
oneBotPackedSize; + + continue; + } + + //SupportedBotIds.insert(botId); + + auto& botInfo = GetInfo(botId); + + if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + static_assert(sizeof(botInfo.BufferSize) == sizeof(ui32), "BufferSize must be 4 bytes"); + static_assert(sizeof(botInfo.BufferPosition) == sizeof(ui32), "BufferPosition must be 4 bytes"); + + memcpy(&botInfo.BufferSize, botsData, sizeof(ui32)); + memcpy(&botInfo.BufferPosition, botsData, sizeof(ui32)); + + if (Y_UNLIKELY(botsDataEnd != nullptr && (botsData + botInfo.BufferSize) > botsDataEnd)) { + ythrow yexception() << "Buffer overflow"; + } + + botInfo.Buffer.Reset(new char[botInfo.BufferSize]); + memcpy(botInfo.Buffer.Get(), botsData, botInfo.BufferSize); + SaveRulesFromBuffer(botId); + + if (botInfo.BufferSize > (int)sizeof(ui32)) { // empty data for robots means, that we don't have section for this bot + LoadedBotIds.insert(botId); + } + + botsData += botInfo.BufferSize; + } + + OptimizeSize(); +} + +void TPrefixTreeRobotsTxtRulesHandler::Dump(const ui32 botId, FILE* dumpFile) { + if (!dumpFile) + dumpFile = stderr; + fprintf(dumpFile, "User-Agent: %s\n", robotstxtcfg::GetFullName(botId).data()); + for (TRobotsTxtRulesIterator it = GetRulesIterator(botId); it.HasRule(); it.Next()) + fprintf(dumpFile, "%s: %s\n", DirTypeToName(it.GetRuleType()), it.GetInitialRule().data()); +} + +void TPrefixTreeRobotsTxtRulesHandler::Dump(const ui32 botId, IOutputStream& out) { + out << "User-Agent: " << robotstxtcfg::GetFullName(botId) << Endl; + for (TRobotsTxtRulesIterator it = GetRulesIterator(botId); it.HasRule(); it.Next()) + out << DirTypeToName(it.GetRuleType()) << ": " << it.GetInitialRule() << Endl; +} diff --git a/library/cpp/robots_txt/robots_txt.h b/library/cpp/robots_txt/robots_txt.h new file mode 100644 index 0000000000..5ee48fb14f --- /dev/null +++ b/library/cpp/robots_txt/robots_txt.h @@ -0,0 +1,605 @@ +#pragma once + +#include 
"constants.h" +#include "robots_txt_parser.h" +#include "prefix_tree.h" +#include "robotstxtcfg.h" + +#include <util/generic/noncopyable.h> +#include <util/generic/map.h> +#include <util/generic/maybe.h> +#include <util/generic/ptr.h> +#include <util/generic/set.h> + +#include <array> +#include <utility> + + +enum EDirectiveType { + USER_AGENT = 1, + DISALLOW = 2, + ALLOW = 3, + HOST = 4, + SITEMAP = 5, + CRAWL_DELAY = 6, + CLEAN_PARAM = 7, + UNKNOWN = 9, +}; + +enum EFormatErrorType { + ERROR_RULE_NOT_SLASH = 1, + ERROR_ASTERISK_MULTI = 2, + ERROR_HOST_MULTI = 3, + ERROR_ROBOTS_HUGE = 4, + ERROR_RULE_BEFORE_USER_AGENT = 5, + ERROR_RULE_HUGE = 6, + ERROR_HOST_FORMAT = 7, + ERROR_TRASH = 8, + ERROR_SITEMAP_FORMAT = 9, + ERROR_CRAWL_DELAY_FORMAT = 10, + ERROR_CRAWL_DELAY_MULTI = 11, + ERROR_CLEAN_PARAM_FORMAT = 12, + + WARNING_EMPTY_RULE = 30, + WARNING_SUSPECT_SYMBOL = 31, + WARNING_UNKNOWN_FIELD = 33, + WARNING_UPPER_REGISTER = 34, + WARNING_SITEMAP = 35, +}; + +class TRobotsTxtRulesIterator { +private: + const char* Begin = nullptr; + const char* End = nullptr; + +public: + TRobotsTxtRulesIterator() = default; + TRobotsTxtRulesIterator(const char* begin, const char* end); + void Next(); + bool HasRule() const; + const char* GetRule() const; + TString GetInitialRule() const; // unlike GetRule(), it neither omits trailing '$' nor adds redundant '*' + EDirectiveType GetRuleType() const; + + static EDirectiveType CharToDirType(char ch); +}; + +class TRobotsTxtRulesHandlerBase { +public: + typedef TVector<std::pair<EFormatErrorType, int>> TErrorVector; + + TRobotsTxtRulesHandlerBase( + TBotIdSet supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot); + + TRobotsTxtRulesHandlerBase( + const TSet<ui32>& supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot); + + virtual ~TRobotsTxtRulesHandlerBase(); + + int GetCrawlDelay(ui32 botId, bool* realInfo = nullptr) const; + int GetMinCrawlDelay(int defaultCrawlDelay 
= -1) const; + bool IsHandlingErrors() const; + const TString& GetHostDirective() const; + const TVector<TString> GetSiteMaps() const; + const TVector<TString> GetCleanParams() const; + const TErrorVector& GetErrors() const; + TVector<int> GetAcceptedLines(ui32 botId = robotstxtcfg::id_yandexbot) const; + + template <class THostHandler> + static int ParseRules(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, THostHandler* hostHandler, const char* host = nullptr); + static inline void ClearAllExceptCrossSection(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, ui32 botId); + static int CheckHost(const char* host); + static int CheckSitemapUrl(const char* url, const char* host, TString& modifiedUrl); + static int CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler); + static int CheckAndNormCleanParam(TString& s); + static int ParseCrawlDelay(const char* value, int& crawlDelay); + static EDirectiveType NameToDirType(const char* d); + static const char* DirTypeToName(EDirectiveType t); + + void SetErrorsHandling(bool handleErrors); + void SetHostDirective(const char* hostDirective); + void SetCrawlDelay(ui32 botId, int crawlDelay); + void AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection); + void AddSiteMap(const char* sitemap); + void AddCleanParam(const char* cleanParam); + bool AddRuleWithErrorCheck(ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser); + int OnHost(ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler); + + virtual void Clear(); + virtual bool IsAllowAll(ui32 botId) const = 0; + virtual bool IsAllowAll() const = 0; + virtual bool IsDisallowAll(ui32 botId, bool useAny = true) const = 0; + virtual bool IsDisallowAll() const = 0; + virtual const char* IsDisallow(ui32 botId, const char* s, bool useAny = true) const = 0; + virtual const char* IsAllow(ui32 botId, const char* s) const = 0; + virtual 
TRobotsTxtRulesIterator GetRulesIterator(ui32 botId) const = 0; + virtual void Dump(ui32 botId, FILE* logFile) = 0; + virtual void Dump(ui32 botId, IOutputStream& out) = 0; + virtual bool Empty(ui32 botId) const = 0; + virtual void LoadPacked(const char* botsData, const char* botsDataEnd = nullptr) = 0; + virtual size_t GetPacked(const char*& data) const = 0; + virtual void AfterParse(ui32 botId) = 0; + virtual void DoAllowAll() = 0; + virtual void DoDisallowAll() = 0; + bool IsBotIdLoaded(ui32 botId) const; + bool IsBotIdSupported(ui32 botId) const; + ui32 GetNotOptimizedBotId(ui32 botId) const; + TMaybe<ui32> GetMappedBotId(ui32 botId, bool useAny = true) const; + +protected: + void CheckBotIdValidity(ui32 botId) const; + virtual bool OptimizeSize() = 0; + +private: + bool HandleErrors; + +protected: + struct TBotInfo { + int CrawlDelay; + + TBotInfo() + : CrawlDelay(-1) + { + } + }; + + TBotIdSet LoadedBotIds; + TSet<TString> SiteMaps; + TSet<TString> CleanParams; + TString HostDirective; + TErrorVector Errors; + typedef std::pair<ui32, ui32> TBotIdAcceptedLine; + TVector<TBotIdAcceptedLine> AcceptedLines; + TVector<ui32> CrossSectionAcceptedLines; + + TVector<TBotInfo> BotIdToInfo; + int CrawlDelay; + size_t RobotsMaxSize; + size_t MaxRulesNumber; + bool SaveDataForAnyBot; + + TBotIdSet SupportedBotIds; + std::array<ui8, robotstxtcfg::max_botid> OptimizedBotIdToStoredBotId; + + virtual bool IsFull(ui32 botId, size_t length) const = 0; + virtual bool IsFullTotal() const = 0; + virtual bool AddRule(ui32 botId, TStringBuf rule, char type) = 0; + //parts of ParseRules + inline static void CheckRobotsLines(TRobotsTxtRulesHandlerBase* rulesHandler, TVector<int>& nonRobotsLines); + inline static void CheckAsterisk(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber, bool& wasAsterisk); + inline static bool CheckWasUserAgent(TRobotsTxtRulesHandlerBase* rulesHandler, bool wasUserAgent, bool& ruleBeforeUserAgent, bool& wasRule, ui32 lineNumber); 
+ inline static bool CheckRuleNotSlash(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber); + inline static bool CheckSupportedBots(const TBotIdSet& currentBotIds, TBotIdSet& wasRuleForBot, const TBotIdSet& isSupportedBot); + inline static bool CheckEmptyRule(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, EDirectiveType& type, ui32 lineNumber); + inline static bool ProcessSitemap(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, const char* value, const char* host); + inline static bool ProcessCleanParam(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, TString& value); + inline static bool AddRules( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const char* value, + char type, + const TBotIdSet& currentBotIds, + const TBotIdSet& isSupportedBot); + + inline static bool ProcessCrawlDelay( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const TBotIdSet& currentBotIds, + const TBotIdSet& isSupportedBot, + const char* value); + + inline static void ProcessUserAgent( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const TBotIdSet& currentBotIds, + TBotIdSet& wasRuleForBot, + TBotIdSet& isSupportedBot, + TVector<ui32>& botIdToMaxAppropriateUserAgentNameLength, + const char* value); + + bool CheckRobot( + const char* userAgent, + TBotIdSet& botIds, + const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength = nullptr) const; + + virtual void ClearInternal(ui32 botId); + + void AddError(EFormatErrorType type, int line); + + void ResetOptimized() noexcept; +}; + +class TPrefixTreeRobotsTxtRulesHandler: public TRobotsTxtRulesHandlerBase, TNonCopyable { +private: + static const int INIT_BUFFER_SIZE = 1 << 6; + + struct TRuleInfo { + size_t Len; + bool Allow; + }; + + bool IsFull(ui32 botId, size_t length) const override; + bool IsFullTotal() const override; + bool AddRule(ui32 botId, TStringBuf rule, char type) 
override; + const char* GetRule(ui32 botId, const char* s, char type) const; + void ResizeBuffer(ui32 botId, int newSize); + void SaveRulesFromBuffer(ui32 botId); + int TraceBuffer(ui32 botId, int countRules, const TArrayHolder<TRuleInfo>* ruleInfos); + bool CheckAllowDisallowAll(ui32 botId, bool checkDisallow); + void SaveRulesToBuffer(); + int StrLenWithoutStars(const char* s); + +protected: + class TRulesSortFunc { + private: + const TArrayHolder<TRuleInfo>* RuleInfos; + + public: + TRulesSortFunc(const TArrayHolder<TRuleInfo>* ruleInfos) + : RuleInfos(ruleInfos) + { + } + bool operator()(const size_t& lhs, const size_t& rhs) { + const TRuleInfo& left = (*RuleInfos).Get()[lhs]; + const TRuleInfo& right = (*RuleInfos).Get()[rhs]; + return (left.Len == right.Len) ? left.Allow && !right.Allow : left.Len > right.Len; + } + }; + + struct TPrefixTreeBotInfo { + bool DisallowAll = false; + bool AllowAll = false; + bool HasDisallow = false; + bool HasAllow = false; + + TArrayHolder<char> Buffer{new char[INIT_BUFFER_SIZE]}; + ui32 BufferPosition = sizeof(BufferPosition); + int BufferSize = INIT_BUFFER_SIZE; + + TArrayHolder<char*> Rules = nullptr; + int RulesPosition = 0; + int RulesSize = 0; + + TArrayHolder<char**> ComplexRules = nullptr; + int ComplexRulesPosition = 0; + int ComplexRulesSize = 0; + + TPrefixTree PrefixRules {0}; + }; + + std::array<THolder<TPrefixTreeBotInfo>, robotstxtcfg::max_botid> BotIdToPrefixTreeBotInfo; + + TPrefixTreeBotInfo& GetInfo(ui32 botId); + static bool CheckRule(const char* s, const char* rule); + void ClearInternal(ui32 botId) override; + bool OptimizeSize() override; + +private: + void SortRules(TPrefixTreeBotInfo& prefixBotInfo, size_t count, const TArrayHolder<TRuleInfo>* ruleInfos); + bool HasDisallowRulePrevAllowAll(const TPrefixTreeBotInfo& prefixBotInfo, int ruleAllAllow); + int FindRuleAll(const TPrefixTreeBotInfo& prefixBotInfo, char neededType); + +public: + TPrefixTreeRobotsTxtRulesHandler( + TBotIdSet supportedBotIds = 
robotstxtcfg::defaultSupportedBotIds, + int robotsMaxSize = robots_max, + int maxRulesCount = -1, + bool saveDataForAnyBot = true); + + TPrefixTreeRobotsTxtRulesHandler( + std::initializer_list<ui32> supportedBotIds, + int robotsMaxSize = robots_max, + int maxRulesCount = -1, + bool saveDataForAnyBot = true); + + TPrefixTreeRobotsTxtRulesHandler( + const TSet<ui32>& supportedBotIds, + int robotsMaxSize = robots_max, + int maxRulesCount = -1, + bool saveDataForAnyBot = true); + + void Clear() override; + void AfterParse(ui32 botId) override; + bool IsAllowAll(ui32 botId) const override; + bool IsAllowAll() const override; + bool IsDisallowAll(ui32 botId, bool useAny = true) const override; + bool IsDisallowAll() const override; + const char* IsDisallow(ui32 botId, const char* s, bool useAny = true) const override; + const char* IsAllow(ui32 botId, const char* s) const override; + TRobotsTxtRulesIterator GetRulesIterator(ui32 botId) const override; + void DoAllowAll() override; + void DoDisallowAll() override; + bool Empty(ui32 botId) const override; + + void LoadPacked(const char* botsData, const char* botsDataEnd = nullptr) override; + size_t GetPacked(const char*& data) const override; + void Dump(ui32 botId, FILE* logFile) override; + void Dump(ui32 botId, IOutputStream& out) override; + size_t GetMemorySize(); +}; + +using TRobotsTxt = TPrefixTreeRobotsTxtRulesHandler; + +void TRobotsTxtRulesHandlerBase::ClearAllExceptCrossSection(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, ui32 botId) { + rulesHandler->ClearInternal(botId); + if (botId == robotstxtcfg::id_anybot) { + // as sitemaps, clean-params and HostDirective from prefix tree was deleted + for (const auto& sitemap : rulesHandler->SiteMaps) { + rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, sitemap, 'S', parser); + } + for (const auto& param : rulesHandler->CleanParams) { + rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, param, 'P', parser); + } + if 
(!rulesHandler->HostDirective.empty()) { + rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, rulesHandler->HostDirective, 'H', parser); + } + } +} + +void TRobotsTxtRulesHandlerBase::CheckRobotsLines(TRobotsTxtRulesHandlerBase* rulesHandler, TVector<int>& nonRobotsLines) { + if (rulesHandler->IsHandlingErrors()) { + for (size_t i = 0; i < nonRobotsLines.size(); ++i) + rulesHandler->AddError(ERROR_TRASH, nonRobotsLines[i]); + nonRobotsLines.clear(); + } +} + +void TRobotsTxtRulesHandlerBase::CheckAsterisk(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber, bool& wasAsterisk) { + if (strcmp(value, "*") == 0) { + if (wasAsterisk) + rulesHandler->AddError(ERROR_ASTERISK_MULTI, lineNumber); + wasAsterisk = true; + } +} + +bool TRobotsTxtRulesHandlerBase::CheckWasUserAgent(TRobotsTxtRulesHandlerBase* rulesHandler, bool wasUserAgent, bool& ruleBeforeUserAgent, bool& wasRule, ui32 lineNumber) { + if (wasUserAgent) { + wasRule = true; + return false; + } + if (!ruleBeforeUserAgent) { + ruleBeforeUserAgent = true; + rulesHandler->AddError(ERROR_RULE_BEFORE_USER_AGENT, lineNumber); + } + return true; +} + +bool TRobotsTxtRulesHandlerBase::CheckRuleNotSlash(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber) { + if (*value && *value != '/' && *value != '*') { + rulesHandler->AddError(ERROR_RULE_NOT_SLASH, lineNumber); + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::CheckSupportedBots( + const TBotIdSet& currentBotIds, + TBotIdSet& wasRuleForBot, + const TBotIdSet& isSupportedBot) +{ + bool hasAtLeastOneSupportedBot = false; + for (ui32 currentBotId : currentBotIds) { + wasRuleForBot.insert(currentBotId); + hasAtLeastOneSupportedBot = hasAtLeastOneSupportedBot || isSupportedBot.contains(currentBotId); + } + return hasAtLeastOneSupportedBot; +} + +bool TRobotsTxtRulesHandlerBase::CheckEmptyRule(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, EDirectiveType& type, ui32 
lineNumber) { + if (value && strlen(value) == 0) { + rulesHandler->AddError(WARNING_EMPTY_RULE, lineNumber); + type = type == ALLOW ? DISALLOW : ALLOW; + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::AddRules( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const char* value, + char type, + const TBotIdSet& currentBotIds, + const TBotIdSet& isSupportedBot) +{ + for (ui32 currentBotId : currentBotIds) { + if (!isSupportedBot.contains(currentBotId)) + continue; + if (!rulesHandler->AddRuleWithErrorCheck(currentBotId, value, type, parser)) + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::ProcessSitemap(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, const char* value, const char* host) { + TString modifiedUrl; + if (!CheckSitemapUrl(value, host, modifiedUrl)) + rulesHandler->AddError(ERROR_SITEMAP_FORMAT, parser.GetLineNumber()); + else { + rulesHandler->AddSiteMap(modifiedUrl.data()); + if (!rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, modifiedUrl.data(), 'S', parser)) + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::ProcessCleanParam(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, TString& value) { + if (!CheckAndNormCleanParam(value)) + rulesHandler->AddError(ERROR_CLEAN_PARAM_FORMAT, parser.GetLineNumber()); + else { + rulesHandler->AddCleanParam(value.data()); + if (!rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, value.data(), 'P', parser)) + return true; + } + return false; +} + +bool TRobotsTxtRulesHandlerBase::ProcessCrawlDelay( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const TBotIdSet& currentBotIds, + const TBotIdSet& isSupportedBot, + const char* value) { + for (ui32 currentBotId : currentBotIds) { + if (!isSupportedBot.contains(currentBotId)) + continue; + if (rulesHandler->BotIdToInfo[currentBotId].CrawlDelay >= 0) { + 
rulesHandler->AddError(ERROR_CRAWL_DELAY_MULTI, parser.GetLineNumber()); + break; + } + int crawlDelay = -1; + if (!ParseCrawlDelay(value, crawlDelay)) + rulesHandler->AddError(ERROR_CRAWL_DELAY_FORMAT, parser.GetLineNumber()); + else { + rulesHandler->SetCrawlDelay(currentBotId, crawlDelay); + if (!rulesHandler->AddRuleWithErrorCheck(currentBotId, value, 'C', parser)) + return true; + } + } + return false; +} + +void TRobotsTxtRulesHandlerBase::ProcessUserAgent( + TRobotsTxtRulesHandlerBase* rulesHandler, + TRobotsTxtParser& parser, + const TBotIdSet& currentBotIds, + TBotIdSet& wasSupportedBot, + TBotIdSet& isSupportedBot, + TVector<ui32>& botIdToMaxAppropriateUserAgentNameLength, + const char* value) +{ + ui32 userAgentNameLength = (ui32)strlen(value); + + for (ui32 currentBotId : currentBotIds) { + bool userAgentNameLonger = userAgentNameLength > botIdToMaxAppropriateUserAgentNameLength[currentBotId]; + bool userAgentNameSame = userAgentNameLength == botIdToMaxAppropriateUserAgentNameLength[currentBotId]; + + if (!wasSupportedBot.contains(currentBotId) || userAgentNameLonger) + ClearAllExceptCrossSection(parser, rulesHandler, currentBotId); + + wasSupportedBot.insert(currentBotId); + if (userAgentNameLonger || userAgentNameSame) { + isSupportedBot.insert(currentBotId); // Allow multiple blocks for the same user agent + } + botIdToMaxAppropriateUserAgentNameLength[currentBotId] = Max(userAgentNameLength, botIdToMaxAppropriateUserAgentNameLength[currentBotId]); + } +} + +template <class THostHandler> +int TRobotsTxtRulesHandlerBase::ParseRules(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, THostHandler* hostHandler, const char* host) { + rulesHandler->Clear(); + + TBotIdSet wasSupportedBot; + TBotIdSet wasRuleForBot; + bool wasAsterisk = false; + TVector<int> nonRobotsLines; + TVector<ui32> botIdToMaxAppropriateUserAgentNameLength(robotstxtcfg::max_botid, 0); + static char all[] = "/"; + EDirectiveType prevType = USER_AGENT; + while 
(parser.HasRecord()) { + TRobotsTxtRulesRecord record = parser.NextRecord(); + bool wasUserAgent = false; + bool isRobotsRecordUseful = false; + TBotIdSet isSupportedBot; + TBotIdSet currentBotIds; + TString field; + TString value; + bool ruleBeforeUserAgent = false; + int ret = 0; + bool wasRule = false; + bool wasBlank = false; + while (record.NextPair(field, value, isRobotsRecordUseful && rulesHandler->IsHandlingErrors(), nonRobotsLines, &wasBlank)) { + CheckRobotsLines(rulesHandler, nonRobotsLines); + EDirectiveType type = NameToDirType(field.data()); + EDirectiveType typeBeforeChange = type; + + if ((prevType != type || wasBlank) && type == USER_AGENT) { + currentBotIds.clear(); + } + prevType = type; + + switch (type) { + case USER_AGENT: + if (wasUserAgent && wasRule) { + wasRule = false; + currentBotIds.clear(); + isSupportedBot.clear(); + } + wasUserAgent = true; + value.to_lower(); + CheckAsterisk(rulesHandler, value.data(), parser.GetLineNumber(), wasAsterisk); + isRobotsRecordUseful = rulesHandler->CheckRobot(value.data(), currentBotIds, &botIdToMaxAppropriateUserAgentNameLength); + if (isRobotsRecordUseful) + ProcessUserAgent(rulesHandler, parser, currentBotIds, wasSupportedBot, isSupportedBot, botIdToMaxAppropriateUserAgentNameLength, value.data()); + break; + + case DISALLOW: + case ALLOW: + if (CheckWasUserAgent(rulesHandler, wasUserAgent, ruleBeforeUserAgent, wasRule, parser.GetLineNumber())) + break; + if (CheckRuleNotSlash(rulesHandler, value.data(), parser.GetLineNumber())) + break; + CheckRule(value.data(), parser.GetLineNumber(), rulesHandler); + if (!CheckSupportedBots(currentBotIds, wasRuleForBot, isSupportedBot)) { + break; + } + if (CheckEmptyRule(rulesHandler, value.data(), type, parser.GetLineNumber())) { + value = all; + if (typeBeforeChange == ALLOW) + continue; + } + + if (AddRules(rulesHandler, parser, value.data(), type == ALLOW ? 
'A' : 'D', currentBotIds, isSupportedBot)) + return 2; + break; + + case HOST: + value.to_lower(); + ret = hostHandler->OnHost(robotstxtcfg::id_anybot, parser, value.data(), rulesHandler); + if (ret) + return ret; + break; + + case SITEMAP: + if (ProcessSitemap(rulesHandler, parser, value.data(), host)) + return 2; + break; + + case CLEAN_PARAM: + if (ProcessCleanParam(rulesHandler, parser, value)) + return 2; + break; + + case CRAWL_DELAY: + if (ProcessCrawlDelay(rulesHandler, parser, currentBotIds, isSupportedBot, value.data())) + return 2; + break; + + default: + rulesHandler->AddError(WARNING_UNKNOWN_FIELD, parser.GetLineNumber()); + break; + } + bool isCrossSection = type == SITEMAP || type == HOST || type == CLEAN_PARAM; + if (rulesHandler->IsHandlingErrors() && (isRobotsRecordUseful || isCrossSection)) + rulesHandler->AddAcceptedLine(parser.GetLineNumber(), currentBotIds, isCrossSection); + } + } + + for (auto botId : wasSupportedBot) { + rulesHandler->LoadedBotIds.insert(botId); + if (rulesHandler->IsBotIdSupported(botId)) + rulesHandler->AfterParse(botId); + } + + if (!rulesHandler->OptimizeSize()) { + return 2; + } + + return 1; +} diff --git a/library/cpp/robots_txt/robots_txt_parser.cpp b/library/cpp/robots_txt/robots_txt_parser.cpp new file mode 100644 index 0000000000..8e2fe6073d --- /dev/null +++ b/library/cpp/robots_txt/robots_txt_parser.cpp @@ -0,0 +1,116 @@ +#include "robots_txt_parser.h" +#include <util/generic/string.h> +#include <util/stream/output.h> + +TRobotsTxtParser::TRobotsTxtParser(IInputStream& inputStream) + : InputStream(inputStream) + , LineNumber(0) + , IsLastSymbolCR(false) +{ +} + +int TRobotsTxtParser::GetLineNumber() { + return LineNumber; +} + +const char* TRobotsTxtParser::ReadLine() { + Line = ""; + char c; + + if (IsLastSymbolCR) { + if (!InputStream.ReadChar(c)) + return nullptr; + if (c != '\n') + Line.append(c); + } + + bool hasMoreSymbols; + while (hasMoreSymbols = InputStream.ReadChar(c)) { + if (c == '\r') { + 
IsLastSymbolCR = true; + break; + } else { + IsLastSymbolCR = false; + if (c == '\n') + break; + Line.append(c); + } + } + if (!hasMoreSymbols && Line.empty()) + return nullptr; + + // BOM UTF-8: EF BB BF + if (0 == LineNumber && Line.size() >= 3 && Line[0] == '\xEF' && Line[1] == '\xBB' && Line[2] == '\xBF') + Line = Line.substr(3, Line.size() - 3); + + ++LineNumber; + int i = Line.find('#'); + if (i == 0) + Line = ""; + else if (i > 0) + Line = Line.substr(0, i); + return Line.data(); +} + +bool TRobotsTxtParser::IsBlankLine(const char* s) { + for (const char* p = s; *p; ++p) + if (!isspace(*p)) + return 0; + return 1; +} + +char* TRobotsTxtParser::Trim(char* s) { + while (isspace(*s)) + ++s; + char* p = s + strlen(s) - 1; + while (s < p && isspace(*p)) + --p; + *(p + 1) = 0; + return s; +} + +inline bool TRobotsTxtParser::IsRobotsLine(const char* s) { + return strchr(s, ':'); +} + +bool TRobotsTxtParser::HasRecord() { + while (!IsRobotsLine(Line.data())) + if (!ReadLine()) + return 0; + return 1; +} + +TRobotsTxtRulesRecord TRobotsTxtParser::NextRecord() { + return TRobotsTxtRulesRecord(*this); +} + +TRobotsTxtRulesRecord::TRobotsTxtRulesRecord(TRobotsTxtParser& parser) + : Parser(parser) +{ +} + +bool TRobotsTxtRulesRecord::NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank) { + if (wasBlank) { + *wasBlank = false; + } + while (!Parser.IsRobotsLine(Parser.Line.data())) { + if (!Parser.ReadLine()) + return 0; + if (Parser.IsBlankLine(Parser.Line.data())) { + if (wasBlank) { + *wasBlank = true; + } + continue; + } + if (handleErrors && !Parser.IsRobotsLine(Parser.Line.data())) + nonRobotsLines.push_back(Parser.GetLineNumber()); + } + + char* s = strchr(Parser.Line.begin(), ':'); + *s = 0; + char* p = s + 1; + + field = TRobotsTxtParser::Trim(strlwr(Parser.Line.begin())); + value = TRobotsTxtParser::Trim(p); + return 1; +} diff --git a/library/cpp/robots_txt/robots_txt_parser.h 
b/library/cpp/robots_txt/robots_txt_parser.h new file mode 100644 index 0000000000..8032d0d20b --- /dev/null +++ b/library/cpp/robots_txt/robots_txt_parser.h @@ -0,0 +1,38 @@ +#pragma once + +#include <algorithm> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/stream/input.h> + +class TRobotsTxtParser; + +class TRobotsTxtRulesRecord { +private: + TRobotsTxtParser& Parser; + +public: + TRobotsTxtRulesRecord(TRobotsTxtParser& parser); + bool NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank = nullptr); +}; + +class TRobotsTxtParser { + friend class TRobotsTxtRulesRecord; + +private: + IInputStream& InputStream; + TString Line; + int LineNumber; + bool IsLastSymbolCR; + + const char* ReadLine(); + static bool IsBlankLine(const char*); + static bool IsRobotsLine(const char*); + +public: + static char* Trim(char*); + TRobotsTxtParser(IInputStream& inputStream); + bool HasRecord(); + TRobotsTxtRulesRecord NextRecord(); + int GetLineNumber(); +}; diff --git a/library/cpp/robots_txt/robotstxtcfg.h b/library/cpp/robots_txt/robotstxtcfg.h new file mode 100644 index 0000000000..5ca1682a0c --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg.h @@ -0,0 +1,3 @@ +#pragma once + +#include <library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h> diff --git a/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp new file mode 100644 index 0000000000..aec668582c --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp @@ -0,0 +1,2 @@ +#include "bot_id_set.h" +// header compile test diff --git a/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h new file mode 100644 index 0000000000..08aaa68a50 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h @@ -0,0 +1,132 @@ +#pragma once + +#include "user_agents.h" + +#include <bitset> + + +/// Simple bitset-based set for bot
ids, meant to optimize memory and lookups +class TBotIdSet +{ +public: + using TData = std::bitset<robotstxtcfg::max_botid>; + + constexpr TBotIdSet() noexcept = default; + constexpr TBotIdSet(const TBotIdSet&) noexcept = default; + constexpr TBotIdSet(TBotIdSet&&) noexcept = default; + constexpr TBotIdSet& operator = (const TBotIdSet&) noexcept = default; + constexpr TBotIdSet& operator = (TBotIdSet&&) noexcept = default; + + TBotIdSet(std::initializer_list<ui32> botIds) { + for (auto id : botIds) { + insert(id); + } + } + + static TBotIdSet All() noexcept { + TBotIdSet res; + res.Bots.set(); + return res; + } + + constexpr bool contains(ui32 botId) const noexcept { + return (botId < Bots.size()) && Bots[botId]; + } + + /* Adds botId; returns false when the id is out of range or already present. */ + bool insert(ui32 botId) noexcept { + if (botId >= Bots.size() || Bots[botId]) { + return false; + } + Bots[botId] = true; + return true; + } + + /* Removes botId; returns false when the id is out of range or not present. */ + bool remove(ui32 botId) noexcept { + if (botId >= Bots.size() || !Bots[botId]) { + return false; + } + Bots[botId] = false; + return true; + } + + void clear() noexcept { + Bots.reset(); + } + + size_t size() const noexcept { + return Bots.count(); + } + + bool empty() const noexcept { + return Bots.none(); + } + + bool operator==(const TBotIdSet& rhs) const noexcept = default; + + TBotIdSet operator&(TBotIdSet rhs) const noexcept { + rhs.Bots &= Bots; + return rhs; + } + + TBotIdSet operator|(TBotIdSet rhs) const noexcept { + rhs.Bots |= Bots; + return rhs; + } + + TBotIdSet operator~() const noexcept { + TBotIdSet result; + result.Bots = ~Bots; + return result; + } + + class iterator + { + public: + auto operator * () const noexcept { + return BotId; + } + + /* Moves to the next bot id present in the set, or to Bots.size() (the end position) when none is left. The bound is tested before indexing because std::bitset::operator[] is unchecked: evaluating Bots[++BotId] with BotId == Bots.size() on the last step is an out-of-range access. Incrementing an end iterator stays a no-op. */ + iterator& operator ++ () noexcept { + if (BotId < Bots.size()) { + do { + ++BotId; + } while (BotId < Bots.size() && !Bots[BotId]); + } + return *this; + } + + bool operator == (const iterator& rhs) const noexcept { + return (&Bots == &rhs.Bots) && (BotId == rhs.BotId); + } + + bool operator != (const iterator& rhs) const noexcept { + return !(*this == rhs); + } + + private: +
friend class TBotIdSet; + iterator(const TData& bots, ui32 botId) + : Bots(bots) + , BotId(botId) + { + while (BotId < Bots.size() && !Bots[BotId]) { + ++BotId; + } + } + + private: + const TData& Bots; + ui32 BotId; + }; + + iterator begin() const noexcept { + return {Bots, robotstxtcfg::id_anybot}; + } + + iterator end() const noexcept { + return {Bots, robotstxtcfg::max_botid}; + } + +private: + TData Bots {}; +}; diff --git a/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp new file mode 100644 index 0000000000..c5652b81c5 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp @@ -0,0 +1,2 @@ +#include "robotstxtcfg.h" +// header compile test diff --git a/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h new file mode 100644 index 0000000000..2cf9430d7c --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h @@ -0,0 +1,11 @@ +#pragma once + +#include "bot_id_set.h" + + +namespace robotstxtcfg { + +static const TBotIdSet defaultSupportedBotIds = {id_defbot}; +static const TBotIdSet allSupportedBotIds = TBotIdSet::All(); + +} // namespace robotstxtcfg diff --git a/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp b/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp new file mode 100644 index 0000000000..60b353a427 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp @@ -0,0 +1,2 @@ +#include "user_agents.h" +// header compile test diff --git a/library/cpp/robots_txt/robotstxtcfg/user_agents.h b/library/cpp/robots_txt/robotstxtcfg/user_agents.h new file mode 100644 index 0000000000..a56e5b66f4 --- /dev/null +++ b/library/cpp/robots_txt/robotstxtcfg/user_agents.h @@ -0,0 +1,292 @@ +#pragma once + +#include <library/cpp/case_insensitive_string/case_insensitive_string.h> + + +namespace robotstxtcfg { + // robots.txt agents and identifiers + + enum EBots : ui32 { + id_anybot = 0, + id_yandexbot = 1, + 
id_yandexmediabot = 2, + id_yandeximagesbot = 3, + id_googlebot = 4, + id_yandexbotmirr = 5, + id_yahooslurp = 6, + id_msnbot = 7, + id_yandexcatalogbot = 8, + id_yandexdirectbot = 9, + id_yandexblogsbot = 10, + id_yandexnewsbot = 11, + id_yandexpagechk = 12, + id_yandexmetrikabot = 13, + id_yandexbrowser = 14, + id_yandexmarketbot = 15, + id_yandexcalendarbot = 16, + id_yandexwebmasterbot = 17, + id_yandexvideobot = 18, + id_yandeximageresizerbot = 19, + id_yandexadnetbot = 20, + id_yandexpartnerbot = 21, + id_yandexdirectdbot = 22, + id_yandextravelbot = 23, + id_yandexmobilebot = 24, + id_yandexrcabot = 25, + id_yandexdirectdynbot = 26, + id_yandexmobilebot_ed = 27, + id_yandexaccessibilitybot = 28, + id_baidubot = 29, + id_yandexscreenshotbot = 30, + id_yandexmetrikayabs = 31, + id_yandexvideoparserbot = 32, + id_yandexnewsbot4 = 33, + id_yandexmarketbot2 = 34, + id_yandexmedianabot = 35, + id_yandexsearchshopbot = 36, + id_yandexontodbbot = 37, + id_yandexontodbapibot = 38, + id_yandexampbot = 39, + id_yandexvideohosting = 40, + id_yandexmediaselling = 41, + id_yandexverticals = 42, + id_yandexturbobot = 43, + id_yandexzenbot = 44, + id_yandextrackerbot = 45, + id_yandexmetrikabot4 = 46, + id_yandexmobilescreenshotbot = 47, + id_yandexfaviconsbot = 48, + max_botid + }; + + static const ui32 id_defbot = id_yandexbot; + + struct TBotInfo { + TCaseInsensitiveStringBuf ReqPrefix; + TCaseInsensitiveStringBuf FullName; + TStringBuf FromField = {}; + TStringBuf UserAgent = {}; + TStringBuf RotorUserAgent = {}; + bool ExplicitDisallow = false; + }; + + static constexpr TStringBuf UserAgentFrom("support@search.yandex.ru"); + + static constexpr TBotInfo BotInfoArr[] = { + {"*", "*"}, + {"Yandex", "YandexBot/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexMedia/3.0", 
UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexImages/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Google", "GoogleBot"}, + {"Yandex", "YandexBot/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Slurp", "Slurp"}, + {"msn", "msnbot"}, + {"Yandex", "YandexCatalog/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"YaDirectFetcher", "YaDirectFetcher/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + + {"Yandex", "YandexBlogs/0.99", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexNews/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexNews/3.0; robot; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexNews/3.0; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexPagechecker/2.0", 
UserAgentFrom, + "Mozilla/5.0 (compatible; YandexPagechecker/2.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexPagechecker/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexMetrika/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexBrowser/1.0", UserAgentFrom, + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5402 Chrome/19.0.1084.5409 Safari/536.5", + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5402 Chrome/19.0.1084.5409 Safari/536.5", + false}, + {"Yandex", "YandexMarket/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"YandexCalendar", "YandexCalendar/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexCalendar/1.0 +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexCalendar/1.0 +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"Yandex", "YandexWebmaster/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexVideo/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexImageResizer/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; 
YandexImageResizer/2.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + + {"YandexDirect", "YandexDirect/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexPartner", "YandexPartner/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexPartner/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexPartner/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YaDirectFetcher", "YaDirectFetcher/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"Yandex", "YandexTravel/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexTravel/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexTravel/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"Yandex", "YandexBot/3.0", UserAgentFrom, + "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + false}, + {"YandexRCA", "YandexRCA/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexRCA/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexRCA/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + 
true}, + {"YandexDirectDyn", "YandexDirectDyn/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexMobileBot", "YandexMobileBot/3.0", UserAgentFrom, + "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)", + true}, + {"YandexAccessibilityBot", "YandexAccessibilityBot/3.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"Baidu", "Baiduspider"}, + + {"YandexScreenshotBot", "YandexScreenshotBot/3.0", UserAgentFrom, + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexScreenshotBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexScreenshotBot/3.0; +http://yandex.com/bots)", + true}, + {"YandexMetrika", "YandexMetrika/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots yabs01)", + "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots yabs01) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexVideoParser", "YandexVideoParser/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexVideoParser/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; 
YandexVideoParser/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"Yandex", "YandexNews/4.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexNews/4.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexNews/4.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexMarket", "YandexMarket/2.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMarket/2.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMarket/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexMedianaBot", "YandexMedianaBot/1.0", UserAgentFrom, + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexMedianaBot/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexMedianaBot/1.0; +http://yandex.com/bots)", + true}, + {"YandexSearchShop", "YandexSearchShop/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexSearchShop/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexSearchShop/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"Yandex", "YandexOntoDB/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexOntoDB/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexOntoDB/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + false}, + {"YandexOntoDBAPI", "YandexOntoDBAPI/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexOntoDBAPI/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexOntoDBAPI/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"Yandex-AMPHTML", "Yandex-AMPHTML", UserAgentFrom, + "Mozilla/5.0 (compatible; Yandex-AMPHTML; 
+http://yandex.com/bots)", + "Mozilla/5.0 (compatible; Yandex-AMPHTML; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + + {"YandexVideoHosting", "YandexVideoHosting/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexVideoHosting/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexVideoHosting/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexMediaSelling", "YandexMediaSelling/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMediaSelling/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexMediaSelling/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexVerticals", "YandexVerticals/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexVerticals/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexVerticals/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexTurbo", "YandexTurbo/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexTurbo/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexTurbo/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexZenRss", "YandexZenRss/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexZenRss/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexZenRss/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexTracker", "YandexTracker/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexTracker/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexTracker/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexMetrika", "YandexMetrika/4.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexMetrika/4.0; +http://yandex.com/bots)", + "Mozilla/5.0 
(compatible; YandexMetrika/4.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}, + {"YandexMobileScreenShotBot", "YandexMobileScreenShotBot/1.0", UserAgentFrom, + "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/11.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/11.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + true}, + {"YandexFavicons", "YandexFavicons/1.0", UserAgentFrom, + "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots)", + "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.268", + true}}; + + static_assert(std::size(BotInfoArr) == max_botid); + + constexpr auto GetReqPrefix(ui32 botId) { + return BotInfoArr[botId].ReqPrefix; + } + + constexpr auto GetFullName(ui32 botId) { + return BotInfoArr[botId].FullName; + } + + constexpr auto GetFromField(ui32 botId) { + return BotInfoArr[botId].FromField; + } + + constexpr auto GetUserAgent(ui32 botId) { + return BotInfoArr[botId].UserAgent; + } + + constexpr auto GetRotorUserAgent(ui32 botId) { + return BotInfoArr[botId].RotorUserAgent; + } + + constexpr bool IsExplicitDisallow(ui32 botId) { + return BotInfoArr[botId].ExplicitDisallow; + } + + constexpr bool IsYandexBotId(ui32 botId) { + return !BotInfoArr[botId].UserAgent.empty(); + } + +} // namespace robotstxtcfg diff --git a/library/cpp/robots_txt/rules_handler.cpp b/library/cpp/robots_txt/rules_handler.cpp new file mode 100644 index 0000000000..14f6810948 --- /dev/null +++ b/library/cpp/robots_txt/rules_handler.cpp @@ -0,0 +1,514 @@ +#include "robots_txt.h" +#include "constants.h" + +#include <library/cpp/uri/http_url.h> +#include <library/cpp/charset/ci_string.h> +#include 
<library/cpp/string_utils/url/url.h> +#include <util/system/maxlen.h> +#include <util/generic/yexception.h> +#include <util/generic/algorithm.h> + + +namespace { + +TBotIdSet ConvertBotIdSet(const TSet<ui32>& botIds) noexcept { + TBotIdSet result; + for (auto id : botIds) { + result.insert(id); + } + return result; +} + +} // namespace + +TRobotsTxtRulesIterator::TRobotsTxtRulesIterator(const char* begin, const char* end) + : Begin(begin) + , End(end) +{ +} + +void TRobotsTxtRulesIterator::Next() { + while (Begin < End && *Begin) + ++Begin; + while (Begin < End && !isalpha(*Begin)) + ++Begin; +} + +bool TRobotsTxtRulesIterator::HasRule() const { + return Begin < End; +} + +const char* TRobotsTxtRulesIterator::GetRule() const { + return Begin + 1; +} + +TString TRobotsTxtRulesIterator::GetInitialRule() const { + auto begin = Begin + 1; + TStringBuf rule(begin, strlen(begin)); + + switch (*Begin) { + case 'a': + case 'd': + return rule.EndsWith('*') ? TString(rule.Chop(1)) : TString::Join(rule, '$'); + default: + return TString(rule); + } +} + +EDirectiveType TRobotsTxtRulesIterator::GetRuleType() const { + return CharToDirType(*Begin); +} + +EDirectiveType TRobotsTxtRulesIterator::CharToDirType(char ch) { + switch (toupper(ch)) { + case 'A': + return ALLOW; + case 'C': + return CRAWL_DELAY; + case 'D': + return DISALLOW; + case 'H': + return HOST; + case 'P': + return CLEAN_PARAM; + case 'S': + return SITEMAP; + } + return UNKNOWN; +} + +TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase( + TBotIdSet supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : HandleErrors(false) + , SiteMaps() + , CleanParams() + , HostDirective("") + , Errors() + , AcceptedLines() + , CrossSectionAcceptedLines() + , BotIdToInfo(robotstxtcfg::max_botid) + , RobotsMaxSize(robotsMaxSize) + , MaxRulesNumber(maxRulesNumber) + , SaveDataForAnyBot(saveDataForAnyBot) + , SupportedBotIds(supportedBotIds) +{ + Y_ENSURE(!supportedBotIds.empty()); + + if 
(RobotsMaxSize <= 0) + RobotsMaxSize = robots_max; + if (MaxRulesNumber <= 0) + MaxRulesNumber = max_rules_count; + + ResetOptimized(); +} + +TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase( + const TSet<ui32>& supportedBotIds, + int robotsMaxSize, + int maxRulesNumber, + bool saveDataForAnyBot) + : TRobotsTxtRulesHandlerBase(ConvertBotIdSet(supportedBotIds), robotsMaxSize, maxRulesNumber, saveDataForAnyBot) +{} + +TRobotsTxtRulesHandlerBase::~TRobotsTxtRulesHandlerBase() = default; + +void TRobotsTxtRulesHandlerBase::CheckBotIdValidity(const ui32 botId) const { + if (botId >= robotstxtcfg::max_botid || !IsBotIdSupported(botId)) + ythrow yexception() << "robots.txt parser requested for invalid or unsupported botId = " << botId << Endl; + ; +} + +int TRobotsTxtRulesHandlerBase::GetCrawlDelay(const ui32 botId, bool* realInfo) const { + const auto id = GetMappedBotId(botId, false); + if (realInfo) + *realInfo = bool(id); + return BotIdToInfo[id.GetOrElse(robotstxtcfg::id_anybot)].CrawlDelay; +} + +int TRobotsTxtRulesHandlerBase::GetMinCrawlDelay(int defaultCrawlDelay) const { + int res = INT_MAX; + bool useDefault = false; + for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) { + if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsDisallowAll(botId)) { + bool realInfo; + int curCrawlDelay = GetCrawlDelay(botId, &realInfo); + if (realInfo) { + if (curCrawlDelay == -1) { + useDefault = true; + } else { + res = Min(res, curCrawlDelay); + } + } + } + } + + if (useDefault && defaultCrawlDelay < res) { + return -1; + } + + if (res == INT_MAX) { + res = GetCrawlDelay(robotstxtcfg::id_anybot); + } + + return res; +} + +void TRobotsTxtRulesHandlerBase::SetCrawlDelay(const ui32 botId, int crawlDelay) { + CheckBotIdValidity(botId); + BotIdToInfo[botId].CrawlDelay = crawlDelay; +} + +const TVector<TString> TRobotsTxtRulesHandlerBase::GetSiteMaps() const { + return TVector<TString>(SiteMaps.begin(), SiteMaps.end()); +} + +void 
TRobotsTxtRulesHandlerBase::AddSiteMap(const char* sitemap) { + SiteMaps.insert(sitemap); +} + +const TVector<TString> TRobotsTxtRulesHandlerBase::GetCleanParams() const { + return TVector<TString>(CleanParams.begin(), CleanParams.end()); +} + +void TRobotsTxtRulesHandlerBase::AddCleanParam(const char* cleanParam) { + CleanParams.insert(cleanParam); +} + +const TString& TRobotsTxtRulesHandlerBase::GetHostDirective() const { + return HostDirective; +} + +void TRobotsTxtRulesHandlerBase::SetHostDirective(const char* hostDirective) { + HostDirective = hostDirective; +} + +const TRobotsTxtRulesHandlerBase::TErrorVector& TRobotsTxtRulesHandlerBase::GetErrors() const { + return Errors; +} + +TVector<int> TRobotsTxtRulesHandlerBase::GetAcceptedLines(const ui32 botId) const { + TVector<int> ret; + for (size_t i = 0; i < CrossSectionAcceptedLines.size(); ++i) + ret.push_back(CrossSectionAcceptedLines[i]); + + bool hasLinesForBotId = false; + for (size_t i = 0; i < AcceptedLines.size(); ++i) { + if (AcceptedLines[i].first == botId) { + hasLinesForBotId = true; + break; + } + } + + for (size_t i = 0; i < AcceptedLines.size(); ++i) { + if (hasLinesForBotId && AcceptedLines[i].first == botId) { + ret.push_back(AcceptedLines[i].second); + } else if (!hasLinesForBotId && AcceptedLines[i].first == robotstxtcfg::id_anybot) { + ret.push_back(AcceptedLines[i].second); + } + } + + Sort(ret.begin(), ret.end()); + + return ret; +} + +void TRobotsTxtRulesHandlerBase::AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection) { + if (isCrossSection) { + CrossSectionAcceptedLines.push_back(line); + return; + } + + for (auto botId : botIds) { + AcceptedLines.push_back(TBotIdAcceptedLine(botId, line)); + } +} + +void TRobotsTxtRulesHandlerBase::SetErrorsHandling(bool handleErrors) { + HandleErrors = handleErrors; +} + +bool TRobotsTxtRulesHandlerBase::IsHandlingErrors() const { + return HandleErrors; +} + +EDirectiveType TRobotsTxtRulesHandlerBase::NameToDirType(const char* d) 
{ + if (!strcmp("disallow", d)) + return DISALLOW; + if (!strcmp("allow", d)) + return ALLOW; + if (!strcmp("user-agent", d)) + return USER_AGENT; + if (!strcmp("host", d)) + return HOST; + if (!strcmp("sitemap", d)) + return SITEMAP; + if (!strcmp("clean-param", d)) + return CLEAN_PARAM; + if (!strcmp("crawl-delay", d)) + return CRAWL_DELAY; + return UNKNOWN; +}; + +const char* TRobotsTxtRulesHandlerBase::DirTypeToName(EDirectiveType t) { + static const char* name[] = {"Allow", "Crawl-Delay", "Disallow", "Host", "Clean-Param", "Sitemap", "User-Agent", "Unknown"}; + switch (t) { + case ALLOW: + return name[0]; + case CRAWL_DELAY: + return name[1]; + case DISALLOW: + return name[2]; + case HOST: + return name[3]; + case CLEAN_PARAM: + return name[4]; + case SITEMAP: + return name[5]; + case USER_AGENT: + return name[6]; + case UNKNOWN: + return name[7]; + } + return name[7]; +}; + +bool TRobotsTxtRulesHandlerBase::CheckRobot( + const char* userAgent, + TBotIdSet& botIds, + const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength) const +{ + TCaseInsensitiveStringBuf agent(userAgent); + + for (size_t botIndex = 0; botIndex < robotstxtcfg::max_botid; ++botIndex) { + if (!IsBotIdSupported(botIndex)) + continue; + + bool hasRequiredAgentNamePrefix = agent.StartsWith(robotstxtcfg::GetReqPrefix(botIndex)); + bool isContainedInFullName = robotstxtcfg::GetFullName(botIndex).StartsWith(agent); + bool wasMoreImportantAgent = false; + if (botIdToMaxAppropriateUserAgentNameLength) + wasMoreImportantAgent = agent.size() < (*botIdToMaxAppropriateUserAgentNameLength)[botIndex]; + + if (hasRequiredAgentNamePrefix && isContainedInFullName && !wasMoreImportantAgent) { + botIds.insert(botIndex); + } + } + + return !botIds.empty(); +} + +int TRobotsTxtRulesHandlerBase::CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler) { + if (!rulesHandler->IsHandlingErrors()) + return 0; + + if (auto len = strlen(value); len > max_rule_length) { + 
rulesHandler->AddError(ERROR_RULE_HUGE, line); + } + + bool upper = false, suspect = false; + for (const char* r = value; *r; ++r) { + if (!upper && isupper(*r)) + upper = true; + if (!suspect && !isalnum(*r) && !strchr("/_?=.-*%&~[]:;@", *r) && (*(r + 1) || *r != '$')) + suspect = true; + } + if (suspect) + rulesHandler->AddError(WARNING_SUSPECT_SYMBOL, line); + if (upper) + rulesHandler->AddError(WARNING_UPPER_REGISTER, line); + return suspect || upper; +} + +void TRobotsTxtRulesHandlerBase::AddError(EFormatErrorType type, int line) { + if (!HandleErrors) + return; + Errors.push_back(std::make_pair(type, line)); +} + +void TRobotsTxtRulesHandlerBase::ResetOptimized() noexcept { + for (ui32 i = 0; i < OptimizedBotIdToStoredBotId.size(); ++i) { + OptimizedBotIdToStoredBotId[i] = i; // by default, every bot maps to itself + } +} + +void TRobotsTxtRulesHandlerBase::Clear() { + SiteMaps.clear(); + CleanParams.clear(); + HostDirective = ""; + if (HandleErrors) { + AcceptedLines.clear(); + CrossSectionAcceptedLines.clear(); + Errors.clear(); + } + + for (size_t botId = 0; botId < BotIdToInfo.size(); ++botId) { + BotIdToInfo[botId].CrawlDelay = -1; + } + + LoadedBotIds.clear(); +} + +void TRobotsTxtRulesHandlerBase::ClearInternal(const ui32 botId) { + CheckBotIdValidity(botId); + BotIdToInfo[botId].CrawlDelay = -1; + + TVector<TBotIdAcceptedLine> newAcceptedLines; + for (size_t i = 0; i < AcceptedLines.size(); ++i) + if (AcceptedLines[i].first != botId) + newAcceptedLines.push_back(AcceptedLines[i]); + + AcceptedLines.swap(newAcceptedLines); +} + +int TRobotsTxtRulesHandlerBase::CheckHost(const char* host) { + THttpURL parsed; + TString copyHost = host; + + if (GetHttpPrefixSize(copyHost) == 0) { + copyHost = TString("http://") + copyHost; + } + + return parsed.Parse(copyHost.data(), THttpURL::FeaturesRobot) == THttpURL::ParsedOK && parsed.GetField(THttpURL::FieldHost) != TString(""); +} + +int TRobotsTxtRulesHandlerBase::CheckSitemapUrl(const char* url, const char* 
host, TString& modifiedUrl) { + if (host != nullptr && strlen(url) > 0 && url[0] == '/') { + modifiedUrl = TString(host) + url; + } else { + modifiedUrl = url; + } + + url = modifiedUrl.data(); + + if (strlen(url) >= URL_MAX - 8) + return 0; + THttpURL parsed; + if (parsed.Parse(url, THttpURL::FeaturesRobot) || !parsed.IsValidAbs()) + return 0; + if (parsed.GetScheme() != THttpURL::SchemeHTTP && parsed.GetScheme() != THttpURL::SchemeHTTPS) + return 0; + return CheckHost(parsed.PrintS(THttpURL::FlagHostPort).data()); +} + +// s - is space separated pair of clean-params (separated by &) and path prefix +int TRobotsTxtRulesHandlerBase::CheckAndNormCleanParam(TString& value) { + if (value.find(' ') == TString::npos) { + value.push_back(' '); + } + + const char* s = value.data(); + if (!s || !*s || strlen(s) > URL_MAX / 2 - 9) + return 0; + const char* p = s; + while (*p && !isspace(*p)) + ++p; + for (; s != p; ++s) { + // allowed only following not alpha-numerical symbols + if (!isalnum(*s) && !strchr("+-=_&%[]{}():.", *s)) + return 0; + // clean-params for prefix can be enumerated by & symbol, && not allowed syntax + if (*s == '&' && *(s + 1) == '&') + return 0; + } + const char* pathPrefix = p + 1; + while (isspace(*p)) + ++p; + char r[URL_MAX]; + char* pr = r; + for (; *p; ++p) { + if (!isalnum(*p) && !strchr(".-/*_,;:%", *p)) + return 0; + if (*p == '*') + *pr++ = '.'; + if (*p == '.') + *pr++ = '\\'; + *pr++ = *p; + } + *pr++ = '.'; + *pr++ = '*'; + *pr = 0; + TString params = value.substr(0, pathPrefix - value.data()); + value = params + r; + return 1; +} + +int TRobotsTxtRulesHandlerBase::ParseCrawlDelay(const char* value, int& crawlDelay) { + static const int MAX_CRAWL_DELAY = 1 << 10; + int val = 0; + const char* p = value; + for (; isdigit(*p); ++p) { + val = val * 10 + *p - '0'; + if (val > MAX_CRAWL_DELAY) + return 0; + } + if (*p) { + if (*p++ != '.') + return 0; + if (strspn(p, "1234567890") != strlen(p)) + return 0; + } + for (const char* s = p; s - p < 
3; ++s) + val = val * 10 + (s < p + strlen(p) ? *s - '0' : 0); + crawlDelay = val; + return 1; +} + +bool TRobotsTxtRulesHandlerBase::AddRuleWithErrorCheck(const ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser) { + if (!IsBotIdSupported(botId)) + return true; + + if (!AddRule(botId, rule, type)) { + AddError(ERROR_ROBOTS_HUGE, parser.GetLineNumber()); + AfterParse(botId); + return false; + } + return true; +} + +int TRobotsTxtRulesHandlerBase::OnHost(const ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler) { + // Temporary hack for correct repacking robots.txt from new format to old + // Remove it, when robot-stable-2010-10-17 will be deployed in production + if (!IsBotIdSupported(botId)) + return 0; + // end of hack + + if (rulesHandler->HostDirective != "") + rulesHandler->AddError(ERROR_HOST_MULTI, parser.GetLineNumber()); + else { + if (!CheckHost(value)) + rulesHandler->AddError(ERROR_HOST_FORMAT, parser.GetLineNumber()); + else { + rulesHandler->SetHostDirective(value); + if (!rulesHandler->AddRuleWithErrorCheck(botId, value, 'H', parser)) + return 2; + } + } + return 0; +} + +bool TRobotsTxtRulesHandlerBase::IsBotIdLoaded(const ui32 botId) const { + return LoadedBotIds.contains(botId); +} + +bool TRobotsTxtRulesHandlerBase::IsBotIdSupported(const ui32 botId) const { + return (SaveDataForAnyBot && botId == robotstxtcfg::id_anybot) || SupportedBotIds.contains(botId); +} + +ui32 TRobotsTxtRulesHandlerBase::GetNotOptimizedBotId(const ui32 botId) const { + return (botId < OptimizedBotIdToStoredBotId.size()) + ? OptimizedBotIdToStoredBotId[botId] + : botId; +} + +TMaybe<ui32> TRobotsTxtRulesHandlerBase::GetMappedBotId(ui32 botId, bool useAny) const { + botId = GetNotOptimizedBotId(botId); + CheckBotIdValidity(botId); + if (IsBotIdLoaded(botId)) + return botId; + if (useAny) + return robotstxtcfg::id_anybot; + return {}; +} |