summaryrefslogtreecommitdiffstats
path: root/library/cpp/robots_txt/rules_handler.cpp
diff options
context:
space:
mode:
authorvvvv <[email protected]>2023-07-31 18:21:04 +0300
committervvvv <[email protected]>2023-07-31 18:21:04 +0300
commitdec41c40e51aa407edef81a3c566a5a15780fc49 (patch)
tree4f197b596b32f35eca368121f0dff913419da9af /library/cpp/robots_txt/rules_handler.cpp
parent3ca8b54c96e09eb2b65be7f09675623438d559c7 (diff)
YQL-16239 Move purecalc to public
Diffstat (limited to 'library/cpp/robots_txt/rules_handler.cpp')
-rw-r--r--library/cpp/robots_txt/rules_handler.cpp514
1 files changed, 514 insertions, 0 deletions
diff --git a/library/cpp/robots_txt/rules_handler.cpp b/library/cpp/robots_txt/rules_handler.cpp
new file mode 100644
index 00000000000..4297db9d218
--- /dev/null
+++ b/library/cpp/robots_txt/rules_handler.cpp
@@ -0,0 +1,514 @@
+#include "robots_txt.h"
+#include "constants.h"
+
+#include <library/cpp/uri/http_url.h>
+#include <library/cpp/charset/ci_string.h>
+#include <library/cpp/string_utils/url/url.h>
+#include <util/system/maxlen.h>
+#include <util/generic/yexception.h>
+#include <util/generic/algorithm.h>
+
+
+namespace {
+
+// Builds a TBotIdSet holding every id found in the given TSet<ui32>.
+TBotIdSet ConvertBotIdSet(const TSet<ui32>& botIds) noexcept {
+    TBotIdSet converted;
+    for (const ui32 botId : botIds) {
+        converted.insert(botId);
+    }
+    return converted;
+}
+
+} // namespace
+
+// Creates an iterator over the byte range [begin, end).
+// The pointers are stored as-is; nothing is copied.
+TRobotsTxtRulesIterator::TRobotsTxtRulesIterator(const char* begin, const char* end)
+ : Begin(begin)
+ , End(end)
+{
+}
+
+// Advances the iterator to the next stored entry.
+// First skips the remainder of the current entry (every byte up to a zero byte),
+// then skips every byte that is not a letter, so Begin ends up either on the next
+// alphabetic byte or at End.
+void TRobotsTxtRulesIterator::Next() {
+    while (Begin < End && *Begin != '\0') {
+        ++Begin;
+    }
+    // The <cctype> classifiers require a value representable as unsigned char (or EOF);
+    // passing a negative plain char is undefined behaviour, hence the cast.
+    while (Begin < End && !isalpha(static_cast<unsigned char>(*Begin))) {
+        ++Begin;
+    }
+}
+
+// Returns true while the iterator has not reached End.
+bool TRobotsTxtRulesIterator::HasRule() const {
+ return Begin < End;
+}
+
+// Returns a pointer to the byte right after the current position.
+// The byte at the current position itself is the one GetRuleType() interprets.
+const char* TRobotsTxtRulesIterator::GetRule() const {
+ return Begin + 1;
+}
+
+// Returns the text of the current entry (the zero-terminated string that starts
+// one byte after the current position), adjusted according to the tag byte at
+// the current position:
+//   - tag 'a' or 'd': a trailing '*' is removed; if there is none, '$' is appended;
+//   - any other tag: the text is returned unchanged.
+TString TRobotsTxtRulesIterator::GetInitialRule() const {
+    const char* text = Begin + 1;
+    TStringBuf rule(text, strlen(text));
+
+    const char tag = *Begin;
+    if (tag != 'a' && tag != 'd') {
+        return TString(rule);
+    }
+
+    if (rule.EndsWith('*')) {
+        return TString(rule.Chop(1));
+    }
+    return TString::Join(rule, '$');
+}
+
+// Returns the directive type encoded by the byte at the current position
+// (see CharToDirType for the mapping).
+EDirectiveType TRobotsTxtRulesIterator::GetRuleType() const {
+ return CharToDirType(*Begin);
+}
+
+// Maps a one-letter directive tag to its EDirectiveType, ignoring letter case:
+// A -> ALLOW, C -> CRAWL_DELAY, D -> DISALLOW, H -> HOST, P -> CLEAN_PARAM,
+// S -> SITEMAP. Any other character yields UNKNOWN.
+EDirectiveType TRobotsTxtRulesIterator::CharToDirType(char ch) {
+    // toupper() requires a value representable as unsigned char (or EOF);
+    // a negative plain char would be undefined behaviour, hence the cast.
+    switch (toupper(static_cast<unsigned char>(ch))) {
+        case 'A':
+            return ALLOW;
+        case 'C':
+            return CRAWL_DELAY;
+        case 'D':
+            return DISALLOW;
+        case 'H':
+            return HOST;
+        case 'P':
+            return CLEAN_PARAM;
+        case 'S':
+            return SITEMAP;
+    }
+    return UNKNOWN;
+}
+
+// Constructs a handler for the given set of supported bot ids.
+//
+// supportedBotIds   - ids this handler works with; must not be empty (checked with Y_ENSURE).
+// robotsMaxSize     - size limit; a non-positive value is replaced by robots_max.
+// maxRulesNumber    - rule count limit; a non-positive value is replaced by max_rules_count.
+// saveDataForAnyBot - stored as-is; IsBotIdSupported() treats id_anybot as supported when it is set.
+//
+// Error handling starts disabled, the host directive starts empty, and BotIdToInfo
+// is created with robotstxtcfg::max_botid entries.
+TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase(
+ TBotIdSet supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : HandleErrors(false)
+ , SiteMaps()
+ , CleanParams()
+ , HostDirective("")
+ , Errors()
+ , AcceptedLines()
+ , CrossSectionAcceptedLines()
+ , BotIdToInfo(robotstxtcfg::max_botid)
+ , RobotsMaxSize(robotsMaxSize)
+ , MaxRulesNumber(maxRulesNumber)
+ , SaveDataForAnyBot(saveDataForAnyBot)
+ , SupportedBotIds(supportedBotIds)
+{
+ Y_ENSURE(!supportedBotIds.empty());
+
+ // Non-positive limits mean "use the built-in default".
+ if (RobotsMaxSize <= 0)
+ RobotsMaxSize = robots_max;
+ if (MaxRulesNumber <= 0)
+ MaxRulesNumber = max_rules_count;
+
+ // Start with the identity mapping of bot ids.
+ ResetOptimized();
+}
+
+// Convenience overload: converts the TSet<ui32> of bot ids to a TBotIdSet and
+// delegates to the constructor taking a TBotIdSet.
+TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(ConvertBotIdSet(supportedBotIds), robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+// Defaulted destructor, defined out of line in this translation unit.
+TRobotsTxtRulesHandlerBase::~TRobotsTxtRulesHandlerBase() = default;
+
+// Throws yexception unless botId is below robotstxtcfg::max_botid and is
+// supported by this handler (see IsBotIdSupported).
+void TRobotsTxtRulesHandlerBase::CheckBotIdValidity(const ui32 botId) const {
+    const bool isUsable = botId < robotstxtcfg::max_botid && IsBotIdSupported(botId);
+    if (!isUsable) {
+        ythrow yexception() << "robots.txt parser requested for invalid or unsupported botId = " << botId << Endl;
+    }
+}
+
+// Returns the crawl delay stored for botId, or the one stored for
+// robotstxtcfg::id_anybot when GetMappedBotId() yields no id for botId.
+//
+// realInfo - optional out-parameter; when non-null it is set to true if the
+//            returned value belongs to botId itself and to false if the
+//            id_anybot entry was used instead.
+int TRobotsTxtRulesHandlerBase::GetCrawlDelay(const ui32 botId, bool* realInfo) const {
+    const auto mappedId = GetMappedBotId(botId, false);
+    if (realInfo != nullptr) {
+        *realInfo = static_cast<bool>(mappedId);
+    }
+    const auto infoIndex = mappedId.GetOrElse(robotstxtcfg::id_anybot);
+    return BotIdToInfo[infoIndex].CrawlDelay;
+}
+
+// Computes the smallest crawl delay over all bot ids that are Yandex bots,
+// supported by this handler, not fully disallowed, and have their own
+// (non-fallback) crawl-delay information.
+//
+// A stored delay of -1 is not compared; it only records that some bot relies on
+// the default. If such a bot exists and defaultCrawlDelay is smaller than the
+// minimum found, -1 is returned. If no explicit delay was found at all, the
+// delay of robotstxtcfg::id_anybot is returned.
+int TRobotsTxtRulesHandlerBase::GetMinCrawlDelay(int defaultCrawlDelay) const {
+    int minDelay = INT_MAX;
+    bool someBotUsesDefault = false;
+
+    for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+        if (!robotstxtcfg::IsYandexBotId(botId) || !IsBotIdSupported(botId) || IsDisallowAll(botId)) {
+            continue;
+        }
+
+        bool realInfo;
+        const int delay = GetCrawlDelay(botId, &realInfo);
+        if (!realInfo) {
+            continue;
+        }
+
+        if (delay == -1) {
+            someBotUsesDefault = true;
+        } else {
+            minDelay = Min(minDelay, delay);
+        }
+    }
+
+    if (someBotUsesDefault && defaultCrawlDelay < minDelay) {
+        return -1;
+    }
+
+    return minDelay == INT_MAX ? GetCrawlDelay(robotstxtcfg::id_anybot) : minDelay;
+}
+
+// Stores crawlDelay for botId. CheckBotIdValidity() is called first and throws
+// for an invalid or unsupported botId.
+void TRobotsTxtRulesHandlerBase::SetCrawlDelay(const ui32 botId, int crawlDelay) {
+ CheckBotIdValidity(botId);
+ BotIdToInfo[botId].CrawlDelay = crawlDelay;
+}
+
+// Returns a copy of all collected sitemap entries as a vector, in the iteration
+// order of the SiteMaps container.
+const TVector<TString> TRobotsTxtRulesHandlerBase::GetSiteMaps() const {
+    TVector<TString> sitemaps(SiteMaps.begin(), SiteMaps.end());
+    return sitemaps;
+}
+
+// Inserts the given sitemap string into the SiteMaps container.
+void TRobotsTxtRulesHandlerBase::AddSiteMap(const char* sitemap) {
+ SiteMaps.insert(sitemap);
+}
+
+// Returns a copy of all collected clean-param entries as a vector, in the
+// iteration order of the CleanParams container.
+const TVector<TString> TRobotsTxtRulesHandlerBase::GetCleanParams() const {
+    TVector<TString> cleanParams(CleanParams.begin(), CleanParams.end());
+    return cleanParams;
+}
+
+// Inserts the given clean-param string into the CleanParams container.
+void TRobotsTxtRulesHandlerBase::AddCleanParam(const char* cleanParam) {
+ CleanParams.insert(cleanParam);
+}
+
+// Returns the stored host directive; it is an empty string when none has been set.
+const TString& TRobotsTxtRulesHandlerBase::GetHostDirective() const {
+ return HostDirective;
+}
+
+// Replaces the stored host directive with the given string (no validation here).
+void TRobotsTxtRulesHandlerBase::SetHostDirective(const char* hostDirective) {
+ HostDirective = hostDirective;
+}
+
+// Returns the list of (error type, line) records collected by AddError().
+const TRobotsTxtRulesHandlerBase::TErrorVector& TRobotsTxtRulesHandlerBase::GetErrors() const {
+ return Errors;
+}
+
+// Returns the sorted list of accepted line numbers relevant to botId.
+//
+// The result always contains every cross-section line. In addition it contains
+// the lines recorded for botId itself; if no line was recorded for botId, the
+// lines recorded for robotstxtcfg::id_anybot are used instead.
+TVector<int> TRobotsTxtRulesHandlerBase::GetAcceptedLines(const ui32 botId) const {
+    TVector<int> lines;
+    for (const auto& crossSectionLine : CrossSectionAcceptedLines) {
+        lines.push_back(crossSectionLine);
+    }
+
+    // Find out whether botId has any lines of its own.
+    bool hasOwnLines = false;
+    for (const auto& entry : AcceptedLines) {
+        if (entry.first == botId) {
+            hasOwnLines = true;
+            break;
+        }
+    }
+
+    for (const auto& entry : AcceptedLines) {
+        const bool isRelevant = hasOwnLines
+            ? entry.first == botId
+            : entry.first == robotstxtcfg::id_anybot;
+        if (isRelevant) {
+            lines.push_back(entry.second);
+        }
+    }
+
+    Sort(lines.begin(), lines.end());
+    return lines;
+}
+
+// Records an accepted line number.
+// A cross-section line is stored once in CrossSectionAcceptedLines; any other
+// line is stored once per bot id in botIds as a (botId, line) pair.
+void TRobotsTxtRulesHandlerBase::AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection) {
+    if (!isCrossSection) {
+        for (auto botId : botIds) {
+            AcceptedLines.push_back(TBotIdAcceptedLine(botId, line));
+        }
+        return;
+    }
+
+    CrossSectionAcceptedLines.push_back(line);
+}
+
+// Enables or disables error collection (see AddError and CheckRule).
+void TRobotsTxtRulesHandlerBase::SetErrorsHandling(bool handleErrors) {
+ HandleErrors = handleErrors;
+}
+
+// Returns whether error collection is currently enabled.
+bool TRobotsTxtRulesHandlerBase::IsHandlingErrors() const {
+ return HandleErrors;
+}
+
+// Maps a directive name to its EDirectiveType.
+// The comparison is exact (strcmp), so the name must be given in lower case;
+// anything not in the table yields UNKNOWN.
+EDirectiveType TRobotsTxtRulesHandlerBase::NameToDirType(const char* d) {
+    struct TNamedDirective {
+        const char* Name;
+        EDirectiveType Type;
+    };
+    static const TNamedDirective knownDirectives[] = {
+        {"disallow", DISALLOW},
+        {"allow", ALLOW},
+        {"user-agent", USER_AGENT},
+        {"host", HOST},
+        {"sitemap", SITEMAP},
+        {"clean-param", CLEAN_PARAM},
+        {"crawl-delay", CRAWL_DELAY},
+    };
+
+    for (const auto& directive : knownDirectives) {
+        if (strcmp(directive.Name, d) == 0) {
+            return directive.Type;
+        }
+    }
+    return UNKNOWN;
+}
+
+// Returns the canonical spelling of a directive type.
+// Values not covered by the switch are reported as "Unknown".
+const char* TRobotsTxtRulesHandlerBase::DirTypeToName(EDirectiveType t) {
+    switch (t) {
+        case ALLOW:
+            return "Allow";
+        case CRAWL_DELAY:
+            return "Crawl-Delay";
+        case DISALLOW:
+            return "Disallow";
+        case HOST:
+            return "Host";
+        case CLEAN_PARAM:
+            return "Clean-Param";
+        case SITEMAP:
+            return "Sitemap";
+        case USER_AGENT:
+            return "User-Agent";
+        case UNKNOWN:
+            break;
+    }
+    return "Unknown";
+}
+
+// Collects into botIds every supported bot id that the given User-agent value
+// applies to, comparing case-insensitively.
+//
+// A bot id matches when all of the following hold:
+//   - userAgent starts with the bot's required prefix (robotstxtcfg::GetReqPrefix);
+//   - the bot's full name (robotstxtcfg::GetFullName) starts with userAgent;
+//   - if botIdToMaxAppropriateUserAgentNameLength is given, userAgent is not
+//     shorter than the length recorded there for this bot id.
+//
+// botIds is only added to, never cleared. Returns true if botIds is non-empty
+// afterwards.
+bool TRobotsTxtRulesHandlerBase::CheckRobot(
+    const char* userAgent,
+    TBotIdSet& botIds,
+    const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength) const
+{
+    TCaseInsensitiveStringBuf agent(userAgent);
+
+    for (size_t botIndex = 0; botIndex < robotstxtcfg::max_botid; ++botIndex) {
+        if (!IsBotIdSupported(botIndex)) {
+            continue;
+        }
+
+        const bool startsWithRequiredPrefix = agent.StartsWith(robotstxtcfg::GetReqPrefix(botIndex));
+        const bool isPrefixOfFullName = robotstxtcfg::GetFullName(botIndex).StartsWith(agent);
+        const bool shadowedByLongerAgent =
+            botIdToMaxAppropriateUserAgentNameLength != nullptr &&
+            agent.size() < (*botIdToMaxAppropriateUserAgentNameLength)[botIndex];
+
+        if (startsWithRequiredPrefix && isPrefixOfFullName && !shadowedByLongerAgent) {
+            botIds.insert(botIndex);
+        }
+    }
+
+    return !botIds.empty();
+}
+
+// Runs diagnostic checks on a rule value and reports findings to rulesHandler.
+//
+// Does nothing and returns 0 when rulesHandler is not collecting errors.
+// Otherwise reports:
+//   - ERROR_RULE_HUGE         if value is longer than max_rule_length;
+//   - WARNING_SUSPECT_SYMBOL  if value contains a byte that is neither
+//                             alphanumeric nor one of "/_?=.-*%&~[]:;@"
+//                             (a '$' as the very last byte is tolerated);
+//   - WARNING_UPPER_REGISTER  if value contains an upper-case letter.
+//
+// Returns non-zero iff a suspect symbol or an upper-case letter was found; the
+// length check does not influence the return value.
+int TRobotsTxtRulesHandlerBase::CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler) {
+    if (!rulesHandler->IsHandlingErrors())
+        return 0;
+
+    if (strlen(value) > max_rule_length) {
+        rulesHandler->AddError(ERROR_RULE_HUGE, line);
+    }
+
+    bool upper = false, suspect = false;
+    for (const char* r = value; *r; ++r) {
+        // robots.txt content is untrusted and may contain bytes >= 0x80. The <cctype>
+        // classifiers are undefined for negative plain char values, so classify the
+        // byte as unsigned char.
+        const unsigned char ch = static_cast<unsigned char>(*r);
+        if (!upper && isupper(ch))
+            upper = true;
+        if (!suspect) {
+            const bool isTrailingDollar = *r == '$' && *(r + 1) == '\0';
+            if (!isalnum(ch) && !strchr("/_?=.-*%&~[]:;@", *r) && !isTrailingDollar)
+                suspect = true;
+        }
+    }
+    if (suspect)
+        rulesHandler->AddError(WARNING_SUSPECT_SYMBOL, line);
+    if (upper)
+        rulesHandler->AddError(WARNING_UPPER_REGISTER, line);
+    return suspect || upper;
+}
+
+void TRobotsTxtRulesHandlerBase::AddError(EFormatErrorType type, int line) {
+ if (!HandleErrors)
+ return;
+ Errors.push_back(std::make_pair(type, line));
+}
+
+// Resets OptimizedBotIdToStoredBotId to the identity mapping, i.e. every bot id
+// maps to itself.
+void TRobotsTxtRulesHandlerBase::ResetOptimized() noexcept {
+    const auto mappingSize = OptimizedBotIdToStoredBotId.size();
+    for (ui32 botId = 0; botId < mappingSize; ++botId) {
+        OptimizedBotIdToStoredBotId[botId] = botId;
+    }
+}
+
+// Drops everything collected so far: sitemaps, clean-params, the host
+// directive, the set of loaded bot ids, and every bot's crawl delay (reset to -1).
+// Accepted lines and errors are cleared only while error collection is enabled.
+void TRobotsTxtRulesHandlerBase::Clear() {
+    SiteMaps.clear();
+    CleanParams.clear();
+    HostDirective = "";
+
+    if (HandleErrors) {
+        AcceptedLines.clear();
+        CrossSectionAcceptedLines.clear();
+        Errors.clear();
+    }
+
+    for (auto& botInfo : BotIdToInfo) {
+        botInfo.CrawlDelay = -1;
+    }
+
+    LoadedBotIds.clear();
+}
+
+// Clears per-bot state kept in this base class for botId: resets its crawl
+// delay to -1 and removes its entries from AcceptedLines.
+// Throws (via CheckBotIdValidity) for an invalid or unsupported botId.
+void TRobotsTxtRulesHandlerBase::ClearInternal(const ui32 botId) {
+    CheckBotIdValidity(botId);
+    BotIdToInfo[botId].CrawlDelay = -1;
+
+    // Rebuild the list without the entries of botId, then swap it in.
+    TVector<TBotIdAcceptedLine> keptLines;
+    for (const auto& entry : AcceptedLines) {
+        if (entry.first != botId) {
+            keptLines.push_back(entry);
+        }
+    }
+
+    AcceptedLines.swap(keptLines);
+}
+
+// Checks whether host is acceptable as a host value.
+// "http://" is prepended when GetHttpPrefixSize() finds no HTTP prefix; the
+// result must then parse successfully as a URL (THttpURL::FeaturesRobot) and
+// have a non-empty host field. Returns 1 if so, 0 otherwise.
+int TRobotsTxtRulesHandlerBase::CheckHost(const char* host) {
+    TString candidate = host;
+    if (GetHttpPrefixSize(candidate) == 0) {
+        candidate = TString("http://") + candidate;
+    }
+
+    THttpURL parsed;
+    if (parsed.Parse(candidate.data(), THttpURL::FeaturesRobot) != THttpURL::ParsedOK) {
+        return 0;
+    }
+    return parsed.GetField(THttpURL::FieldHost) != TString("");
+}
+
+// Validates a sitemap URL.
+//
+// url         - URL as written in robots.txt.
+// host        - optional; when non-null and url starts with '/', it is prepended
+//               to url to form the URL that is checked.
+// modifiedUrl - receives the URL that was actually checked (set even on failure).
+//
+// Returns 0 when the URL is too long (URL_MAX - 8 bytes or more), fails to
+// parse, is not a valid absolute URL, or has a scheme other than http/https.
+// Otherwise returns the result of CheckHost() on its host:port part.
+int TRobotsTxtRulesHandlerBase::CheckSitemapUrl(const char* url, const char* host, TString& modifiedUrl) {
+    const bool isHostRelative = host != nullptr && strlen(url) > 0 && url[0] == '/';
+    if (!isHostRelative) {
+        modifiedUrl = url;
+    } else {
+        modifiedUrl = TString(host) + url;
+    }
+
+    const char* fullUrl = modifiedUrl.data();
+    if (strlen(fullUrl) >= URL_MAX - 8) {
+        return 0;
+    }
+
+    THttpURL parsed;
+    if (parsed.Parse(fullUrl, THttpURL::FeaturesRobot)) {
+        return 0;
+    }
+    if (!parsed.IsValidAbs()) {
+        return 0;
+    }
+
+    const auto scheme = parsed.GetScheme();
+    if (scheme != THttpURL::SchemeHTTP && scheme != THttpURL::SchemeHTTPS) {
+        return 0;
+    }
+
+    return CheckHost(parsed.PrintS(THttpURL::FlagHostPort).data());
+}
+
+// Validates a Clean-param value and rewrites it in place into normalized form.
+//
+// value is a whitespace-separated pair: the parameter list (names joined by '&')
+// followed by a path prefix. If value contains no ' ', one is appended, so the
+// path prefix may be empty.
+//
+// On success returns 1 and value becomes the parameter list plus the single
+// whitespace character that follows it, followed by a pattern built from the
+// path prefix: '*' becomes ".*", '.' becomes "\.", and ".*" is appended.
+//
+// Returns 0 (value may already have had a space appended) when:
+//   - value starts with a zero byte or is longer than URL_MAX / 2 - 9;
+//   - the parameter list contains a byte that is neither alphanumeric nor one of
+//     "+-=_&%[]{}():." , or contains "&&";
+//   - the path prefix contains a byte that is neither alphanumeric nor one of
+//     ".-/*_,;:%".
+int TRobotsTxtRulesHandlerBase::CheckAndNormCleanParam(TString& value) {
+    if (value.find(' ') == TString::npos) {
+        value.push_back(' ');
+    }
+
+    const char* s = value.data();
+    if (!s || !*s || strlen(s) > URL_MAX / 2 - 9)
+        return 0;
+
+    // The value comes from untrusted robots.txt text and may contain bytes >= 0x80.
+    // The <cctype> classifiers are undefined for negative plain char values, so every
+    // byte is passed to them as unsigned char.
+    const char* p = s;
+    while (*p && !isspace(static_cast<unsigned char>(*p)))
+        ++p;
+    for (; s != p; ++s) {
+        // allowed only following not alpha-numerical symbols
+        if (!isalnum(static_cast<unsigned char>(*s)) && !strchr("+-=_&%[]{}():.", *s))
+            return 0;
+        // clean-params for prefix can be enumerated by & symbol, && not allowed syntax
+        if (*s == '&' && *(s + 1) == '&')
+            return 0;
+    }
+    // Everything before pathPrefix (the parameters and one separator) is kept verbatim.
+    const char* pathPrefix = p + 1;
+    while (isspace(static_cast<unsigned char>(*p)))
+        ++p;
+    // Each input byte produces at most two output bytes and the input is limited to
+    // URL_MAX / 2 - 9 bytes, so the pattern plus the ".*" suffix and terminator fits.
+    char r[URL_MAX];
+    char* pr = r;
+    for (; *p; ++p) {
+        if (!isalnum(static_cast<unsigned char>(*p)) && !strchr(".-/*_,;:%", *p))
+            return 0;
+        if (*p == '*')
+            *pr++ = '.';
+        if (*p == '.')
+            *pr++ = '\\';
+        *pr++ = *p;
+    }
+    *pr++ = '.';
+    *pr++ = '*';
+    *pr = 0;
+    TString params = value.substr(0, pathPrefix - value.data());
+    value = params + r;
+    return 1;
+}
+
+// Parses a non-negative decimal number with an optional fractional part
+// ("N" or "N.F") into a fixed-point integer with three fractional digits.
+//
+// value      - zero-terminated text to parse.
+// crawlDelay - receives the integer part times 1000 plus the first three
+//              fractional digits (missing digits count as 0, extra digits are
+//              dropped). Written only on success.
+//
+// Returns 1 on success. Returns 0 when the integer part exceeds 1024, when the
+// digits are followed by anything other than '.', or when the text after '.'
+// contains a non-digit.
+int TRobotsTxtRulesHandlerBase::ParseCrawlDelay(const char* value, int& crawlDelay) {
+    static const int MAX_CRAWL_DELAY = 1 << 10;
+    int val = 0;
+    const char* p = value;
+    // isdigit() is undefined for negative plain char values, hence the cast.
+    for (; isdigit(static_cast<unsigned char>(*p)); ++p) {
+        val = val * 10 + (*p - '0');
+        if (val > MAX_CRAWL_DELAY)
+            return 0;
+    }
+    if (*p) {
+        if (*p++ != '.')
+            return 0;
+        if (strspn(p, "1234567890") != strlen(p))
+            return 0;
+    }
+    // p now points at the fractional digits (possibly none); take exactly three,
+    // padding with zeros.
+    const size_t fractionLength = strlen(p);
+    for (size_t i = 0; i < 3; ++i)
+        val = val * 10 + (i < fractionLength ? p[i] - '0' : 0);
+    crawlDelay = val;
+    return 1;
+}
+
+// Adds a rule for botId and reports a failure to do so.
+// Unsupported bot ids are skipped and count as success. If AddRule() fails,
+// ERROR_ROBOTS_HUGE is recorded at the parser's current line, AfterParse(botId)
+// is called, and false is returned.
+bool TRobotsTxtRulesHandlerBase::AddRuleWithErrorCheck(const ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser) {
+    if (!IsBotIdSupported(botId) || AddRule(botId, rule, type)) {
+        return true;
+    }
+
+    AddError(ERROR_ROBOTS_HUGE, parser.GetLineNumber());
+    AfterParse(botId);
+    return false;
+}
+
+// Handles a Host directive with the given value for botId.
+//
+// Nothing happens for unsupported bot ids. Otherwise:
+//   - if rulesHandler already has a host directive, ERROR_HOST_MULTI is recorded;
+//   - if value fails CheckHost(), ERROR_HOST_FORMAT is recorded;
+//   - otherwise value becomes the host directive and is added as an 'H' rule.
+//
+// Returns 2 if adding the rule failed, 0 in every other case.
+int TRobotsTxtRulesHandlerBase::OnHost(const ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler) {
+    // Temporary hack for correct repacking robots.txt from new format to old
+    // Remove it, when robot-stable-2010-10-17 will be deployed in production
+    if (!IsBotIdSupported(botId)) {
+        return 0;
+    }
+    // end of hack
+
+    if (rulesHandler->HostDirective != "") {
+        rulesHandler->AddError(ERROR_HOST_MULTI, parser.GetLineNumber());
+        return 0;
+    }
+
+    if (!CheckHost(value)) {
+        rulesHandler->AddError(ERROR_HOST_FORMAT, parser.GetLineNumber());
+        return 0;
+    }
+
+    rulesHandler->SetHostDirective(value);
+    const bool ruleAdded = rulesHandler->AddRuleWithErrorCheck(botId, value, 'H', parser);
+    return ruleAdded ? 0 : 2;
+}
+
+// Returns true if botId is present in LoadedBotIds.
+bool TRobotsTxtRulesHandlerBase::IsBotIdLoaded(const ui32 botId) const {
+ return LoadedBotIds.contains(botId);
+}
+
+// Returns true if botId is in SupportedBotIds, or if it is
+// robotstxtcfg::id_anybot and SaveDataForAnyBot is set.
+bool TRobotsTxtRulesHandlerBase::IsBotIdSupported(const ui32 botId) const {
+    if (SaveDataForAnyBot && botId == robotstxtcfg::id_anybot) {
+        return true;
+    }
+    return SupportedBotIds.contains(botId);
+}
+
+// Translates botId through OptimizedBotIdToStoredBotId.
+// Ids outside the mapping table are returned unchanged.
+ui32 TRobotsTxtRulesHandlerBase::GetNotOptimizedBotId(const ui32 botId) const {
+    if (botId < OptimizedBotIdToStoredBotId.size()) {
+        return OptimizedBotIdToStoredBotId[botId];
+    }
+    return botId;
+}
+
+// Resolves botId to the id whose data should be used.
+//
+// botId is first translated with GetNotOptimizedBotId() and validated
+// (CheckBotIdValidity throws for an invalid or unsupported id). If the
+// translated id is loaded, it is returned. Otherwise the result is
+// robotstxtcfg::id_anybot when useAny is true, and an empty TMaybe when it is
+// false.
+TMaybe<ui32> TRobotsTxtRulesHandlerBase::GetMappedBotId(ui32 botId, bool useAny) const {
+    const ui32 storedBotId = GetNotOptimizedBotId(botId);
+    CheckBotIdValidity(storedBotId);
+
+    if (IsBotIdLoaded(storedBotId)) {
+        return storedBotId;
+    }
+    if (!useAny) {
+        return {};
+    }
+    return robotstxtcfg::id_anybot;
+}