diff options
author | AlexSm <alex@ydb.tech> | 2024-03-05 10:40:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-05 12:40:59 +0300 |
commit | 1ac13c847b5358faba44dbb638a828e24369467b (patch) | |
tree | 07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Lib/urllib/robotparser.py | |
parent | ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff) | |
download | ydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz |
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com>
Co-authored-by: deshevoy <deshevoy@yandex-team.com>
Co-authored-by: robot-contrib <robot-contrib@yandex-team.com>
Co-authored-by: thegeorg <thegeorg@yandex-team.com>
Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com>
Co-authored-by: svidyuk <svidyuk@yandex-team.com>
Co-authored-by: shadchin <shadchin@yandex-team.com>
Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com>
Co-authored-by: innokentii <innokentii@yandex-team.com>
Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com>
Co-authored-by: snermolaev <snermolaev@yandex-team.com>
Co-authored-by: dimdim11 <dimdim11@yandex-team.com>
Co-authored-by: kickbutt <kickbutt@yandex-team.com>
Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com>
Co-authored-by: korsunandrei <korsunandrei@yandex-team.com>
Co-authored-by: petrk <petrk@yandex-team.com>
Co-authored-by: miroslav2 <miroslav2@yandex-team.com>
Co-authored-by: serjflint <serjflint@yandex-team.com>
Co-authored-by: akhropov <akhropov@yandex-team.com>
Co-authored-by: prettyboy <prettyboy@yandex-team.com>
Co-authored-by: ilikepugs <ilikepugs@yandex-team.com>
Co-authored-by: hiddenpath <hiddenpath@yandex-team.com>
Co-authored-by: mikhnenko <mikhnenko@yandex-team.com>
Co-authored-by: spreis <spreis@yandex-team.com>
Co-authored-by: andreyshspb <andreyshspb@yandex-team.com>
Co-authored-by: dimaandreev <dimaandreev@yandex-team.com>
Co-authored-by: rashid <rashid@yandex-team.com>
Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com>
Co-authored-by: r-vetrov <r-vetrov@yandex-team.com>
Co-authored-by: ypodlesov <ypodlesov@yandex-team.com>
Co-authored-by: zaverden <zaverden@yandex-team.com>
Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com>
Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com>
Co-authored-by: v-korovin <v-korovin@yandex-team.com>
Co-authored-by: arikon <arikon@yandex-team.com>
Co-authored-by: khoden <khoden@yandex-team.com>
Co-authored-by: psydmm <psydmm@yandex-team.com>
Co-authored-by: robot-javacom <robot-javacom@yandex-team.com>
Co-authored-by: dtorilov <dtorilov@yandex-team.com>
Co-authored-by: sennikovmv <sennikovmv@yandex-team.com>
Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Lib/urllib/robotparser.py')
-rw-r--r-- | contrib/tools/python3/Lib/urllib/robotparser.py | 273 |
1 files changed, 273 insertions, 0 deletions
diff --git a/contrib/tools/python3/Lib/urllib/robotparser.py b/contrib/tools/python3/Lib/urllib/robotparser.py new file mode 100644 index 0000000000..c58565e394 --- /dev/null +++ b/contrib/tools/python3/Lib/urllib/robotparser.py @@ -0,0 +1,273 @@ +""" robotparser.py + + Copyright (C) 2000 Bastian Kleineidam + + You can choose between two licenses when using this package: + 1) GNU GPLv2 + 2) PSF license for Python 2.2 + + The robots.txt Exclusion Protocol is implemented as specified in + http://www.robotstxt.org/norobots-rfc.txt +""" + +import collections +import urllib.parse +import urllib.request + +__all__ = ["RobotFileParser"] + +RequestRate = collections.namedtuple("RequestRate", "requests seconds") + + +class RobotFileParser: + """ This class provides a set of methods to read, parse and answer + questions about a single robots.txt file. + + """ + + def __init__(self, url=''): + self.entries = [] + self.sitemaps = [] + self.default_entry = None + self.disallow_all = False + self.allow_all = False + self.set_url(url) + self.last_checked = 0 + + def mtime(self): + """Returns the time the robots.txt file was last fetched. + + This is useful for long-running web spiders that need to + check for new robots.txt files periodically. + + """ + return self.last_checked + + def modified(self): + """Sets the time the robots.txt file was last fetched to the + current time. + + """ + import time + self.last_checked = time.time() + + def set_url(self, url): + """Sets the URL referring to a robots.txt file.""" + self.url = url + self.host, self.path = urllib.parse.urlparse(url)[1:3] + + def read(self): + """Reads the robots.txt URL and feeds it to the parser.""" + try: + f = urllib.request.urlopen(self.url) + except urllib.error.HTTPError as err: + if err.code in (401, 403): + self.disallow_all = True + elif err.code >= 400 and err.code < 500: + self.allow_all = True + else: + raw = f.read() + self.parse(raw.decode("utf-8").splitlines()) + + def _add_entry(self, entry): + if "*" in entry.useragents: + # the default entry is considered last + if self.default_entry is None: + # the first default entry wins + self.default_entry = entry + else: + self.entries.append(entry) + + def parse(self, lines): + """Parse the input lines from a robots.txt file. + + We allow that a user-agent: line is not preceded by + one or more blank lines. + """ + # states: + # 0: start state + # 1: saw user-agent line + # 2: saw an allow or disallow line + state = 0 + entry = Entry() + + self.modified() + for line in lines: + if not line: + if state == 1: + entry = Entry() + state = 0 + elif state == 2: + self._add_entry(entry) + entry = Entry() + state = 0 + # remove optional comment and strip line + i = line.find('#') + if i >= 0: + line = line[:i] + line = line.strip() + if not line: + continue + line = line.split(':', 1) + if len(line) == 2: + line[0] = line[0].strip().lower() + line[1] = urllib.parse.unquote(line[1].strip()) + if line[0] == "user-agent": + if state == 2: + self._add_entry(entry) + entry = Entry() + entry.useragents.append(line[1]) + state = 1 + elif line[0] == "disallow": + if state != 0: + entry.rulelines.append(RuleLine(line[1], False)) + state = 2 + elif line[0] == "allow": + if state != 0: + entry.rulelines.append(RuleLine(line[1], True)) + state = 2 + elif line[0] == "crawl-delay": + if state != 0: + # before trying to convert to int we need to make + # sure that robots.txt has valid syntax otherwise + # it will crash + if line[1].strip().isdigit(): + entry.delay = int(line[1]) + state = 2 + elif line[0] == "request-rate": + if state != 0: + numbers = line[1].split('/') + # check if all values are sane + if (len(numbers) == 2 and numbers[0].strip().isdigit() + and numbers[1].strip().isdigit()): + entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1])) + state = 2 + elif line[0] == "sitemap": + # According to http://www.sitemaps.org/protocol.html + # "This directive is independent of the user-agent line, + # so it doesn't matter where you place it in your file." + # Therefore we do not change the state of the parser. + self.sitemaps.append(line[1]) + if state == 2: + self._add_entry(entry) + + def can_fetch(self, useragent, url): + """using the parsed robots.txt decide if useragent can fetch url""" + if self.disallow_all: + return False + if self.allow_all: + return True + # Until the robots.txt file has been read or found not + # to exist, we must assume that no url is allowable. + # This prevents false positives when a user erroneously + # calls can_fetch() before calling read(). + if not self.last_checked: + return False + # search for given user agent matches + # the first match counts + parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url)) + url = urllib.parse.urlunparse(('','',parsed_url.path, + parsed_url.params,parsed_url.query, parsed_url.fragment)) + url = urllib.parse.quote(url) + if not url: + url = "/" + for entry in self.entries: + if entry.applies_to(useragent): + return entry.allowance(url) + # try the default entry last + if self.default_entry: + return self.default_entry.allowance(url) + # agent not found ==> access granted + return True + + def crawl_delay(self, useragent): + if not self.mtime(): + return None + for entry in self.entries: + if entry.applies_to(useragent): + return entry.delay + if self.default_entry: + return self.default_entry.delay + return None + + def request_rate(self, useragent): + if not self.mtime(): + return None + for entry in self.entries: + if entry.applies_to(useragent): + return entry.req_rate + if self.default_entry: + return self.default_entry.req_rate + return None + + def site_maps(self): + if not self.sitemaps: + return None + return self.sitemaps + + def __str__(self): + entries = self.entries + if self.default_entry is not None: + entries = entries + [self.default_entry] + return '\n\n'.join(map(str, entries)) + + +class RuleLine: + """A rule line is a single "Allow:" (allowance==True) or "Disallow:" + (allowance==False) followed by a path.""" + def __init__(self, path, allowance): + if path == '' and not allowance: + # an empty value means allow all + allowance = True + path = urllib.parse.urlunparse(urllib.parse.urlparse(path)) + self.path = urllib.parse.quote(path) + self.allowance = allowance + + def applies_to(self, filename): + return self.path == "*" or filename.startswith(self.path) + + def __str__(self): + return ("Allow" if self.allowance else "Disallow") + ": " + self.path + + +class Entry: + """An entry has one or more user-agents and zero or more rulelines""" + def __init__(self): + self.useragents = [] + self.rulelines = [] + self.delay = None + self.req_rate = None + + def __str__(self): + ret = [] + for agent in self.useragents: + ret.append(f"User-agent: {agent}") + if self.delay is not None: + ret.append(f"Crawl-delay: {self.delay}") + if self.req_rate is not None: + rate = self.req_rate + ret.append(f"Request-rate: {rate.requests}/{rate.seconds}") + ret.extend(map(str, self.rulelines)) + return '\n'.join(ret) + + def applies_to(self, useragent): + """check if this entry applies to the specified agent""" + # split the name token and make it lower case + useragent = useragent.split("/")[0].lower() + for agent in self.useragents: + if agent == '*': + # we have the catch-all agent + return True + agent = agent.lower() + if agent in useragent: + return True + return False + + def allowance(self, filename): + """Preconditions: + - our agent applies to this entry + - filename is URL decoded""" + for line in self.rulelines: + if line.applies_to(filename): + return line.allowance + return True |