author | thegeorg <thegeorg@yandex-team.com> | 2024-03-15 23:28:02 +0300 |
---|---|---|
committer | thegeorg <thegeorg@yandex-team.com> | 2024-03-15 23:38:24 +0300 |
commit | 6635e51dd2fc4c8fbc3e510cd31c9c9102b7cfda (patch) | |
tree | d51d048d3eb5d7a24ec6e4302b589513a3b97de3 /contrib/tools/python/src/Lib/robotparser.py | |
parent | e894914751eea93bb23102a533c8d481b80c2560 (diff) | |
download | ydb-6635e51dd2fc4c8fbc3e510cd31c9c9102b7cfda.tar.gz | |
Store generated list of frozen modules for contrib/tools/python
a94357bf8071b16879eeabac1f52f54e278c03ed
Diffstat (limited to 'contrib/tools/python/src/Lib/robotparser.py')
-rw-r--r-- | contrib/tools/python/src/Lib/robotparser.py | 236 |
1 file changed, 0 insertions, 236 deletions
diff --git a/contrib/tools/python/src/Lib/robotparser.py b/contrib/tools/python/src/Lib/robotparser.py
deleted file mode 100644
index 4e13f7f780c..00000000000
--- a/contrib/tools/python/src/Lib/robotparser.py
+++ /dev/null
@@ -1,236 +0,0 @@
-""" robotparser.py
-
-    Copyright (C) 2000  Bastian Kleineidam
-
-    You can choose between two licenses when using this package:
-    1) GNU GPLv2
-    2) PSF license for Python 2.2
-
-    The robots.txt Exclusion Protocol is implemented as specified in
-    http://www.robotstxt.org/norobots-rfc.txt
-
-"""
-import urlparse
-import urllib
-
-__all__ = ["RobotFileParser"]
-
-
-class RobotFileParser:
-    """ This class provides a set of methods to read, parse and answer
-    questions about a single robots.txt file.
-
-    """
-
-    def __init__(self, url=''):
-        self.entries = []
-        self.default_entry = None
-        self.disallow_all = False
-        self.allow_all = False
-        self.set_url(url)
-        self.last_checked = 0
-
-    def mtime(self):
-        """Returns the time the robots.txt file was last fetched.
-
-        This is useful for long-running web spiders that need to
-        check for new robots.txt files periodically.
-
-        """
-        return self.last_checked
-
-    def modified(self):
-        """Sets the time the robots.txt file was last fetched to the
-        current time.
-
-        """
-        import time
-        self.last_checked = time.time()
-
-    def set_url(self, url):
-        """Sets the URL referring to a robots.txt file."""
-        self.url = url
-        self.host, self.path = urlparse.urlparse(url)[1:3]
-
-    def read(self):
-        """Reads the robots.txt URL and feeds it to the parser."""
-        opener = URLopener()
-        f = opener.open(self.url)
-        lines = [line.strip() for line in f]
-        f.close()
-        self.errcode = opener.errcode
-        if self.errcode in (401, 403):
-            self.disallow_all = True
-        elif self.errcode >= 400 and self.errcode < 500:
-            self.allow_all = True
-        elif self.errcode == 200 and lines:
-            self.parse(lines)
-
-    def _add_entry(self, entry):
-        if "*" in entry.useragents:
-            # the default entry is considered last
-            if self.default_entry is None:
-                # the first default entry wins
-                self.default_entry = entry
-        else:
-            self.entries.append(entry)
-
-    def parse(self, lines):
-        """parse the input lines from a robots.txt file.
-           We allow that a user-agent: line is not preceded by
-           one or more blank lines."""
-        # states:
-        #   0: start state
-        #   1: saw user-agent line
-        #   2: saw an allow or disallow line
-        state = 0
-        linenumber = 0
-        entry = Entry()
-
-        self.modified()
-        for line in lines:
-            linenumber += 1
-            if not line:
-                if state == 1:
-                    entry = Entry()
-                    state = 0
-                elif state == 2:
-                    self._add_entry(entry)
-                    entry = Entry()
-                    state = 0
-            # remove optional comment and strip line
-            i = line.find('#')
-            if i >= 0:
-                line = line[:i]
-            line = line.strip()
-            if not line:
-                continue
-            line = line.split(':', 1)
-            if len(line) == 2:
-                line[0] = line[0].strip().lower()
-                line[1] = urllib.unquote(line[1].strip())
-                if line[0] == "user-agent":
-                    if state == 2:
-                        self._add_entry(entry)
-                        entry = Entry()
-                    entry.useragents.append(line[1])
-                    state = 1
-                elif line[0] == "disallow":
-                    if state != 0:
-                        entry.rulelines.append(RuleLine(line[1], False))
-                        state = 2
-                elif line[0] == "allow":
-                    if state != 0:
-                        entry.rulelines.append(RuleLine(line[1], True))
-                        state = 2
-        if state == 2:
-            self._add_entry(entry)
-
-
-    def can_fetch(self, useragent, url):
-        """using the parsed robots.txt decide if useragent can fetch url"""
-        if self.disallow_all:
-            return False
-        if self.allow_all:
-            return True
-
-        # Until the robots.txt file has been read or found not
-        # to exist, we must assume that no url is allowable.
-        # This prevents false positives when a user erroneously
-        # calls can_fetch() before calling read().
-        if not self.last_checked:
-            return False
-
-        # search for given user agent matches
-        # the first match counts
-        parsed_url = urlparse.urlparse(urllib.unquote(url))
-        url = urlparse.urlunparse(('', '', parsed_url.path,
-            parsed_url.params, parsed_url.query, parsed_url.fragment))
-        url = urllib.quote(url)
-        if not url:
-            url = "/"
-        for entry in self.entries:
-            if entry.applies_to(useragent):
-                return entry.allowance(url)
-        # try the default entry last
-        if self.default_entry:
-            return self.default_entry.allowance(url)
-        # agent not found ==> access granted
-        return True
-
-
-    def __str__(self):
-        entries = self.entries
-        if self.default_entry is not None:
-            entries = entries + [self.default_entry]
-        return '\n'.join(map(str, entries)) + '\n'
-
-
-class RuleLine:
-    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
-       (allowance==False) followed by a path."""
-    def __init__(self, path, allowance):
-        if path == '' and not allowance:
-            # an empty value means allow all
-            allowance = True
-        path = urlparse.urlunparse(urlparse.urlparse(path))
-        self.path = urllib.quote(path)
-        self.allowance = allowance
-
-    def applies_to(self, filename):
-        return self.path == "*" or filename.startswith(self.path)
-
-    def __str__(self):
-        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
-
-
-class Entry:
-    """An entry has one or more user-agents and zero or more rulelines"""
-    def __init__(self):
-        self.useragents = []
-        self.rulelines = []
-
-    def __str__(self):
-        ret = []
-        for agent in self.useragents:
-            ret.extend(["User-agent: ", agent, "\n"])
-        for line in self.rulelines:
-            ret.extend([str(line), "\n"])
-        return ''.join(ret)
-
-    def applies_to(self, useragent):
-        """check if this entry applies to the specified agent"""
-        # split the name token and make it lower case
-        useragent = useragent.split("/")[0].lower()
-        for agent in self.useragents:
-            if agent == '*':
-                # we have the catch-all agent
-                return True
-            agent = agent.lower()
-            if agent in useragent:
-                return True
-        return False
-
-    def allowance(self, filename):
-        """Preconditions:
-        - our agent applies to this entry
-        - filename is URL decoded"""
-        for line in self.rulelines:
-            if line.applies_to(filename):
-                return line.allowance
-        return True
-
-class URLopener(urllib.FancyURLopener):
-    def __init__(self, *args):
-        urllib.FancyURLopener.__init__(self, *args)
-        self.errcode = 200
-
-    def prompt_user_passwd(self, host, realm):
-        ## If robots.txt file is accessible only with a password,
-        ## we act as if the file wasn't there.
-        return None, None
-
-    def http_error_default(self, url, fp, errcode, errmsg, headers):
-        self.errcode = errcode
-        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
-                                                        errmsg, headers)
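
For reference, the deleted file is a vendored copy of the Python 2 standard-library `robotparser` module. A minimal usage sketch of the API it exposes (`set_url`, `read`, `can_fetch`) is shown below; the robots.txt URL and user-agent string are illustrative examples, not taken from this commit.

```python
# Hedged sketch: typical use of the deleted Python 2 robotparser module.
# The URL and user-agent below are made-up examples, not from this commit.
import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")
rp.read()   # fetches robots.txt; 401/403 sets disallow_all, other 4xx sets allow_all

user_agent = "ExampleBot/1.0"
target = "http://www.example.com/private/index.html"
if rp.can_fetch(user_agent, target):
    print "fetching is allowed for", target
else:
    print "robots.txt disallows", target
```

In Python 3, equivalent functionality is provided by `urllib.robotparser`.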