author | shmel1k <shmel1k@ydb.tech> | 2023-11-26 18:16:14 +0300
---|---|---
committer | shmel1k <shmel1k@ydb.tech> | 2023-11-26 18:43:30 +0300
commit | b8cf9e88f4c5c64d9406af533d8948deb050d695 (patch) |
tree | 218eb61fb3c3b96ec08b4d8cdfef383104a87d63 | /contrib/python/hyperlink
parent | 523f645a83a0ec97a0332dbc3863bb354c92a328 (diff) |
download | ydb-b8cf9e88f4c5c64d9406af533d8948deb050d695.tar.gz |
add kikimr_configure
Diffstat (limited to 'contrib/python/hyperlink')
23 files changed, 6045 insertions, 0 deletions
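For orientation, the package being vendored here centers on the `URL` type; the README and the `_url.py` module docstring included in this diff demonstrate the basic API. Below is a minimal usage sketch assembled from those examples (outputs shown in comments are illustrative):

```python
from hyperlink import URL

# Parse a URL from text; the resulting object is immutable.
url = URL.from_text(u'http://github.com/python-hyper/hyperlink?utm_source=README')
print(url.host)                 # github.com
print(url.get(u'utm_source'))   # list of values for the key, e.g. [u'README']

# replace() returns a new URL; 443 is the default port for https,
# so it is omitted when the URL is rendered back to text.
better_url = url.replace(scheme=u'https', port=443)
print(better_url.to_text())     # https://github.com/python-hyper/hyperlink?utm_source=README

# click() resolves a reference relative to this URL, the way a browser would.
org_url = better_url.click(u'.')
```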
diff --git a/contrib/python/hyperlink/py2/.dist-info/METADATA b/contrib/python/hyperlink/py2/.dist-info/METADATA
new file mode 100644
index 0000000000..fc5922ba87
--- /dev/null
+++ b/contrib/python/hyperlink/py2/.dist-info/METADATA
@@ -0,0 +1,38 @@
+Metadata-Version: 2.1
+Name: hyperlink
+Version: 21.0.0
+Summary: A featureful, immutable, and correct URL for Python.
+Home-page: https://github.com/python-hyper/hyperlink
+Author: Mahmoud Hashemi and Glyph Lefkowitz
+Author-email: mahmoud@hatnote.com
+License: MIT
+Platform: any
+Classifier: Topic :: Utilities
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: License :: OSI Approved :: MIT License
+Requires-Python: >=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
+Requires-Dist: idna (>=2.5)
+Requires-Dist: typing ; python_version < "3.5"
+
+The humble, but powerful, URL runs everything around us. Chances
+are you've used several just to read this text.
+
+Hyperlink is a featureful, pure-Python implementation of the URL, with
+an emphasis on correctness. MIT licensed.
+
+See the docs at http://hyperlink.readthedocs.io.
+
+
diff --git a/contrib/python/hyperlink/py2/.dist-info/top_level.txt b/contrib/python/hyperlink/py2/.dist-info/top_level.txt
new file mode 100644
index 0000000000..81722ce1d8
--- /dev/null
+++ b/contrib/python/hyperlink/py2/.dist-info/top_level.txt
@@ -0,0 +1 @@
+hyperlink
diff --git a/contrib/python/hyperlink/py2/LICENSE b/contrib/python/hyperlink/py2/LICENSE
new file mode 100644
index 0000000000..a73f882ffb
--- /dev/null
+++ b/contrib/python/hyperlink/py2/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2017
+Glyph Lefkowitz
+Itamar Turner-Trauring
+Jean Paul Calderone
+Adi Roiban
+Amber Hawkie Brown
+Mahmoud Hashemi
+Wilfredo Sanchez Vega
+
+and others that have contributed code to the public domain.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/contrib/python/hyperlink/py2/README.md b/contrib/python/hyperlink/py2/README.md
new file mode 100644
index 0000000000..017f9eb88c
--- /dev/null
+++ b/contrib/python/hyperlink/py2/README.md
@@ -0,0 +1,67 @@
+# Hyperlink
+
+*Cool URLs that don't change.*
+
+<a href="https://hyperlink.readthedocs.io/en/latest/">
+    <img src="https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat" alt="Documentation">
+</a>
+<a href="https://pypi.org/project/hyperlink/">
+    <img src="https://img.shields.io/pypi/v/hyperlink.svg" alt="PyPI">
+</a>
+<a href="http://calver.org">
+    <img src="https://img.shields.io/badge/calver-YY.MINOR.MICRO-22bfda.svg" alt="Calendar Versioning">
+</a>
+<a href="https://pypi.org/project/hyperlink/">
+    <img src="https://img.shields.io/pypi/pyversions/hyperlink.svg" alt="Python Version Compatibility">
+</a>
+<a href="https://https://codecov.io/github/python-hyper/hyperlink?branch=master">
+    <img src="https://codecov.io/github/python-hyper/hyperlink/coverage.svg?branch=master" alt="Code Coverage">
+</a>
+<a href="https://requires.io/github/python-hyper/hyperlink/requirements/?branch=master">
+    <img src="https://requires.io/github/python-hyper/hyperlink/requirements.svg?branch=master" alt="Requirements Status">
+</a>
+
+Hyperlink provides a pure-Python implementation of immutable
+URLs. Based on [RFC 3986][rfc3986] and [3987][rfc3987], the Hyperlink URL
+makes working with both URIs and IRIs easy.
+
+Hyperlink is tested against Python 2.7, 3.4, 3.5, 3.6, 3.7, 3.8, and PyPy.
+
+Full documentation is available on [Read the Docs][docs].
+
+[rfc3986]: https://tools.ietf.org/html/rfc3986
+[rfc3987]: https://tools.ietf.org/html/rfc3987
+[docs]: http://hyperlink.readthedocs.io/en/latest/
+
+## Installation
+
+Hyperlink is a pure-Python package and requires nothing but
+Python. The easiest way to install is with pip:
+
+```
+pip install hyperlink
+```
+
+Then, hyperlink away!
+
+```python
+from hyperlink import URL
+
+url = URL.from_text(u'http://github.com/python-hyper/hyperlink?utm_source=README')
+utm_source = url.get(u'utm_source')
+better_url = url.replace(scheme=u'https', port=443)
+org_url = better_url.click(u'.')
+```
+
+See the full API docs on [Read the Docs][docs].
+
+## More information
+
+Hyperlink would not have been possible without the help of
+[Glyph Lefkowitz](https://glyph.twistedmatrix.com/) and many other
+community members, especially considering that it started as an
+extract from the Twisted networking library. Thanks to them,
+Hyperlink's URL has been production-grade for well over a decade.
+
+Still, should you encounter any issues, do file an issue, or submit a
+pull request.
diff --git a/contrib/python/hyperlink/py2/hyperlink/__init__.py b/contrib/python/hyperlink/py2/hyperlink/__init__.py new file mode 100644 index 0000000000..f680b01a90 --- /dev/null +++ b/contrib/python/hyperlink/py2/hyperlink/__init__.py @@ -0,0 +1,17 @@ +from ._url import ( + parse, + register_scheme, + URL, + EncodedURL, + DecodedURL, + URLParseError, +) + +__all__ = ( + "parse", + "register_scheme", + "URL", + "EncodedURL", + "DecodedURL", + "URLParseError", +) diff --git a/contrib/python/hyperlink/py2/hyperlink/_socket.py b/contrib/python/hyperlink/py2/hyperlink/_socket.py new file mode 100644 index 0000000000..3bcf89706d --- /dev/null +++ b/contrib/python/hyperlink/py2/hyperlink/_socket.py @@ -0,0 +1,53 @@ +try: + from socket import inet_pton +except ImportError: + from typing import TYPE_CHECKING + + if TYPE_CHECKING: # pragma: no cover + pass + else: + # based on https://gist.github.com/nnemkin/4966028 + # this code only applies on Windows Python 2.7 + import ctypes + import socket + + class SockAddr(ctypes.Structure): + _fields_ = [ + ("sa_family", ctypes.c_short), + ("__pad1", ctypes.c_ushort), + ("ipv4_addr", ctypes.c_byte * 4), + ("ipv6_addr", ctypes.c_byte * 16), + ("__pad2", ctypes.c_ulong), + ] + + WSAStringToAddressA = ctypes.windll.ws2_32.WSAStringToAddressA + WSAAddressToStringA = ctypes.windll.ws2_32.WSAAddressToStringA + + def inet_pton(address_family, ip_string): + # type: (int, str) -> bytes + addr = SockAddr() + ip_string_bytes = ip_string.encode("ascii") + addr.sa_family = address_family + addr_size = ctypes.c_int(ctypes.sizeof(addr)) + + try: + attribute, size = { + socket.AF_INET: ("ipv4_addr", 4), + socket.AF_INET6: ("ipv6_addr", 16), + }[address_family] + except KeyError: + raise socket.error("unknown address family") + + if ( + WSAStringToAddressA( + ip_string_bytes, + address_family, + None, + ctypes.byref(addr), + ctypes.byref(addr_size), + ) + != 0 + ): + raise socket.error(ctypes.FormatError()) + + return ctypes.string_at(getattr(addr, attribute), size) diff --git a/contrib/python/hyperlink/py2/hyperlink/_url.py b/contrib/python/hyperlink/py2/hyperlink/_url.py new file mode 100644 index 0000000000..be69baf696 --- /dev/null +++ b/contrib/python/hyperlink/py2/hyperlink/_url.py @@ -0,0 +1,2448 @@ +# -*- coding: utf-8 -*- +u"""Hyperlink provides Pythonic URL parsing, construction, and rendering. + +Usage is straightforward:: + + >>> import hyperlink + >>> url = hyperlink.parse(u'http://github.com/mahmoud/hyperlink?utm_source=docs') + >>> url.host + u'github.com' + >>> secure_url = url.replace(scheme=u'https') + >>> secure_url.get('utm_source')[0] + u'docs' + +Hyperlink's API centers on the :class:`DecodedURL` type, which wraps +the lower-level :class:`URL`, both of which can be returned by the +:func:`parse()` convenience function. 
+ +""" # noqa: E501 + +import re +import sys +import string +import socket +from socket import AF_INET, AF_INET6 + +try: + from socket import AddressFamily +except ImportError: + AddressFamily = int # type: ignore[assignment,misc] +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Text, + Tuple, + Type, + TypeVar, + Union, + cast, +) +from unicodedata import normalize +from ._socket import inet_pton + +try: + from collections.abc import Mapping as MappingABC +except ImportError: # Python 2 + from collections import Mapping as MappingABC + +from idna import encode as idna_encode, decode as idna_decode + + +PY2 = sys.version_info[0] == 2 +try: + unichr +except NameError: # Py3 + unichr = chr # type: Callable[[int], Text] +NoneType = type(None) # type: Type[None] +QueryPairs = Tuple[Tuple[Text, Optional[Text]], ...] # internal representation +QueryParameters = Union[ + Mapping[Text, Optional[Text]], + QueryPairs, + Sequence[Tuple[Text, Optional[Text]]], +] +T = TypeVar("T") + + +# from boltons.typeutils +def make_sentinel(name="_MISSING", var_name=""): + # type: (str, str) -> object + """Creates and returns a new **instance** of a new class, suitable for + usage as a "sentinel", a kind of singleton often used to indicate + a value is missing when ``None`` is a valid input. + + Args: + name: Name of the Sentinel + var_name: Set this name to the name of the variable in its respective + module enable pickle-ability. + + >>> make_sentinel(var_name='_MISSING') + _MISSING + + The most common use cases here in boltons are as default values + for optional function arguments, partly because of its + less-confusing appearance in automatically generated + documentation. Sentinels also function well as placeholders in queues + and linked lists. + + .. note:: + + By design, additional calls to ``make_sentinel`` with the same + values will not produce equivalent objects. + + >>> make_sentinel('TEST') == make_sentinel('TEST') + False + >>> type(make_sentinel('TEST')) == type(make_sentinel('TEST')) + False + """ + + class Sentinel(object): + def __init__(self): + # type: () -> None + self.name = name + self.var_name = var_name + + def __repr__(self): + # type: () -> str + if self.var_name: + return self.var_name + return "%s(%r)" % (self.__class__.__name__, self.name) + + if var_name: + # superclass type hints don't allow str return type, but it is + # allowed in the docs, hence the ignore[override] below + def __reduce__(self): + # type: () -> str + return self.var_name + + def __nonzero__(self): + # type: () -> bool + return False + + __bool__ = __nonzero__ + + return Sentinel() + + +_unspecified = _UNSET = make_sentinel("_UNSET") # type: Any + + +# RFC 3986 Section 2.3, Unreserved URI Characters +# https://tools.ietf.org/html/rfc3986#section-2.3 +_UNRESERVED_CHARS = frozenset( + "~-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" +) + + +# URL parsing regex (based on RFC 3986 Appendix B, with modifications) +_URL_RE = re.compile( + r"^((?P<scheme>[^:/?#]+):)?" + r"((?P<_netloc_sep>//)" + r"(?P<authority>[^/?#]*))?" + r"(?P<path>[^?#]*)" + r"(\?(?P<query>[^#]*))?" + r"(#(?P<fragment>.*))?$" +) +_SCHEME_RE = re.compile(r"^[a-zA-Z0-9+-.]*$") +_AUTHORITY_RE = re.compile( + r"^(?:(?P<userinfo>[^@/?#]*)@)?" + r"(?P<host>" + r"(?:\[(?P<ipv6_host>[^[\]/?#]*)\])" + r"|(?P<plain_host>[^:/?#[\]]*)" + r"|(?P<bad_host>.*?))?" 
+ r"(?::(?P<port>.*))?$" +) + + +_HEX_CHAR_MAP = dict( + [ + ((a + b).encode("ascii"), unichr(int(a + b, 16)).encode("charmap")) + for a in string.hexdigits + for b in string.hexdigits + ] +) +_ASCII_RE = re.compile("([\x00-\x7f]+)") + +# RFC 3986 section 2.2, Reserved Characters +# https://tools.ietf.org/html/rfc3986#section-2.2 +_GEN_DELIMS = frozenset(u":/?#[]@") +_SUB_DELIMS = frozenset(u"!$&'()*+,;=") +_ALL_DELIMS = _GEN_DELIMS | _SUB_DELIMS + +_USERINFO_SAFE = _UNRESERVED_CHARS | _SUB_DELIMS | set(u"%") +_USERINFO_DELIMS = _ALL_DELIMS - _USERINFO_SAFE +_PATH_SAFE = _USERINFO_SAFE | set(u":@") +_PATH_DELIMS = _ALL_DELIMS - _PATH_SAFE +_SCHEMELESS_PATH_SAFE = _PATH_SAFE - set(":") +_SCHEMELESS_PATH_DELIMS = _ALL_DELIMS - _SCHEMELESS_PATH_SAFE +_FRAGMENT_SAFE = _UNRESERVED_CHARS | _PATH_SAFE | set(u"/?") +_FRAGMENT_DELIMS = _ALL_DELIMS - _FRAGMENT_SAFE +_QUERY_VALUE_SAFE = _UNRESERVED_CHARS | _FRAGMENT_SAFE - set(u"&") +_QUERY_VALUE_DELIMS = _ALL_DELIMS - _QUERY_VALUE_SAFE +_QUERY_KEY_SAFE = _UNRESERVED_CHARS | _QUERY_VALUE_SAFE - set(u"=") +_QUERY_KEY_DELIMS = _ALL_DELIMS - _QUERY_KEY_SAFE + + +def _make_decode_map(delims, allow_percent=False): + # type: (Iterable[Text], bool) -> Mapping[bytes, bytes] + ret = dict(_HEX_CHAR_MAP) + if not allow_percent: + delims = set(delims) | set([u"%"]) + for delim in delims: + _hexord = "{0:02X}".format(ord(delim)).encode("ascii") + _hexord_lower = _hexord.lower() + ret.pop(_hexord) + if _hexord != _hexord_lower: + ret.pop(_hexord_lower) + return ret + + +def _make_quote_map(safe_chars): + # type: (Iterable[Text]) -> Mapping[Union[int, Text], Text] + ret = {} # type: Dict[Union[int, Text], Text] + # v is included in the dict for py3 mostly, because bytestrings + # are iterables of ints, of course! + for i, v in zip(range(256), range(256)): + c = chr(v) + if c in safe_chars: + ret[c] = ret[v] = c + else: + ret[c] = ret[v] = "%{0:02X}".format(i) + return ret + + +_USERINFO_PART_QUOTE_MAP = _make_quote_map(_USERINFO_SAFE) +_USERINFO_DECODE_MAP = _make_decode_map(_USERINFO_DELIMS) +_PATH_PART_QUOTE_MAP = _make_quote_map(_PATH_SAFE) +_SCHEMELESS_PATH_PART_QUOTE_MAP = _make_quote_map(_SCHEMELESS_PATH_SAFE) +_PATH_DECODE_MAP = _make_decode_map(_PATH_DELIMS) +_QUERY_KEY_QUOTE_MAP = _make_quote_map(_QUERY_KEY_SAFE) +_QUERY_KEY_DECODE_MAP = _make_decode_map(_QUERY_KEY_DELIMS) +_QUERY_VALUE_QUOTE_MAP = _make_quote_map(_QUERY_VALUE_SAFE) +_QUERY_VALUE_DECODE_MAP = _make_decode_map(_QUERY_VALUE_DELIMS) +_FRAGMENT_QUOTE_MAP = _make_quote_map(_FRAGMENT_SAFE) +_FRAGMENT_DECODE_MAP = _make_decode_map(_FRAGMENT_DELIMS) +_UNRESERVED_QUOTE_MAP = _make_quote_map(_UNRESERVED_CHARS) +_UNRESERVED_DECODE_MAP = dict( + [ + (k, v) + for k, v in _HEX_CHAR_MAP.items() + if v.decode("ascii", "replace") in _UNRESERVED_CHARS + ] +) + +_ROOT_PATHS = frozenset(((), (u"",))) + + +def _encode_reserved(text, maximal=True): + # type: (Text, bool) -> Text + """A very comprehensive percent encoding for encoding all + delimiters. Used for arguments to DecodedURL, where a % means a + percent sign, and not the character used by URLs for escaping + bytes. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_UNRESERVED_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _UNRESERVED_QUOTE_MAP[t] if t in _UNRESERVED_CHARS else t + for t in text + ] + ) + + +def _encode_path_part(text, maximal=True): + # type: (Text, bool) -> Text + "Percent-encode a single segment of a URL path." 
+ if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_PATH_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_PATH_PART_QUOTE_MAP[t] if t in _PATH_DELIMS else t for t in text] + ) + + +def _encode_schemeless_path_part(text, maximal=True): + # type: (Text, bool) -> Text + """Percent-encode the first segment of a URL path for a URL without a + scheme specified. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_SCHEMELESS_PATH_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _SCHEMELESS_PATH_PART_QUOTE_MAP[t] + if t in _SCHEMELESS_PATH_DELIMS + else t + for t in text + ] + ) + + +def _encode_path_parts( + text_parts, # type: Sequence[Text] + rooted=False, # type: bool + has_scheme=True, # type: bool + has_authority=True, # type: bool + maximal=True, # type: bool +): + # type: (...) -> Sequence[Text] + """ + Percent-encode a tuple of path parts into a complete path. + + Setting *maximal* to False percent-encodes only the reserved + characters that are syntactically necessary for serialization, + preserving any IRI-style textual data. + + Leaving *maximal* set to its default True percent-encodes + everything required to convert a portion of an IRI to a portion of + a URI. + + RFC 3986 3.3: + + If a URI contains an authority component, then the path component + must either be empty or begin with a slash ("/") character. If a URI + does not contain an authority component, then the path cannot begin + with two slash characters ("//"). In addition, a URI reference + (Section 4.1) may be a relative-path reference, in which case the + first path segment cannot contain a colon (":") character. + """ + if not text_parts: + return () + if rooted: + text_parts = (u"",) + tuple(text_parts) + # elif has_authority and text_parts: + # raise Exception('see rfc above') # TODO: too late to fail like this? + encoded_parts = [] # type: List[Text] + if has_scheme: + encoded_parts = [ + _encode_path_part(part, maximal=maximal) if part else part + for part in text_parts + ] + else: + encoded_parts = [_encode_schemeless_path_part(text_parts[0])] + encoded_parts.extend( + [ + _encode_path_part(part, maximal=maximal) if part else part + for part in text_parts[1:] + ] + ) + return tuple(encoded_parts) + + +def _encode_query_key(text, maximal=True): + # type: (Text, bool) -> Text + """ + Percent-encode a single query string key or value. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_QUERY_KEY_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_QUERY_KEY_QUOTE_MAP[t] if t in _QUERY_KEY_DELIMS else t for t in text] + ) + + +def _encode_query_value(text, maximal=True): + # type: (Text, bool) -> Text + """ + Percent-encode a single query string key or value. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_QUERY_VALUE_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _QUERY_VALUE_QUOTE_MAP[t] if t in _QUERY_VALUE_DELIMS else t + for t in text + ] + ) + + +def _encode_fragment_part(text, maximal=True): + # type: (Text, bool) -> Text + """Quote the fragment part of the URL. Fragments don't have + subdelimiters, so the whole URL fragment can be passed. 
+ """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_FRAGMENT_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_FRAGMENT_QUOTE_MAP[t] if t in _FRAGMENT_DELIMS else t for t in text] + ) + + +def _encode_userinfo_part(text, maximal=True): + # type: (Text, bool) -> Text + """Quote special characters in either the username or password + section of the URL. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_USERINFO_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _USERINFO_PART_QUOTE_MAP[t] if t in _USERINFO_DELIMS else t + for t in text + ] + ) + + +# This port list painstakingly curated by hand searching through +# https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml +# and +# https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml +SCHEME_PORT_MAP = { + "acap": 674, + "afp": 548, + "dict": 2628, + "dns": 53, + "file": None, + "ftp": 21, + "git": 9418, + "gopher": 70, + "http": 80, + "https": 443, + "imap": 143, + "ipp": 631, + "ipps": 631, + "irc": 194, + "ircs": 6697, + "ldap": 389, + "ldaps": 636, + "mms": 1755, + "msrp": 2855, + "msrps": None, + "mtqp": 1038, + "nfs": 111, + "nntp": 119, + "nntps": 563, + "pop": 110, + "prospero": 1525, + "redis": 6379, + "rsync": 873, + "rtsp": 554, + "rtsps": 322, + "rtspu": 5005, + "sftp": 22, + "smb": 445, + "snmp": 161, + "ssh": 22, + "steam": None, + "svn": 3690, + "telnet": 23, + "ventrilo": 3784, + "vnc": 5900, + "wais": 210, + "ws": 80, + "wss": 443, + "xmpp": None, +} + +# This list of schemes that don't use authorities is also from the link above. +NO_NETLOC_SCHEMES = set( + [ + "urn", + "about", + "bitcoin", + "blob", + "data", + "geo", + "magnet", + "mailto", + "news", + "pkcs11", + "sip", + "sips", + "tel", + ] +) +# As of Mar 11, 2017, there were 44 netloc schemes, and 13 non-netloc + +NO_QUERY_PLUS_SCHEMES = set() + + +def register_scheme( + text, uses_netloc=True, default_port=None, query_plus_is_space=True +): + # type: (Text, bool, Optional[int], bool) -> None + """Registers new scheme information, resulting in correct port and + slash behavior from the URL object. There are dozens of standard + schemes preregistered, so this function is mostly meant for + proprietary internal customizations or stopgaps on missing + standards information. If a scheme seems to be missing, please + `file an issue`_! + + Args: + text: A string representation of the scheme. + (the 'http' in 'http://hatnote.com') + uses_netloc: Does the scheme support specifying a + network host? For instance, "http" does, "mailto" does + not. Defaults to True. + default_port: The default port, if any, for + netloc-using schemes. + query_plus_is_space: If true, a "+" in the query string should be + decoded as a space by DecodedURL. + + .. 
_file an issue: https://github.com/mahmoud/hyperlink/issues + """ + text = text.lower() + if default_port is not None: + try: + default_port = int(default_port) + except (ValueError, TypeError): + raise ValueError( + "default_port expected integer or None, not %r" + % (default_port,) + ) + + if uses_netloc is True: + SCHEME_PORT_MAP[text] = default_port + elif uses_netloc is False: + if default_port is not None: + raise ValueError( + "unexpected default port while specifying" + " non-netloc scheme: %r" % default_port + ) + NO_NETLOC_SCHEMES.add(text) + else: + raise ValueError("uses_netloc expected bool, not: %r" % uses_netloc) + + if not query_plus_is_space: + NO_QUERY_PLUS_SCHEMES.add(text) + + return + + +def scheme_uses_netloc(scheme, default=None): + # type: (Text, Optional[bool]) -> Optional[bool] + """Whether or not a URL uses :code:`:` or :code:`://` to separate the + scheme from the rest of the URL depends on the scheme's own + standard definition. There is no way to infer this behavior + from other parts of the URL. A scheme either supports network + locations or it does not. + + The URL type's approach to this is to check for explicitly + registered schemes, with common schemes like HTTP + preregistered. This is the same approach taken by + :mod:`urlparse`. + + URL adds two additional heuristics if the scheme as a whole is + not registered. First, it attempts to check the subpart of the + scheme after the last ``+`` character. This adds intuitive + behavior for schemes like ``git+ssh``. Second, if a URL with + an unrecognized scheme is loaded, it will maintain the + separator it sees. + """ + if not scheme: + return False + scheme = scheme.lower() + if scheme in SCHEME_PORT_MAP: + return True + if scheme in NO_NETLOC_SCHEMES: + return False + if scheme.split("+")[-1] in SCHEME_PORT_MAP: + return True + return default + + +class URLParseError(ValueError): + """Exception inheriting from :exc:`ValueError`, raised when failing to + parse a URL. Mostly raised on invalid ports and IPv6 addresses. + """ + + pass + + +def _optional(argument, default): + # type: (Any, Any) -> Any + if argument is _UNSET: + return default + else: + return argument + + +def _typecheck(name, value, *types): + # type: (Text, T, Type[Any]) -> T + """ + Check that the given *value* is one of the given *types*, or raise an + exception describing the problem using *name*. + """ + if not types: + raise ValueError("expected one or more types, maybe use _textcheck?") + if not isinstance(value, types): + raise TypeError( + "expected %s for %s, got %r" + % (" or ".join([t.__name__ for t in types]), name, value) + ) + return value + + +def _textcheck(name, value, delims=frozenset(), nullable=False): + # type: (Text, T, Iterable[Text], bool) -> T + if not isinstance(value, Text): + if nullable and value is None: + # used by query string values + return value # type: ignore[unreachable] + else: + str_name = "unicode" if PY2 else "str" + exp = str_name + " or NoneType" if nullable else str_name + raise TypeError("expected %s for %s, got %r" % (exp, name, value)) + if delims and set(value) & set(delims): # TODO: test caching into regexes + raise ValueError( + "one or more reserved delimiters %s present in %s: %r" + % ("".join(delims), name, value) + ) + return value # type: ignore[return-value] # T vs. Text + + +def iter_pairs(iterable): + # type: (Iterable[Any]) -> Iterator[Any] + """ + Iterate over the (key, value) pairs in ``iterable``. 
+ + This handles dictionaries sensibly, and falls back to assuming the + iterable yields (key, value) pairs. This behaviour is similar to + what Python's ``dict()`` constructor does. + """ + if isinstance(iterable, MappingABC): + iterable = iterable.items() + return iter(iterable) + + +def _decode_unreserved(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_UNRESERVED_DECODE_MAP, + ) + + +def _decode_userinfo_part( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_USERINFO_DECODE_MAP, + ) + + +def _decode_path_part(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + """ + >>> _decode_path_part(u'%61%77%2f%7a') + u'aw%2fz' + >>> _decode_path_part(u'%61%77%2f%7a', normalize_case=True) + u'aw%2Fz' + """ + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_PATH_DECODE_MAP, + ) + + +def _decode_query_key(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_QUERY_KEY_DECODE_MAP, + ) + + +def _decode_query_value( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_QUERY_VALUE_DECODE_MAP, + ) + + +def _decode_fragment_part( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_FRAGMENT_DECODE_MAP, + ) + + +def _percent_decode( + text, # type: Text + normalize_case=False, # type: bool + subencoding="utf-8", # type: Text + raise_subencoding_exc=False, # type: bool + encode_stray_percents=False, # type: bool + _decode_map=_HEX_CHAR_MAP, # type: Mapping[bytes, bytes] +): + # type: (...) -> Text + """Convert percent-encoded text characters to their normal, + human-readable equivalents. + + All characters in the input text must be encodable by + *subencoding*. All special characters underlying the values in the + percent-encoding must be decodable as *subencoding*. If a + non-*subencoding*-valid string is passed, the original text is + returned with no changes applied. + + Only called by field-tailored variants, e.g., + :func:`_decode_path_part`, as every percent-encodable part of the + URL has characters which should not be percent decoded. + + >>> _percent_decode(u'abc%20def') + u'abc def' + + Args: + text: Text with percent-encoding present. + normalize_case: Whether undecoded percent segments, such as encoded + delimiters, should be uppercased, per RFC 3986 Section 2.1. + See :func:`_decode_path_part` for an example. + subencoding: The name of the encoding underlying the percent-encoding. + raise_subencoding_exc: Whether an error in decoding the bytes + underlying the percent-decoding should be raised. + + Returns: + Text: The percent-decoded version of *text*, decoded by *subencoding*. 
+ """ + try: + quoted_bytes = text.encode(subencoding) + except UnicodeEncodeError: + return text + + bits = quoted_bytes.split(b"%") + if len(bits) == 1: + return text + + res = [bits[0]] + append = res.append + + for item in bits[1:]: + hexpair, rest = item[:2], item[2:] + try: + append(_decode_map[hexpair]) + append(rest) + except KeyError: + pair_is_hex = hexpair in _HEX_CHAR_MAP + if pair_is_hex or not encode_stray_percents: + append(b"%") + else: + # if it's undecodable, treat as a real percent sign, + # which is reserved (because it wasn't in the + # context-aware _decode_map passed in), and should + # stay in an encoded state. + append(b"%25") + if normalize_case and pair_is_hex: + append(hexpair.upper()) + append(rest) + else: + append(item) + + unquoted_bytes = b"".join(res) + + try: + return unquoted_bytes.decode(subencoding) + except UnicodeDecodeError: + if raise_subencoding_exc: + raise + return text + + +def _decode_host(host): + # type: (Text) -> Text + """Decode a host from ASCII-encodable text to IDNA-decoded text. If + the host text is not ASCII, it is returned unchanged, as it is + presumed that it is already IDNA-decoded. + + Some technical details: _decode_host is built on top of the "idna" + package, which has some quirks: + + Capital letters are not valid IDNA2008. The idna package will + raise an exception like this on capital letters: + + > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed + + However, if a segment of a host (i.e., something in + url.host.split('.')) is already ASCII, idna doesn't perform its + usual checks. In fact, for capital letters it automatically + lowercases them. + + This check and some other functionality can be bypassed by passing + uts46=True to idna.encode/decode. This allows a more permissive and + convenient interface. So far it seems like the balanced approach. + + Example output (from idna==2.6): + + >> idna.encode(u'mahmöud.io') + 'xn--mahmud-zxa.io' + >> idna.encode(u'Mahmöud.io') + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode + result.append(alabel(label)) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel + check_label(label) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label + raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label))) + idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed + >> idna.encode(u'Mahmoud.io') + 'Mahmoud.io' + + # Similar behavior for decodes below + >> idna.decode(u'Mahmoud.io') + u'mahmoud.io + >> idna.decode(u'Méhmoud.io', uts46=True) + u'm\xe9hmoud.io' + """ # noqa: E501 + if not host: + return u"" + try: + host_bytes = host.encode("ascii") + except UnicodeEncodeError: + host_text = host + else: + try: + host_text = idna_decode(host_bytes, uts46=True) + except ValueError: + # only reached on "narrow" (UCS-2) Python builds <3.4, see #7 + # NOTE: not going to raise here, because there's no + # ambiguity in the IDNA, and the host is still + # technically usable + host_text = host + return host_text + + +def _resolve_dot_segments(path): + # type: (Sequence[Text]) -> Sequence[Text] + """Normalize the URL path by resolving segments of '.' and '..'. For + more details, see `RFC 3986 section 5.2.4, Remove Dot Segments`_. 
+ + Args: + path: sequence of path segments in text form + + Returns: + A new sequence of path segments with the '.' and '..' elements removed + and resolved. + + .. _RFC 3986 section 5.2.4, Remove Dot Segments: https://tools.ietf.org/html/rfc3986#section-5.2.4 + """ # noqa: E501 + segs = [] # type: List[Text] + + for seg in path: + if seg == u".": + pass + elif seg == u"..": + if segs: + segs.pop() + else: + segs.append(seg) + + if list(path[-1:]) in ([u"."], [u".."]): + segs.append(u"") + + return segs + + +def parse_host(host): + # type: (Text) -> Tuple[Optional[AddressFamily], Text] + """Parse the host into a tuple of ``(family, host)``, where family + is the appropriate :mod:`socket` module constant when the host is + an IP address. Family is ``None`` when the host is not an IP. + + Will raise :class:`URLParseError` on invalid IPv6 constants. + + Returns: + family (socket constant or None), host (string) + + >>> import socket + >>> parse_host('googlewebsite.com') == (None, 'googlewebsite.com') + True + >>> parse_host('::1') == (socket.AF_INET6, '::1') + True + >>> parse_host('192.168.1.1') == (socket.AF_INET, '192.168.1.1') + True + """ + if not host: + return None, u"" + + if u":" in host: + try: + inet_pton(AF_INET6, host) + except socket.error as se: + raise URLParseError("invalid IPv6 host: %r (%r)" % (host, se)) + except UnicodeEncodeError: + pass # TODO: this can't be a real host right? + else: + family = AF_INET6 # type: Optional[AddressFamily] + else: + try: + inet_pton(AF_INET, host) + except (socket.error, UnicodeEncodeError): + family = None # not an IP + else: + family = AF_INET + + return family, host + + +class URL(object): + r"""From blogs to billboards, URLs are so common, that it's easy to + overlook their complexity and power. With hyperlink's + :class:`URL` type, working with URLs doesn't have to be hard. + + URLs are made of many parts. Most of these parts are officially + named in `RFC 3986`_ and this diagram may prove handy in identifying + them:: + + foo://user:pass@example.com:8042/over/there?name=ferret#nose + \_/ \_______/ \_________/ \__/\_________/ \_________/ \__/ + | | | | | | | + scheme userinfo host port path query fragment + + While :meth:`~URL.from_text` is used for parsing whole URLs, the + :class:`URL` constructor builds a URL from the individual + components, like so:: + + >>> from hyperlink import URL + >>> url = URL(scheme=u'https', host=u'example.com', path=[u'hello', u'world']) + >>> print(url.to_text()) + https://example.com/hello/world + + The constructor runs basic type checks. All strings are expected + to be text (:class:`str` in Python 3, :class:`unicode` in Python 2). All + arguments are optional, defaulting to appropriately empty values. A full + list of constructor arguments is below. + + Args: + scheme: The text name of the scheme. + host: The host portion of the network location + port: The port part of the network location. If ``None`` or no port is + passed, the port will default to the default port of the scheme, if + it is known. See the ``SCHEME_PORT_MAP`` and + :func:`register_default_port` for more info. + path: A tuple of strings representing the slash-separated parts of the + path, each percent-encoded. + query: The query parameters, as a dictionary or as an sequence of + percent-encoded key-value pairs. + fragment: The fragment part of the URL. + rooted: A rooted URL is one which indicates an absolute path. + This is True on any URL that includes a host, or any relative URL + that starts with a slash. 
+ userinfo: The username or colon-separated username:password pair. + uses_netloc: Indicates whether ``://`` (the "netloc separator") will + appear to separate the scheme from the *path* in cases where no + host is present. + Setting this to ``True`` is a non-spec-compliant affordance for the + common practice of having URIs that are *not* URLs (cannot have a + 'host' part) but nevertheless use the common ``://`` idiom that + most people associate with URLs; e.g. ``message:`` URIs like + ``message://message-id`` being equivalent to ``message:message-id``. + This may be inferred based on the scheme depending on whether + :func:`register_scheme` has been used to register the scheme and + should not be passed directly unless you know the scheme works like + this and you know it has not been registered. + + All of these parts are also exposed as read-only attributes of :class:`URL` + instances, along with several useful methods. + + .. _RFC 3986: https://tools.ietf.org/html/rfc3986 + .. _RFC 3987: https://tools.ietf.org/html/rfc3987 + """ # noqa: E501 + + def __init__( + self, + scheme=None, # type: Optional[Text] + host=None, # type: Optional[Text] + path=(), # type: Iterable[Text] + query=(), # type: QueryParameters + fragment=u"", # type: Text + port=None, # type: Optional[int] + rooted=None, # type: Optional[bool] + userinfo=u"", # type: Text + uses_netloc=None, # type: Optional[bool] + ): + # type: (...) -> None + if host is not None and scheme is None: + scheme = u"http" # TODO: why + if port is None and scheme is not None: + port = SCHEME_PORT_MAP.get(scheme) + if host and query and not path: + # per RFC 3986 6.2.3, "a URI that uses the generic syntax + # for authority with an empty path should be normalized to + # a path of '/'." + path = (u"",) + + # Now that we're done detecting whether they were passed, we can set + # them to their defaults: + if scheme is None: + scheme = u"" + if host is None: + host = u"" + if rooted is None: + rooted = bool(host) + + # Set attributes. + self._scheme = _textcheck("scheme", scheme) + if self._scheme: + if not _SCHEME_RE.match(self._scheme): + raise ValueError( + 'invalid scheme: %r. Only alphanumeric, "+",' + ' "-", and "." allowed. Did you meant to call' + " %s.from_text()?" % (self._scheme, self.__class__.__name__) + ) + + _, self._host = parse_host(_textcheck("host", host, "/?#@")) + if isinstance(path, Text): + raise TypeError( + "expected iterable of text for path, not: %r" % (path,) + ) + self._path = tuple( + (_textcheck("path segment", segment, "/?#") for segment in path) + ) + self._query = tuple( + ( + _textcheck("query parameter name", k, "&=#"), + _textcheck("query parameter value", v, "&#", nullable=True), + ) + for k, v in iter_pairs(query) + ) + self._fragment = _textcheck("fragment", fragment) + self._port = _typecheck("port", port, int, NoneType) + self._rooted = _typecheck("rooted", rooted, bool) + self._userinfo = _textcheck("userinfo", userinfo, "/?#@") + + if uses_netloc is None: + uses_netloc = scheme_uses_netloc(self._scheme, uses_netloc) + self._uses_netloc = _typecheck( + "uses_netloc", uses_netloc, bool, NoneType + ) + will_have_authority = self._host or ( + self._port and self._port != SCHEME_PORT_MAP.get(scheme) + ) + if will_have_authority: + # fixup for rooted consistency; if there's any 'authority' + # represented in the textual URL, then the path must be rooted, and + # we're definitely using a netloc (there must be a ://). 
+ self._rooted = True + self._uses_netloc = True + if (not self._rooted) and self.path[:1] == (u"",): + self._rooted = True + self._path = self._path[1:] + if not will_have_authority and self._path and not self._rooted: + # If, after fixing up the path, there *is* a path and it *isn't* + # rooted, then we are definitely not using a netloc; if we did, it + # would make the path (erroneously) look like a hostname. + self._uses_netloc = False + + def get_decoded_url(self, lazy=False): + # type: (bool) -> DecodedURL + try: + return self._decoded_url + except AttributeError: + self._decoded_url = DecodedURL(self, lazy=lazy) # type: DecodedURL + return self._decoded_url + + @property + def scheme(self): + # type: () -> Text + """The scheme is a string, and the first part of an absolute URL, the + part before the first colon, and the part which defines the + semantics of the rest of the URL. Examples include "http", + "https", "ssh", "file", "mailto", and many others. See + :func:`~hyperlink.register_scheme()` for more info. + """ + return self._scheme + + @property + def host(self): + # type: () -> Text + """The host is a string, and the second standard part of an absolute + URL. When present, a valid host must be a domain name, or an + IP (v4 or v6). It occurs before the first slash, or the second + colon, if a :attr:`~hyperlink.URL.port` is provided. + """ + return self._host + + @property + def port(self): + # type: () -> Optional[int] + """The port is an integer that is commonly used in connecting to the + :attr:`host`, and almost never appears without it. + + When not present in the original URL, this attribute defaults + to the scheme's default port. If the scheme's default port is + not known, and the port is not provided, this attribute will + be set to None. + + >>> URL.from_text(u'http://example.com/pa/th').port + 80 + >>> URL.from_text(u'foo://example.com/pa/th').port + >>> URL.from_text(u'foo://example.com:8042/pa/th').port + 8042 + + .. note:: + + Per the standard, when the port is the same as the schemes + default port, it will be omitted in the text URL. + """ + return self._port + + @property + def path(self): + # type: () -> Sequence[Text] + """A tuple of strings, created by splitting the slash-separated + hierarchical path. Started by the first slash after the host, + terminated by a "?", which indicates the start of the + :attr:`~hyperlink.URL.query` string. + """ + return self._path + + @property + def query(self): + # type: () -> QueryPairs + """Tuple of pairs, created by splitting the ampersand-separated + mapping of keys and optional values representing + non-hierarchical data used to identify the resource. Keys are + always strings. Values are strings when present, or None when + missing. + + For more operations on the mapping, see + :meth:`~hyperlink.URL.get()`, :meth:`~hyperlink.URL.add()`, + :meth:`~hyperlink.URL.set()`, and + :meth:`~hyperlink.URL.delete()`. + """ + return self._query + + @property + def fragment(self): + # type: () -> Text + """A string, the last part of the URL, indicated by the first "#" + after the :attr:`~hyperlink.URL.path` or + :attr:`~hyperlink.URL.query`. Enables indirect identification + of a secondary resource, like an anchor within an HTML page. + """ + return self._fragment + + @property + def rooted(self): + # type: () -> bool + """Whether or not the path starts with a forward slash (``/``). 
+ + This is taken from the terminology in the BNF grammar, + specifically the "path-rootless", rule, since "absolute path" + and "absolute URI" are somewhat ambiguous. :attr:`path` does + not contain the implicit prefixed ``"/"`` since that is + somewhat awkward to work with. + """ + return self._rooted + + @property + def userinfo(self): + # type: () -> Text + """The colon-separated string forming the username-password + combination. + """ + return self._userinfo + + @property + def uses_netloc(self): + # type: () -> Optional[bool] + """ + Indicates whether ``://`` (the "netloc separator") will appear to + separate the scheme from the *path* in cases where no host is present. + """ + return self._uses_netloc + + @property + def user(self): + # type: () -> Text + """ + The user portion of :attr:`~hyperlink.URL.userinfo`. + """ + return self.userinfo.split(u":")[0] + + def authority(self, with_password=False, **kw): + # type: (bool, Any) -> Text + """Compute and return the appropriate host/port/userinfo combination. + + >>> url = URL.from_text(u'http://user:pass@localhost:8080/a/b?x=y') + >>> url.authority() + u'user:@localhost:8080' + >>> url.authority(with_password=True) + u'user:pass@localhost:8080' + + Args: + with_password: Whether the return value of this method include the + password in the URL, if it is set. + Defaults to False. + + Returns: + Text: The authority (network location and user information) portion + of the URL. + """ + # first, a bit of twisted compat + with_password = kw.pop("includeSecrets", with_password) + if kw: + raise TypeError("got unexpected keyword arguments: %r" % kw.keys()) + host = self.host + if ":" in host: + hostport = ["[" + host + "]"] + else: + hostport = [self.host] + if self.port != SCHEME_PORT_MAP.get(self.scheme): + hostport.append(Text(self.port)) + authority = [] + if self.userinfo: + userinfo = self.userinfo + if not with_password and u":" in userinfo: + userinfo = userinfo[: userinfo.index(u":") + 1] + authority.append(userinfo) + authority.append(u":".join(hostport)) + return u"@".join(authority) + + def __eq__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + for attr in [ + "scheme", + "userinfo", + "host", + "query", + "fragment", + "port", + "uses_netloc", + "rooted", + ]: + if getattr(self, attr) != getattr(other, attr): + return False + if self.path == other.path or ( + self.path in _ROOT_PATHS and other.path in _ROOT_PATHS + ): + return True + return False + + def __ne__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return not self.__eq__(other) + + def __hash__(self): + # type: () -> int + return hash( + ( + self.__class__, + self.scheme, + self.userinfo, + self.host, + self.path, + self.query, + self.fragment, + self.port, + self.rooted, + self.uses_netloc, + ) + ) + + @property + def absolute(self): + # type: () -> bool + """Whether or not the URL is "absolute". Absolute URLs are complete + enough to resolve to a network resource without being relative + to a base URI. + + >>> URL.from_text(u'http://wikipedia.org/').absolute + True + >>> URL.from_text(u'?a=b&c=d').absolute + False + + Absolute URLs must have both a scheme and a host set. 
+ """ + return bool(self.scheme and self.host) + + def replace( + self, + scheme=_UNSET, # type: Optional[Text] + host=_UNSET, # type: Optional[Text] + path=_UNSET, # type: Iterable[Text] + query=_UNSET, # type: QueryParameters + fragment=_UNSET, # type: Text + port=_UNSET, # type: Optional[int] + rooted=_UNSET, # type: Optional[bool] + userinfo=_UNSET, # type: Text + uses_netloc=_UNSET, # type: Optional[bool] + ): + # type: (...) -> URL + """:class:`URL` objects are immutable, which means that attributes + are designed to be set only once, at construction. Instead of + modifying an existing URL, one simply creates a copy with the + desired changes. + + If any of the following arguments is omitted, it defaults to + the value on the current URL. + + Args: + scheme: The text name of the scheme. + host: The host portion of the network location. + path: A tuple of strings representing the slash-separated parts of + the path. + query: The query parameters, as a dictionary or as an sequence of + key-value pairs. + fragment: The fragment part of the URL. + port: The port part of the network location. + rooted: Whether or not the path begins with a slash. + userinfo: The username or colon-separated username:password pair. + uses_netloc: Indicates whether ``://`` (the "netloc separator") + will appear to separate the scheme from the *path* in cases + where no host is present. + Setting this to ``True`` is a non-spec-compliant affordance for + the common practice of having URIs that are *not* URLs (cannot + have a 'host' part) but nevertheless use the common ``://`` + idiom that most people associate with URLs; e.g. ``message:`` + URIs like ``message://message-id`` being equivalent to + ``message:message-id``. + This may be inferred based on the scheme depending on whether + :func:`register_scheme` has been used to register the scheme + and should not be passed directly unless you know the scheme + works like this and you know it has not been registered. + + Returns: + URL: A copy of the current :class:`URL`, with new values for + parameters passed. + """ + if scheme is not _UNSET and scheme != self.scheme: + # when changing schemes, reset the explicit uses_netloc preference + # to honor the new scheme. + uses_netloc = None + return self.__class__( + scheme=_optional(scheme, self.scheme), + host=_optional(host, self.host), + path=_optional(path, self.path), + query=_optional(query, self.query), + fragment=_optional(fragment, self.fragment), + port=_optional(port, self.port), + rooted=_optional(rooted, self.rooted), + userinfo=_optional(userinfo, self.userinfo), + uses_netloc=_optional(uses_netloc, self.uses_netloc), + ) + + @classmethod + def from_text(cls, text): + # type: (Text) -> URL + """Whereas the :class:`URL` constructor is useful for constructing + URLs from parts, :meth:`~URL.from_text` supports parsing whole + URLs from their string form:: + + >>> URL.from_text(u'http://example.com') + URL.from_text(u'http://example.com') + >>> URL.from_text(u'?a=b&x=y') + URL.from_text(u'?a=b&x=y') + + As you can see above, it's also used as the :func:`repr` of + :class:`URL` objects. The natural counterpart to + :func:`~URL.to_text()`. This method only accepts *text*, so be + sure to decode those bytestrings. + + Args: + text: A valid URL string. + + Returns: + URL: The structured object version of the parsed string. + + .. note:: + + Somewhat unexpectedly, URLs are a far more permissive + format than most would assume. Many strings which don't + look like URLs are still valid URLs. 
As a result, this + method only raises :class:`URLParseError` on invalid port + and IPv6 values in the host portion of the URL. + """ + um = _URL_RE.match(_textcheck("text", text)) + if um is None: + raise URLParseError("could not parse url: %r" % text) + gs = um.groupdict() + + au_text = gs["authority"] or u"" + au_m = _AUTHORITY_RE.match(au_text) + if au_m is None: + raise URLParseError( + "invalid authority %r in url: %r" % (au_text, text) + ) + au_gs = au_m.groupdict() + if au_gs["bad_host"]: + raise URLParseError( + "invalid host %r in url: %r" % (au_gs["bad_host"], text) + ) + + userinfo = au_gs["userinfo"] or u"" + + host = au_gs["ipv6_host"] or au_gs["plain_host"] + port = au_gs["port"] + if port is not None: + try: + port = int(port) # type: ignore[assignment] # FIXME, see below + except ValueError: + if not port: # TODO: excessive? + raise URLParseError("port must not be empty: %r" % au_text) + raise URLParseError("expected integer for port, not %r" % port) + + scheme = gs["scheme"] or u"" + fragment = gs["fragment"] or u"" + uses_netloc = bool(gs["_netloc_sep"]) + + if gs["path"]: + path = tuple(gs["path"].split(u"/")) + if not path[0]: + path = path[1:] + rooted = True + else: + rooted = False + else: + path = () + rooted = bool(au_text) + if gs["query"]: + query = tuple( + ( + qe.split(u"=", 1) # type: ignore[misc] + if u"=" in qe + else (qe, None) + ) + for qe in gs["query"].split(u"&") + ) # type: QueryPairs + else: + query = () + return cls( + scheme, + host, + path, + query, + fragment, + port, # type: ignore[arg-type] # FIXME, see above + rooted, + userinfo, + uses_netloc, + ) + + def normalize( + self, + scheme=True, + host=True, + path=True, + query=True, + fragment=True, + userinfo=True, + percents=True, + ): + # type: (bool, bool, bool, bool, bool, bool, bool) -> URL + """Return a new URL object with several standard normalizations + applied: + + * Decode unreserved characters (`RFC 3986 2.3`_) + * Uppercase remaining percent-encoded octets (`RFC 3986 2.1`_) + * Convert scheme and host casing to lowercase (`RFC 3986 3.2.2`_) + * Resolve any "." and ".." references in the path (`RFC 3986 6.2.2.3`_) + * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_) + * Encode any stray percent signs (`%`) in percent-encoded + fields (path, query, fragment, userinfo) (`RFC 3986 2.4`_) + + All are applied by default, but normalizations can be disabled + per-part by passing `False` for that part's corresponding + name. + + Args: + scheme: Convert the scheme to lowercase + host: Convert the host to lowercase + path: Normalize the path (see above for details) + query: Normalize the query string + fragment: Normalize the fragment + userinfo: Normalize the userinfo + percents: Encode isolated percent signs for any percent-encoded + fields which are being normalized (defaults to `True`). + + >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61%') + >>> print(url.normalize().to_text()) + http://example.com/b/c%2F?a%25 + + .. _RFC 3986 3.2.2: https://tools.ietf.org/html/rfc3986#section-3.2.2 + .. _RFC 3986 2.3: https://tools.ietf.org/html/rfc3986#section-2.3 + .. _RFC 3986 2.1: https://tools.ietf.org/html/rfc3986#section-2.1 + .. _RFC 3986 6.2.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.2.3 + .. _RFC 3986 6.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.3 + .. 
_RFC 3986 2.4: https://tools.ietf.org/html/rfc3986#section-2.4 + """ # noqa: E501 + kw = {} # type: Dict[str, Any] + if scheme: + kw["scheme"] = self.scheme.lower() + if host: + kw["host"] = self.host.lower() + + def _dec_unres(target): + # type: (Text) -> Text + return _decode_unreserved( + target, normalize_case=True, encode_stray_percents=percents + ) + + if path: + if self.path: + kw["path"] = [ + _dec_unres(p) for p in _resolve_dot_segments(self.path) + ] + else: + kw["path"] = (u"",) + if query: + kw["query"] = [ + (_dec_unres(k), _dec_unres(v) if v else v) + for k, v in self.query + ] + if fragment: + kw["fragment"] = _dec_unres(self.fragment) + if userinfo: + kw["userinfo"] = u":".join( + [_dec_unres(p) for p in self.userinfo.split(":", 1)] + ) + + return self.replace(**kw) + + def child(self, *segments): + # type: (Text) -> URL + """Make a new :class:`URL` where the given path segments are a child + of this URL, preserving other parts of the URL, including the + query string and fragment. + + For example:: + + >>> url = URL.from_text(u'http://localhost/a/b?x=y') + >>> child_url = url.child(u"c", u"d") + >>> child_url.to_text() + u'http://localhost/a/b/c/d?x=y' + + Args: + segments: Additional parts to be joined and added to the path, like + :func:`os.path.join`. Special characters in segments will be + percent encoded. + + Returns: + URL: A copy of the current URL with the extra path segments. + """ + if not segments: + return self + + segments = [ # type: ignore[assignment] # variable is tuple + _textcheck("path segment", s) for s in segments + ] + new_path = tuple(self.path) + if self.path and self.path[-1] == u"": + new_path = new_path[:-1] + new_path += tuple(_encode_path_parts(segments, maximal=False)) + return self.replace(path=new_path) + + def sibling(self, segment): + # type: (Text) -> URL + """Make a new :class:`URL` with a single path segment that is a + sibling of this URL path. + + Args: + segment: A single path segment. + + Returns: + URL: A copy of the current URL with the last path segment + replaced by *segment*. Special characters such as + ``/?#`` will be percent encoded. + """ + _textcheck("path segment", segment) + new_path = tuple(self.path)[:-1] + (_encode_path_part(segment),) + return self.replace(path=new_path) + + def click(self, href=u""): + # type: (Union[Text, URL]) -> URL + """Resolve the given URL relative to this URL. + + The resulting URI should match what a web browser would + generate if you visited the current URL and clicked on *href*. + + >>> url = URL.from_text(u'http://blog.hatnote.com/') + >>> url.click(u'/post/155074058790').to_text() + u'http://blog.hatnote.com/post/155074058790' + >>> url = URL.from_text(u'http://localhost/a/b/c/') + >>> url.click(u'../d/./e').to_text() + u'http://localhost/a/b/d/e' + + Args (Text): + href: A string representing a clicked URL. + + Return: + A copy of the current URL with navigation logic applied. + + For more information, see `RFC 3986 section 5`_. + + .. _RFC 3986 section 5: https://tools.ietf.org/html/rfc3986#section-5 + """ + if href: + if isinstance(href, URL): + clicked = href + else: + # TODO: This error message is not completely accurate, + # as URL objects are now also valid, but Twisted's + # test suite (wrongly) relies on this exact message. + _textcheck("relative URL", href) + clicked = URL.from_text(href) + if clicked.absolute: + return clicked + else: + clicked = self + + query = clicked.query + if clicked.scheme and not clicked.rooted: + # Schemes with relative paths are not well-defined. 
RFC 3986 calls + # them a "loophole in prior specifications" that should be avoided, + # or supported only for backwards compatibility. + raise NotImplementedError( + "absolute URI with rootless path: %r" % (href,) + ) + else: + if clicked.rooted: + path = clicked.path + elif clicked.path: + path = tuple(self.path)[:-1] + tuple(clicked.path) + else: + path = self.path + if not query: + query = self.query + return self.replace( + scheme=clicked.scheme or self.scheme, + host=clicked.host or self.host, + port=clicked.port or self.port, + path=_resolve_dot_segments(path), + query=query, + fragment=clicked.fragment, + ) + + def to_uri(self): + # type: () -> URL + u"""Make a new :class:`URL` instance with all non-ASCII characters + appropriately percent-encoded. This is useful to do in preparation + for sending a :class:`URL` over a network protocol. + + For example:: + + >>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri() + URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/') + + Returns: + URL: A new instance with its path segments, query parameters, and + hostname encoded, so that they are all in the standard + US-ASCII range. + """ + new_userinfo = u":".join( + [_encode_userinfo_part(p) for p in self.userinfo.split(":", 1)] + ) + new_path = _encode_path_parts( + self.path, has_scheme=bool(self.scheme), rooted=False, maximal=True + ) + new_host = ( + self.host + if not self.host + else idna_encode(self.host, uts46=True).decode("ascii") + ) + return self.replace( + userinfo=new_userinfo, + host=new_host, + path=new_path, + query=tuple( + [ + ( + _encode_query_key(k, maximal=True), + _encode_query_value(v, maximal=True) + if v is not None + else None, + ) + for k, v in self.query + ] + ), + fragment=_encode_fragment_part(self.fragment, maximal=True), + ) + + def to_iri(self): + # type: () -> URL + u"""Make a new :class:`URL` instance with all but a few reserved + characters decoded into human-readable format. + + Percent-encoded Unicode and IDNA-encoded hostnames are + decoded, like so:: + + >>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/') + >>> print(url.to_iri().to_text()) + https://ايران.example.com/foo⇧bar/ + + .. note:: + + As a general Python issue, "narrow" (UCS-2) builds of + Python may not be able to fully decode certain URLs, and + the in those cases, this method will return a best-effort, + partially-decoded, URL which is still valid. This issue + does not affect any Python builds 3.4+. + + Returns: + URL: A new instance with its path segments, query parameters, and + hostname decoded for display purposes. + """ # noqa: E501 + new_userinfo = u":".join( + [_decode_userinfo_part(p) for p in self.userinfo.split(":", 1)] + ) + host_text = _decode_host(self.host) + + return self.replace( + userinfo=new_userinfo, + host=host_text, + path=[_decode_path_part(segment) for segment in self.path], + query=tuple( + ( + _decode_query_key(k), + _decode_query_value(v) if v is not None else None, + ) + for k, v in self.query + ), + fragment=_decode_fragment_part(self.fragment), + ) + + def to_text(self, with_password=False): + # type: (bool) -> Text + """Render this URL to its textual representation. + + By default, the URL text will *not* include a password, if one + is set. RFC 3986 considers using URLs to represent such + sensitive information as deprecated. 
Quoting from RFC 3986, + `section 3.2.1`: + + "Applications should not render as clear text any data after the + first colon (":") character found within a userinfo subcomponent + unless the data after the colon is the empty string (indicating no + password)." + + Args (bool): + with_password: Whether or not to include the password in the URL + text. Defaults to False. + + Returns: + Text: The serialized textual representation of this URL, such as + ``u"http://example.com/some/path?some=query"``. + + The natural counterpart to :class:`URL.from_text()`. + + .. _section 3.2.1: https://tools.ietf.org/html/rfc3986#section-3.2.1 + """ + scheme = self.scheme + authority = self.authority(with_password) + path = "/".join( + _encode_path_parts( + self.path, + rooted=self.rooted, + has_scheme=bool(scheme), + has_authority=bool(authority), + maximal=False, + ) + ) + query_parts = [] + for k, v in self.query: + if v is None: + query_parts.append(_encode_query_key(k, maximal=False)) + else: + query_parts.append( + u"=".join( + ( + _encode_query_key(k, maximal=False), + _encode_query_value(v, maximal=False), + ) + ) + ) + query_string = u"&".join(query_parts) + + fragment = self.fragment + + parts = [] # type: List[Text] + _add = parts.append + if scheme: + _add(scheme) + _add(":") + if authority: + _add("//") + _add(authority) + elif scheme and path[:2] != "//" and self.uses_netloc: + _add("//") + if path: + if scheme and authority and path[:1] != "/": + _add("/") # relpaths with abs authorities auto get '/' + _add(path) + if query_string: + _add("?") + _add(query_string) + if fragment: + _add("#") + _add(fragment) + return u"".join(parts) + + def __repr__(self): + # type: () -> str + """Convert this URL to an representation that shows all of its + constituent parts, as well as being a valid argument to + :func:`eval`. + """ + return "%s.from_text(%r)" % (self.__class__.__name__, self.to_text()) + + def _to_bytes(self): + # type: () -> bytes + """ + Allows for direct usage of URL objects with libraries like + requests, which automatically stringify URL parameters. See + issue #49. + """ + return self.to_uri().to_text().encode("ascii") + + if PY2: + __str__ = _to_bytes + __unicode__ = to_text + else: + __bytes__ = _to_bytes + __str__ = to_text + + # # Begin Twisted Compat Code + asURI = to_uri + asIRI = to_iri + + @classmethod + def fromText(cls, s): + # type: (Text) -> URL + return cls.from_text(s) + + def asText(self, includeSecrets=False): + # type: (bool) -> Text + return self.to_text(with_password=includeSecrets) + + def __dir__(self): + # type: () -> Sequence[Text] + try: + ret = object.__dir__(self) + except AttributeError: + # object.__dir__ == AttributeError # pdw for py2 + ret = dir(self.__class__) + list(self.__dict__.keys()) + ret = sorted(set(ret) - set(["fromText", "asURI", "asIRI", "asText"])) + return ret + + # # End Twisted Compat Code + + def add(self, name, value=None): + # type: (Text, Optional[Text]) -> URL + """Make a new :class:`URL` instance with a given query argument, + *name*, added to it with the value *value*, like so:: + + >>> URL.from_text(u'https://example.com/?x=y').add(u'x') + URL.from_text(u'https://example.com/?x=y&x') + >>> URL.from_text(u'https://example.com/?x=y').add(u'x', u'z') + URL.from_text(u'https://example.com/?x=y&x=z') + + Args: + name: The name of the query parameter to add. + The part before the ``=``. + value: The value of the query parameter to add. + The part after the ``=``. + Defaults to ``None``, meaning no value. 
+ + Returns: + URL: A new :class:`URL` instance with the parameter added. + """ + return self.replace(query=self.query + ((name, value),)) + + def set(self, name, value=None): + # type: (Text, Optional[Text]) -> URL + """Make a new :class:`URL` instance with the query parameter *name* + set to *value*. All existing occurences, if any are replaced + by the single name-value pair. + + >>> URL.from_text(u'https://example.com/?x=y').set(u'x') + URL.from_text(u'https://example.com/?x') + >>> URL.from_text(u'https://example.com/?x=y').set(u'x', u'z') + URL.from_text(u'https://example.com/?x=z') + + Args: + name: The name of the query parameter to set. + The part before the ``=``. + value: The value of the query parameter to set. + The part after the ``=``. + Defaults to ``None``, meaning no value. + + Returns: + URL: A new :class:`URL` instance with the parameter set. + """ + # Preserve the original position of the query key in the list + q = [(k, v) for (k, v) in self.query if k != name] + idx = next( + (i for (i, (k, v)) in enumerate(self.query) if k == name), -1 + ) + q[idx:idx] = [(name, value)] + return self.replace(query=q) + + def get(self, name): + # type: (Text) -> List[Optional[Text]] + """Get a list of values for the given query parameter, *name*:: + + >>> url = URL.from_text(u'?x=1&x=2') + >>> url.get('x') + [u'1', u'2'] + >>> url.get('y') + [] + + If the given *name* is not set, an empty list is returned. A + list is always returned, and this method raises no exceptions. + + Args: + name: The name of the query parameter to get. + + Returns: + List[Optional[Text]]: A list of all the values associated with the + key, in string form. + """ + return [value for (key, value) in self.query if name == key] + + def remove( + self, + name, # type: Text + value=_UNSET, # type: Text + limit=None, # type: Optional[int] + ): + # type: (...) -> URL + """Make a new :class:`URL` instance with occurrences of the query + parameter *name* removed, or, if *value* is set, parameters + matching *name* and *value*. No exception is raised if the + parameter is not already set. + + Args: + name: The name of the query parameter to remove. + value: Optional value to additionally filter on. + Setting this removes query parameters which match both name + and value. + limit: Optional maximum number of parameters to remove. + + Returns: + URL: A new :class:`URL` instance with the parameter removed. + """ + if limit is None: + if value is _UNSET: + nq = [(k, v) for (k, v) in self.query if k != name] + else: + nq = [ + (k, v) + for (k, v) in self.query + if not (k == name and v == value) + ] + else: + nq, removed_count = [], 0 + + for k, v in self.query: + if ( + k == name + and (value is _UNSET or v == value) + and removed_count < limit + ): + removed_count += 1 # drop it + else: + nq.append((k, v)) # keep it + + return self.replace(query=nq) + + +EncodedURL = URL # An alias better describing what the URL really is + +_EMPTY_URL = URL() + + +def _replace_plus(text): + # type: (Text) -> Text + return text.replace("+", "%20") + + +def _no_op(text): + # type: (Text) -> Text + return text + + +class DecodedURL(object): + """ + :class:`DecodedURL` is a type designed to act as a higher-level + interface to :class:`URL` and the recommended type for most + operations. By analogy, :class:`DecodedURL` is the + :class:`unicode` to URL's :class:`bytes`. + + :class:`DecodedURL` automatically handles encoding and decoding + all its components, such that all inputs and outputs are in a + maximally-decoded state. 
Note that this means, for some special + cases, a URL may not "roundtrip" character-for-character, but this + is considered a good tradeoff for the safety of automatic + encoding. + + Otherwise, :class:`DecodedURL` has almost exactly the same API as + :class:`URL`. + + Where applicable, a UTF-8 encoding is presumed. Be advised that + some interactions can raise :exc:`UnicodeEncodeErrors` and + :exc:`UnicodeDecodeErrors`, just like when working with + bytestrings. Examples of such interactions include handling query + strings encoding binary data, and paths containing segments with + special characters encoded with codecs other than UTF-8. + + Args: + url: A :class:`URL` object to wrap. + lazy: Set to True to avoid pre-decode all parts of the URL to check for + validity. + Defaults to False. + query_plus_is_space: + characters in the query string should be treated + as spaces when decoding. If unspecified, the default is taken from + the scheme. + + .. note:: + + The :class:`DecodedURL` initializer takes a :class:`URL` object, + not URL components, like :class:`URL`. To programmatically + construct a :class:`DecodedURL`, you can use this pattern: + + >>> print(DecodedURL().replace(scheme=u'https', + ... host=u'pypi.org', path=(u'projects', u'hyperlink')).to_text()) + https://pypi.org/projects/hyperlink + + .. versionadded:: 18.0.0 + """ + + def __init__(self, url=_EMPTY_URL, lazy=False, query_plus_is_space=None): + # type: (URL, bool, Optional[bool]) -> None + self._url = url + if query_plus_is_space is None: + query_plus_is_space = url.scheme not in NO_QUERY_PLUS_SCHEMES + self._query_plus_is_space = query_plus_is_space + if not lazy: + # cache the following, while triggering any decoding + # issues with decodable fields + self.host, self.userinfo, self.path, self.query, self.fragment + return + + @classmethod + def from_text(cls, text, lazy=False, query_plus_is_space=None): + # type: (Text, bool, Optional[bool]) -> DecodedURL + """\ + Make a `DecodedURL` instance from any text string containing a URL. + + Args: + text: Text containing the URL + lazy: Whether to pre-decode all parts of the URL to check for + validity. + Defaults to True. + """ + _url = URL.from_text(text) + return cls(_url, lazy=lazy, query_plus_is_space=query_plus_is_space) + + @property + def encoded_url(self): + # type: () -> URL + """Access the underlying :class:`URL` object, which has any special + characters encoded. + """ + return self._url + + def to_text(self, with_password=False): + # type: (bool) -> Text + "Passthrough to :meth:`~hyperlink.URL.to_text()`" + return self._url.to_text(with_password) + + def to_uri(self): + # type: () -> URL + "Passthrough to :meth:`~hyperlink.URL.to_uri()`" + return self._url.to_uri() + + def to_iri(self): + # type: () -> URL + "Passthrough to :meth:`~hyperlink.URL.to_iri()`" + return self._url.to_iri() + + def _clone(self, url): + # type: (URL) -> DecodedURL + return self.__class__( + url, + # TODO: propagate laziness? 
+ query_plus_is_space=self._query_plus_is_space, + ) + + def click(self, href=u""): + # type: (Union[Text, URL, DecodedURL]) -> DecodedURL + """Return a new DecodedURL wrapping the result of + :meth:`~hyperlink.URL.click()` + """ + if isinstance(href, DecodedURL): + href = href._url + return self._clone( + self._url.click(href=href), + ) + + def sibling(self, segment): + # type: (Text) -> DecodedURL + """Automatically encode any reserved characters in *segment* and + return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.sibling()` + """ + return self._clone( + self._url.sibling(_encode_reserved(segment)), + ) + + def child(self, *segments): + # type: (Text) -> DecodedURL + """Automatically encode any reserved characters in *segments* and + return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.child()`. + """ + if not segments: + return self + new_segs = [_encode_reserved(s) for s in segments] + return self._clone(self._url.child(*new_segs)) + + def normalize( + self, + scheme=True, + host=True, + path=True, + query=True, + fragment=True, + userinfo=True, + percents=True, + ): + # type: (bool, bool, bool, bool, bool, bool, bool) -> DecodedURL + """Return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.normalize()` + """ + return self._clone( + self._url.normalize( + scheme, host, path, query, fragment, userinfo, percents + ) + ) + + @property + def absolute(self): + # type: () -> bool + return self._url.absolute + + @property + def scheme(self): + # type: () -> Text + return self._url.scheme + + @property + def host(self): + # type: () -> Text + return _decode_host(self._url.host) + + @property + def port(self): + # type: () -> Optional[int] + return self._url.port + + @property + def rooted(self): + # type: () -> bool + return self._url.rooted + + @property + def path(self): + # type: () -> Sequence[Text] + if not hasattr(self, "_path"): + self._path = tuple( + [ + _percent_decode(p, raise_subencoding_exc=True) + for p in self._url.path + ] + ) + return self._path + + @property + def query(self): + # type: () -> QueryPairs + if not hasattr(self, "_query"): + if self._query_plus_is_space: + predecode = _replace_plus + else: + predecode = _no_op + + self._query = cast( + QueryPairs, + tuple( + tuple( + _percent_decode( + predecode(x), raise_subencoding_exc=True + ) + if x is not None + else None + for x in (k, v) + ) + for k, v in self._url.query + ), + ) + return self._query + + @property + def fragment(self): + # type: () -> Text + if not hasattr(self, "_fragment"): + frag = self._url.fragment + self._fragment = _percent_decode(frag, raise_subencoding_exc=True) + return self._fragment + + @property + def userinfo(self): + # type: () -> Union[Tuple[str], Tuple[str, str]] + if not hasattr(self, "_userinfo"): + self._userinfo = cast( + Union[Tuple[str], Tuple[str, str]], + tuple( + tuple( + _percent_decode(p, raise_subencoding_exc=True) + for p in self._url.userinfo.split(":", 1) + ) + ), + ) + return self._userinfo + + @property + def user(self): + # type: () -> Text + return self.userinfo[0] + + @property + def uses_netloc(self): + # type: () -> Optional[bool] + return self._url.uses_netloc + + def replace( + self, + scheme=_UNSET, # type: Optional[Text] + host=_UNSET, # type: Optional[Text] + path=_UNSET, # type: Iterable[Text] + query=_UNSET, # type: QueryParameters + fragment=_UNSET, # type: Text + port=_UNSET, # type: Optional[int] + rooted=_UNSET, # type: Optional[bool] + userinfo=_UNSET, # type: Union[Tuple[str], Tuple[str, str]] 
+ uses_netloc=_UNSET, # type: Optional[bool] + ): + # type: (...) -> DecodedURL + """While the signature is the same, this `replace()` differs a little + from URL.replace. For instance, it accepts userinfo as a + tuple, not as a string, handling the case of having a username + containing a `:`. As with the rest of the methods on + DecodedURL, if you pass a reserved character, it will be + automatically encoded instead of an error being raised. + """ + if path is not _UNSET: + path = tuple(_encode_reserved(p) for p in path) + if query is not _UNSET: + query = cast( + QueryPairs, + tuple( + tuple( + _encode_reserved(x) if x is not None else None + for x in (k, v) + ) + for k, v in iter_pairs(query) + ), + ) + if userinfo is not _UNSET: + if len(userinfo) > 2: + raise ValueError( + 'userinfo expected sequence of ["user"] or' + ' ["user", "password"], got %r' % (userinfo,) + ) + userinfo_text = u":".join([_encode_reserved(p) for p in userinfo]) + else: + userinfo_text = _UNSET + new_url = self._url.replace( + scheme=scheme, + host=host, + path=path, + query=query, + fragment=fragment, + port=port, + rooted=rooted, + userinfo=userinfo_text, + uses_netloc=uses_netloc, + ) + return self._clone(url=new_url) + + def get(self, name): + # type: (Text) -> List[Optional[Text]] + "Get the value of all query parameters whose name matches *name*" + return [v for (k, v) in self.query if name == k] + + def add(self, name, value=None): + # type: (Text, Optional[Text]) -> DecodedURL + """Return a new DecodedURL with the query parameter *name* and *value* + added.""" + return self.replace(query=self.query + ((name, value),)) + + def set(self, name, value=None): + # type: (Text, Optional[Text]) -> DecodedURL + "Return a new DecodedURL with query parameter *name* set to *value*" + query = self.query + q = [(k, v) for (k, v) in query if k != name] + idx = next((i for (i, (k, v)) in enumerate(query) if k == name), -1) + q[idx:idx] = [(name, value)] + return self.replace(query=q) + + def remove( + self, + name, # type: Text + value=_UNSET, # type: Text + limit=None, # type: Optional[int] + ): + # type: (...) -> DecodedURL + """Return a new DecodedURL with query parameter *name* removed. + + Optionally also filter for *value*, as well as cap the number + of parameters removed with *limit*. 
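        A quick sketch of the behavior (the URL below is made up purely
        for illustration)::

            >>> u = DecodedURL.from_text(u'https://example.com/?a=1&b=2&a=3')
            >>> print(u.remove(u'a').to_text())
            https://example.com/?b=2
            >>> print(u.remove(u'a', limit=1).to_text())
            https://example.com/?b=2&a=3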
+ """ + if limit is None: + if value is _UNSET: + nq = [(k, v) for (k, v) in self.query if k != name] + else: + nq = [ + (k, v) + for (k, v) in self.query + if not (k == name and v == value) + ] + else: + nq, removed_count = [], 0 + for k, v in self.query: + if ( + k == name + and (value is _UNSET or v == value) + and removed_count < limit + ): + removed_count += 1 # drop it + else: + nq.append((k, v)) # keep it + + return self.replace(query=nq) + + def __repr__(self): + # type: () -> str + cn = self.__class__.__name__ + return "%s(url=%r)" % (cn, self._url) + + def __str__(self): + # type: () -> str + # TODO: the underlying URL's __str__ needs to change to make + # this work as the URL, see #55 + return str(self._url) + + def __eq__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return self.normalize().to_uri() == other.normalize().to_uri() + + def __ne__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return not self.__eq__(other) + + def __hash__(self): + # type: () -> int + return hash( + ( + self.__class__, + self.scheme, + self.userinfo, + self.host, + self.path, + self.query, + self.fragment, + self.port, + self.rooted, + self.uses_netloc, + ) + ) + + # # Begin Twisted Compat Code + asURI = to_uri + asIRI = to_iri + + @classmethod + def fromText(cls, s, lazy=False): + # type: (Text, bool) -> DecodedURL + return cls.from_text(s, lazy=lazy) + + def asText(self, includeSecrets=False): + # type: (bool) -> Text + return self.to_text(with_password=includeSecrets) + + def __dir__(self): + # type: () -> Sequence[Text] + try: + ret = object.__dir__(self) + except AttributeError: + # object.__dir__ == AttributeError # pdw for py2 + ret = dir(self.__class__) + list(self.__dict__.keys()) + ret = sorted(set(ret) - set(["fromText", "asURI", "asIRI", "asText"])) + return ret + + # # End Twisted Compat Code + + +def parse(url, decoded=True, lazy=False): + # type: (Text, bool, bool) -> Union[URL, DecodedURL] + """ + Automatically turn text into a structured URL object. + + >>> url = parse(u"https://github.com/python-hyper/hyperlink") + >>> print(url.to_text()) + https://github.com/python-hyper/hyperlink + + Args: + url: A text string representation of a URL. + + decoded: Whether or not to return a :class:`DecodedURL`, + which automatically handles all + encoding/decoding/quoting/unquoting for all the various + accessors of parts of the URL, or a :class:`URL`, + which has the same API, but requires handling of special + characters for different parts of the URL. + + lazy: In the case of `decoded=True`, this controls + whether the URL is decoded immediately or as accessed. The + default, `lazy=False`, checks all encoded parts of the URL + for decodability. + + .. versionadded:: 18.0.0 + """ + enc_url = EncodedURL.from_text(url) + if not decoded: + return enc_url + dec_url = DecodedURL(enc_url, lazy=lazy) + return dec_url diff --git a/contrib/python/hyperlink/py2/hyperlink/hypothesis.py b/contrib/python/hyperlink/py2/hyperlink/hypothesis.py new file mode 100644 index 0000000000..45fd9a9956 --- /dev/null +++ b/contrib/python/hyperlink/py2/hyperlink/hypothesis.py @@ -0,0 +1,324 @@ +# -*- coding: utf-8 -*- +""" +Hypothesis strategies. +""" +from __future__ import absolute_import + +try: + import hypothesis + + del hypothesis +except ImportError: + from typing import Tuple + + __all__ = () # type: Tuple[str, ...] 
+else: + import io + import pkgutil + from csv import reader as csv_reader + from os.path import dirname, join + from string import ascii_letters, digits + from sys import maxunicode + from typing import ( + Callable, + Iterable, + List, + Optional, + Sequence, + Text, + TypeVar, + cast, + ) + from gzip import open as open_gzip + + from . import DecodedURL, EncodedURL + + from hypothesis import assume + from hypothesis.strategies import ( + composite, + integers, + lists, + sampled_from, + text, + ) + + from idna import IDNAError, check_label, encode as idna_encode + + __all__ = ( + "decoded_urls", + "encoded_urls", + "hostname_labels", + "hostnames", + "idna_text", + "paths", + "port_numbers", + ) + + T = TypeVar("T") + DrawCallable = Callable[[Callable[..., T]], T] + + try: + unichr + except NameError: # Py3 + unichr = chr # type: Callable[[int], Text] + + def idna_characters(): + # type: () -> Text + """ + Returns a string containing IDNA characters. + """ + global _idnaCharacters + + if not _idnaCharacters: + result = [] + + # Data source "IDNA Derived Properties": + # https://www.iana.org/assignments/idna-tables-6.3.0/ + # idna-tables-6.3.0.xhtml#idna-tables-properties + dataFileName = join( + dirname(__file__), "idna-tables-properties.csv.gz" + ) + data = io.BytesIO(pkgutil.get_data(__name__, "idna-tables-properties.csv.gz")) + with open_gzip(data) as dataFile: + reader = csv_reader( + (line.decode("utf-8") for line in dataFile), + delimiter=",", + ) + next(reader) # Skip header row + for row in reader: + codes, prop, description = row + + if prop != "PVALID": + # CONTEXTO or CONTEXTJ are also allowed, but they come + # with rules, so we're punting on those here. + # See: https://tools.ietf.org/html/rfc5892 + continue + + startEnd = row[0].split("-", 1) + if len(startEnd) == 1: + # No end of range given; use start + startEnd.append(startEnd[0]) + start, end = (int(i, 16) for i in startEnd) + + for i in range(start, end + 1): + if i > maxunicode: # Happens using Py2 on Windows + break + result.append(unichr(i)) + + _idnaCharacters = u"".join(result) + + return _idnaCharacters + + _idnaCharacters = "" # type: Text + + @composite + def idna_text(draw, min_size=1, max_size=None): + # type: (DrawCallable, int, Optional[int]) -> Text + """ + A strategy which generates IDNA-encodable text. + + @param min_size: The minimum number of characters in the text. + C{None} is treated as C{0}. + + @param max_size: The maximum number of characters in the text. + Use C{None} for an unbounded size. + """ + alphabet = idna_characters() + + assert min_size >= 1 + + if max_size is not None: + assert max_size >= 1 + + result = cast( + Text, + draw(text(min_size=min_size, max_size=max_size, alphabet=alphabet)), + ) + + # FIXME: There should be a more efficient way to ensure we produce + # valid IDNA text. + try: + idna_encode(result) + except IDNAError: + assume(False) + + return result + + @composite + def port_numbers(draw, allow_zero=False): + # type: (DrawCallable, bool) -> int + """ + A strategy which generates port numbers. + + @param allow_zero: Whether to allow port C{0} as a possible value. + """ + if allow_zero: + min_value = 0 + else: + min_value = 1 + + return cast(int, draw(integers(min_value=min_value, max_value=65535))) + + @composite + def hostname_labels(draw, allow_idn=True): + # type: (DrawCallable, bool) -> Text + """ + A strategy which generates host name labels. + + @param allow_idn: Whether to allow non-ASCII characters as allowed by + internationalized domain names (IDNs). 
+ """ + if allow_idn: + label = cast(Text, draw(idna_text(min_size=1, max_size=63))) + + try: + label.encode("ascii") + except UnicodeEncodeError: + # If the label doesn't encode to ASCII, then we need to check + # the length of the label after encoding to punycode and adding + # the xn-- prefix. + while len(label.encode("punycode")) > 63 - len("xn--"): + # Rather than bombing out, just trim from the end until it + # is short enough, so hypothesis doesn't have to generate + # new data. + label = label[:-1] + + else: + label = cast( + Text, + draw( + text( + min_size=1, + max_size=63, + alphabet=Text(ascii_letters + digits + u"-"), + ) + ), + ) + + # Filter invalid labels. + # It would be better to reliably avoid generation of bogus labels in + # the first place, but it's hard... + try: + check_label(label) + except UnicodeError: # pragma: no cover (not always drawn) + assume(False) + + return label + + @composite + def hostnames(draw, allow_leading_digit=True, allow_idn=True): + # type: (DrawCallable, bool, bool) -> Text + """ + A strategy which generates host names. + + @param allow_leading_digit: Whether to allow a leading digit in host + names; they were not allowed prior to RFC 1123. + + @param allow_idn: Whether to allow non-ASCII characters as allowed by + internationalized domain names (IDNs). + """ + # Draw first label, filtering out labels with leading digits if needed + labels = [ + cast( + Text, + draw( + hostname_labels(allow_idn=allow_idn).filter( + lambda l: ( + True if allow_leading_digit else l[0] not in digits + ) + ) + ), + ) + ] + # Draw remaining labels + labels += cast( + List[Text], + draw( + lists( + hostname_labels(allow_idn=allow_idn), + min_size=1, + max_size=4, + ) + ), + ) + + # Trim off labels until the total host name length fits in 252 + # characters. This avoids having to filter the data. + while sum(len(label) for label in labels) + len(labels) - 1 > 252: + labels = labels[:-1] + + return u".".join(labels) + + def path_characters(): + # type: () -> str + """ + Returns a string containing valid URL path characters. + """ + global _path_characters + + if _path_characters is None: + + def chars(): + # type: () -> Iterable[Text] + for i in range(maxunicode): + c = unichr(i) + + # Exclude reserved characters + if c in "#/?": + continue + + # Exclude anything not UTF-8 compatible + try: + c.encode("utf-8") + except UnicodeEncodeError: + continue + + yield c + + _path_characters = "".join(chars()) + + return _path_characters + + _path_characters = None # type: Optional[str] + + @composite + def paths(draw): + # type: (DrawCallable) -> Sequence[Text] + return cast( + List[Text], + draw( + lists(text(min_size=1, alphabet=path_characters()), max_size=10) + ), + ) + + @composite + def encoded_urls(draw): + # type: (DrawCallable) -> EncodedURL + """ + A strategy which generates L{EncodedURL}s. + Call the L{EncodedURL.to_uri} method on each URL to get an HTTP + protocol-friendly URI. + """ + port = cast(Optional[int], draw(port_numbers(allow_zero=True))) + host = cast(Text, draw(hostnames())) + path = cast(Sequence[Text], draw(paths())) + + if port == 0: + port = None + + return EncodedURL( + scheme=cast(Text, draw(sampled_from((u"http", u"https")))), + host=host, + port=port, + path=path, + ) + + @composite + def decoded_urls(draw): + # type: (DrawCallable) -> DecodedURL + """ + A strategy which generates L{DecodedURL}s. + Call the L{EncodedURL.to_uri} method on each URL to get an HTTP + protocol-friendly URI. 
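        A rough sketch of how this strategy might be used in a test (the
        test name and assertion here are hypothetical)::

            from hypothesis import given

            @given(decoded_urls())
            def test_uri_keeps_scheme(url):
                # to_uri() should preserve the drawn scheme
                assert url.to_uri().to_text().startswith(url.scheme)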
+ """ + return DecodedURL(draw(encoded_urls())) diff --git a/contrib/python/hyperlink/py2/hyperlink/idna-tables-properties.csv.gz b/contrib/python/hyperlink/py2/hyperlink/idna-tables-properties.csv.gz Binary files differnew file mode 100644 index 0000000000..48e9f06742 --- /dev/null +++ b/contrib/python/hyperlink/py2/hyperlink/idna-tables-properties.csv.gz diff --git a/contrib/python/hyperlink/py2/hyperlink/py.typed b/contrib/python/hyperlink/py2/hyperlink/py.typed new file mode 100644 index 0000000000..d2dfd5e491 --- /dev/null +++ b/contrib/python/hyperlink/py2/hyperlink/py.typed @@ -0,0 +1 @@ +# See: https://www.python.org/dev/peps/pep-0561/ diff --git a/contrib/python/hyperlink/py2/ya.make b/contrib/python/hyperlink/py2/ya.make new file mode 100644 index 0000000000..5611a958d8 --- /dev/null +++ b/contrib/python/hyperlink/py2/ya.make @@ -0,0 +1,36 @@ +# Generated by devtools/yamaker (pypi). + +PY2_LIBRARY() + +VERSION(21.0.0) + +LICENSE(MIT) + +PEERDIR( + contrib/deprecated/python/typing + contrib/python/idna +) + +NO_LINT() + +PY_SRCS( + TOP_LEVEL + hyperlink/__init__.py + hyperlink/_socket.py + hyperlink/_url.py + hyperlink/hypothesis.py +) + +RESOURCE_FILES( + PREFIX contrib/python/hyperlink/py2/ + .dist-info/METADATA + .dist-info/top_level.txt + hyperlink/idna-tables-properties.csv.gz + hyperlink/py.typed +) + +END() + +RECURSE_FOR_TESTS( + tests +) diff --git a/contrib/python/hyperlink/py3/.dist-info/METADATA b/contrib/python/hyperlink/py3/.dist-info/METADATA new file mode 100644 index 0000000000..fc5922ba87 --- /dev/null +++ b/contrib/python/hyperlink/py3/.dist-info/METADATA @@ -0,0 +1,38 @@ +Metadata-Version: 2.1 +Name: hyperlink +Version: 21.0.0 +Summary: A featureful, immutable, and correct URL for Python. +Home-page: https://github.com/python-hyper/hyperlink +Author: Mahmoud Hashemi and Glyph Lefkowitz +Author-email: mahmoud@hatnote.com +License: MIT +Platform: any +Classifier: Topic :: Utilities +Classifier: Intended Audience :: Developers +Classifier: Topic :: Software Development :: Libraries +Classifier: Development Status :: 5 - Production/Stable +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.6 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.4 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: Implementation :: PyPy +Classifier: License :: OSI Approved :: MIT License +Requires-Python: >=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.* +Requires-Dist: idna (>=2.5) +Requires-Dist: typing ; python_version < "3.5" + +The humble, but powerful, URL runs everything around us. Chances +are you've used several just to read this text. + +Hyperlink is a featureful, pure-Python implementation of the URL, with +an emphasis on correctness. MIT licensed. + +See the docs at http://hyperlink.readthedocs.io. 
+ + diff --git a/contrib/python/hyperlink/py3/.dist-info/top_level.txt b/contrib/python/hyperlink/py3/.dist-info/top_level.txt new file mode 100644 index 0000000000..81722ce1d8 --- /dev/null +++ b/contrib/python/hyperlink/py3/.dist-info/top_level.txt @@ -0,0 +1 @@ +hyperlink diff --git a/contrib/python/hyperlink/py3/LICENSE b/contrib/python/hyperlink/py3/LICENSE new file mode 100644 index 0000000000..a73f882ffb --- /dev/null +++ b/contrib/python/hyperlink/py3/LICENSE @@ -0,0 +1,29 @@ +Copyright (c) 2017 +Glyph Lefkowitz +Itamar Turner-Trauring +Jean Paul Calderone +Adi Roiban +Amber Hawkie Brown +Mahmoud Hashemi +Wilfredo Sanchez Vega + +and others that have contributed code to the public domain. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/contrib/python/hyperlink/py3/README.md b/contrib/python/hyperlink/py3/README.md new file mode 100644 index 0000000000..017f9eb88c --- /dev/null +++ b/contrib/python/hyperlink/py3/README.md @@ -0,0 +1,67 @@ +# Hyperlink + +*Cool URLs that don't change.* + +<a href="https://hyperlink.readthedocs.io/en/latest/"> + <img src="https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat" alt="Documentation"> +</a> +<a href="https://pypi.org/project/hyperlink/"> + <img src="https://img.shields.io/pypi/v/hyperlink.svg" alt="PyPI"> +</a> +<a href="http://calver.org"> + <img src="https://img.shields.io/badge/calver-YY.MINOR.MICRO-22bfda.svg" alt="Calendar Versioning"> +</a> +<a href="https://pypi.org/project/hyperlink/"> + <img src="https://img.shields.io/pypi/pyversions/hyperlink.svg" alt="Python Version Compatibility"> +</a> +<a href="https://https://codecov.io/github/python-hyper/hyperlink?branch=master"> + <img src="https://codecov.io/github/python-hyper/hyperlink/coverage.svg?branch=master" alt="Code Coverage"> +</a> +<a href="https://requires.io/github/python-hyper/hyperlink/requirements/?branch=master"> + <img src="https://requires.io/github/python-hyper/hyperlink/requirements.svg?branch=master" alt="Requirements Status"> +</a> + +Hyperlink provides a pure-Python implementation of immutable +URLs. Based on [RFC 3986][rfc3986] and [3987][rfc3987], the Hyperlink URL +makes working with both URIs and IRIs easy. + +Hyperlink is tested against Python 2.7, 3.4, 3.5, 3.6, 3.7, 3.8, and PyPy. + +Full documentation is available on [Read the Docs][docs]. 
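Because the `URL` type models both URIs and IRIs, converting between the
two is a single method call. A small sketch (example address made up):

```python
from hyperlink import URL

iri = URL.from_text(u'http://example.com/café')
print(iri.to_uri().to_text())           # http://example.com/caf%C3%A9
print(iri.to_uri().to_iri().to_text())  # back to the readable IRI form
```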
+ +[rfc3986]: https://tools.ietf.org/html/rfc3986 +[rfc3987]: https://tools.ietf.org/html/rfc3987 +[docs]: http://hyperlink.readthedocs.io/en/latest/ + +## Installation + +Hyperlink is a pure-Python package and requires nothing but +Python. The easiest way to install is with pip: + +``` +pip install hyperlink +``` + +Then, hyperlink away! + +```python +from hyperlink import URL + +url = URL.from_text(u'http://github.com/python-hyper/hyperlink?utm_source=README') +utm_source = url.get(u'utm_source') +better_url = url.replace(scheme=u'https', port=443) +org_url = better_url.click(u'.') +``` + +See the full API docs on [Read the Docs][docs]. + +## More information + +Hyperlink would not have been possible without the help of +[Glyph Lefkowitz](https://glyph.twistedmatrix.com/) and many other +community members, especially considering that it started as an +extract from the Twisted networking library. Thanks to them, +Hyperlink's URL has been production-grade for well over a decade. + +Still, should you encounter any issues, do file an issue, or submit a +pull request. diff --git a/contrib/python/hyperlink/py3/hyperlink/__init__.py b/contrib/python/hyperlink/py3/hyperlink/__init__.py new file mode 100644 index 0000000000..f680b01a90 --- /dev/null +++ b/contrib/python/hyperlink/py3/hyperlink/__init__.py @@ -0,0 +1,17 @@ +from ._url import ( + parse, + register_scheme, + URL, + EncodedURL, + DecodedURL, + URLParseError, +) + +__all__ = ( + "parse", + "register_scheme", + "URL", + "EncodedURL", + "DecodedURL", + "URLParseError", +) diff --git a/contrib/python/hyperlink/py3/hyperlink/_socket.py b/contrib/python/hyperlink/py3/hyperlink/_socket.py new file mode 100644 index 0000000000..3bcf89706d --- /dev/null +++ b/contrib/python/hyperlink/py3/hyperlink/_socket.py @@ -0,0 +1,53 @@ +try: + from socket import inet_pton +except ImportError: + from typing import TYPE_CHECKING + + if TYPE_CHECKING: # pragma: no cover + pass + else: + # based on https://gist.github.com/nnemkin/4966028 + # this code only applies on Windows Python 2.7 + import ctypes + import socket + + class SockAddr(ctypes.Structure): + _fields_ = [ + ("sa_family", ctypes.c_short), + ("__pad1", ctypes.c_ushort), + ("ipv4_addr", ctypes.c_byte * 4), + ("ipv6_addr", ctypes.c_byte * 16), + ("__pad2", ctypes.c_ulong), + ] + + WSAStringToAddressA = ctypes.windll.ws2_32.WSAStringToAddressA + WSAAddressToStringA = ctypes.windll.ws2_32.WSAAddressToStringA + + def inet_pton(address_family, ip_string): + # type: (int, str) -> bytes + addr = SockAddr() + ip_string_bytes = ip_string.encode("ascii") + addr.sa_family = address_family + addr_size = ctypes.c_int(ctypes.sizeof(addr)) + + try: + attribute, size = { + socket.AF_INET: ("ipv4_addr", 4), + socket.AF_INET6: ("ipv6_addr", 16), + }[address_family] + except KeyError: + raise socket.error("unknown address family") + + if ( + WSAStringToAddressA( + ip_string_bytes, + address_family, + None, + ctypes.byref(addr), + ctypes.byref(addr_size), + ) + != 0 + ): + raise socket.error(ctypes.FormatError()) + + return ctypes.string_at(getattr(addr, attribute), size) diff --git a/contrib/python/hyperlink/py3/hyperlink/_url.py b/contrib/python/hyperlink/py3/hyperlink/_url.py new file mode 100644 index 0000000000..be69baf696 --- /dev/null +++ b/contrib/python/hyperlink/py3/hyperlink/_url.py @@ -0,0 +1,2448 @@ +# -*- coding: utf-8 -*- +u"""Hyperlink provides Pythonic URL parsing, construction, and rendering. 
+ +Usage is straightforward:: + + >>> import hyperlink + >>> url = hyperlink.parse(u'http://github.com/mahmoud/hyperlink?utm_source=docs') + >>> url.host + u'github.com' + >>> secure_url = url.replace(scheme=u'https') + >>> secure_url.get('utm_source')[0] + u'docs' + +Hyperlink's API centers on the :class:`DecodedURL` type, which wraps +the lower-level :class:`URL`, both of which can be returned by the +:func:`parse()` convenience function. + +""" # noqa: E501 + +import re +import sys +import string +import socket +from socket import AF_INET, AF_INET6 + +try: + from socket import AddressFamily +except ImportError: + AddressFamily = int # type: ignore[assignment,misc] +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Text, + Tuple, + Type, + TypeVar, + Union, + cast, +) +from unicodedata import normalize +from ._socket import inet_pton + +try: + from collections.abc import Mapping as MappingABC +except ImportError: # Python 2 + from collections import Mapping as MappingABC + +from idna import encode as idna_encode, decode as idna_decode + + +PY2 = sys.version_info[0] == 2 +try: + unichr +except NameError: # Py3 + unichr = chr # type: Callable[[int], Text] +NoneType = type(None) # type: Type[None] +QueryPairs = Tuple[Tuple[Text, Optional[Text]], ...] # internal representation +QueryParameters = Union[ + Mapping[Text, Optional[Text]], + QueryPairs, + Sequence[Tuple[Text, Optional[Text]]], +] +T = TypeVar("T") + + +# from boltons.typeutils +def make_sentinel(name="_MISSING", var_name=""): + # type: (str, str) -> object + """Creates and returns a new **instance** of a new class, suitable for + usage as a "sentinel", a kind of singleton often used to indicate + a value is missing when ``None`` is a valid input. + + Args: + name: Name of the Sentinel + var_name: Set this name to the name of the variable in its respective + module enable pickle-ability. + + >>> make_sentinel(var_name='_MISSING') + _MISSING + + The most common use cases here in boltons are as default values + for optional function arguments, partly because of its + less-confusing appearance in automatically generated + documentation. Sentinels also function well as placeholders in queues + and linked lists. + + .. note:: + + By design, additional calls to ``make_sentinel`` with the same + values will not produce equivalent objects. + + >>> make_sentinel('TEST') == make_sentinel('TEST') + False + >>> type(make_sentinel('TEST')) == type(make_sentinel('TEST')) + False + """ + + class Sentinel(object): + def __init__(self): + # type: () -> None + self.name = name + self.var_name = var_name + + def __repr__(self): + # type: () -> str + if self.var_name: + return self.var_name + return "%s(%r)" % (self.__class__.__name__, self.name) + + if var_name: + # superclass type hints don't allow str return type, but it is + # allowed in the docs, hence the ignore[override] below + def __reduce__(self): + # type: () -> str + return self.var_name + + def __nonzero__(self): + # type: () -> bool + return False + + __bool__ = __nonzero__ + + return Sentinel() + + +_unspecified = _UNSET = make_sentinel("_UNSET") # type: Any + + +# RFC 3986 Section 2.3, Unreserved URI Characters +# https://tools.ietf.org/html/rfc3986#section-2.3 +_UNRESERVED_CHARS = frozenset( + "~-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" +) + + +# URL parsing regex (based on RFC 3986 Appendix B, with modifications) +_URL_RE = re.compile( + r"^((?P<scheme>[^:/?#]+):)?" 
+ r"((?P<_netloc_sep>//)" + r"(?P<authority>[^/?#]*))?" + r"(?P<path>[^?#]*)" + r"(\?(?P<query>[^#]*))?" + r"(#(?P<fragment>.*))?$" +) +_SCHEME_RE = re.compile(r"^[a-zA-Z0-9+-.]*$") +_AUTHORITY_RE = re.compile( + r"^(?:(?P<userinfo>[^@/?#]*)@)?" + r"(?P<host>" + r"(?:\[(?P<ipv6_host>[^[\]/?#]*)\])" + r"|(?P<plain_host>[^:/?#[\]]*)" + r"|(?P<bad_host>.*?))?" + r"(?::(?P<port>.*))?$" +) + + +_HEX_CHAR_MAP = dict( + [ + ((a + b).encode("ascii"), unichr(int(a + b, 16)).encode("charmap")) + for a in string.hexdigits + for b in string.hexdigits + ] +) +_ASCII_RE = re.compile("([\x00-\x7f]+)") + +# RFC 3986 section 2.2, Reserved Characters +# https://tools.ietf.org/html/rfc3986#section-2.2 +_GEN_DELIMS = frozenset(u":/?#[]@") +_SUB_DELIMS = frozenset(u"!$&'()*+,;=") +_ALL_DELIMS = _GEN_DELIMS | _SUB_DELIMS + +_USERINFO_SAFE = _UNRESERVED_CHARS | _SUB_DELIMS | set(u"%") +_USERINFO_DELIMS = _ALL_DELIMS - _USERINFO_SAFE +_PATH_SAFE = _USERINFO_SAFE | set(u":@") +_PATH_DELIMS = _ALL_DELIMS - _PATH_SAFE +_SCHEMELESS_PATH_SAFE = _PATH_SAFE - set(":") +_SCHEMELESS_PATH_DELIMS = _ALL_DELIMS - _SCHEMELESS_PATH_SAFE +_FRAGMENT_SAFE = _UNRESERVED_CHARS | _PATH_SAFE | set(u"/?") +_FRAGMENT_DELIMS = _ALL_DELIMS - _FRAGMENT_SAFE +_QUERY_VALUE_SAFE = _UNRESERVED_CHARS | _FRAGMENT_SAFE - set(u"&") +_QUERY_VALUE_DELIMS = _ALL_DELIMS - _QUERY_VALUE_SAFE +_QUERY_KEY_SAFE = _UNRESERVED_CHARS | _QUERY_VALUE_SAFE - set(u"=") +_QUERY_KEY_DELIMS = _ALL_DELIMS - _QUERY_KEY_SAFE + + +def _make_decode_map(delims, allow_percent=False): + # type: (Iterable[Text], bool) -> Mapping[bytes, bytes] + ret = dict(_HEX_CHAR_MAP) + if not allow_percent: + delims = set(delims) | set([u"%"]) + for delim in delims: + _hexord = "{0:02X}".format(ord(delim)).encode("ascii") + _hexord_lower = _hexord.lower() + ret.pop(_hexord) + if _hexord != _hexord_lower: + ret.pop(_hexord_lower) + return ret + + +def _make_quote_map(safe_chars): + # type: (Iterable[Text]) -> Mapping[Union[int, Text], Text] + ret = {} # type: Dict[Union[int, Text], Text] + # v is included in the dict for py3 mostly, because bytestrings + # are iterables of ints, of course! + for i, v in zip(range(256), range(256)): + c = chr(v) + if c in safe_chars: + ret[c] = ret[v] = c + else: + ret[c] = ret[v] = "%{0:02X}".format(i) + return ret + + +_USERINFO_PART_QUOTE_MAP = _make_quote_map(_USERINFO_SAFE) +_USERINFO_DECODE_MAP = _make_decode_map(_USERINFO_DELIMS) +_PATH_PART_QUOTE_MAP = _make_quote_map(_PATH_SAFE) +_SCHEMELESS_PATH_PART_QUOTE_MAP = _make_quote_map(_SCHEMELESS_PATH_SAFE) +_PATH_DECODE_MAP = _make_decode_map(_PATH_DELIMS) +_QUERY_KEY_QUOTE_MAP = _make_quote_map(_QUERY_KEY_SAFE) +_QUERY_KEY_DECODE_MAP = _make_decode_map(_QUERY_KEY_DELIMS) +_QUERY_VALUE_QUOTE_MAP = _make_quote_map(_QUERY_VALUE_SAFE) +_QUERY_VALUE_DECODE_MAP = _make_decode_map(_QUERY_VALUE_DELIMS) +_FRAGMENT_QUOTE_MAP = _make_quote_map(_FRAGMENT_SAFE) +_FRAGMENT_DECODE_MAP = _make_decode_map(_FRAGMENT_DELIMS) +_UNRESERVED_QUOTE_MAP = _make_quote_map(_UNRESERVED_CHARS) +_UNRESERVED_DECODE_MAP = dict( + [ + (k, v) + for k, v in _HEX_CHAR_MAP.items() + if v.decode("ascii", "replace") in _UNRESERVED_CHARS + ] +) + +_ROOT_PATHS = frozenset(((), (u"",))) + + +def _encode_reserved(text, maximal=True): + # type: (Text, bool) -> Text + """A very comprehensive percent encoding for encoding all + delimiters. Used for arguments to DecodedURL, where a % means a + percent sign, and not the character used by URLs for escaping + bytes. 
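    A small illustration (input chosen arbitrarily)::

        >>> print(_encode_reserved(u'a/b c'))
        a%2Fb%20c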
+ """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_UNRESERVED_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _UNRESERVED_QUOTE_MAP[t] if t in _UNRESERVED_CHARS else t + for t in text + ] + ) + + +def _encode_path_part(text, maximal=True): + # type: (Text, bool) -> Text + "Percent-encode a single segment of a URL path." + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_PATH_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_PATH_PART_QUOTE_MAP[t] if t in _PATH_DELIMS else t for t in text] + ) + + +def _encode_schemeless_path_part(text, maximal=True): + # type: (Text, bool) -> Text + """Percent-encode the first segment of a URL path for a URL without a + scheme specified. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_SCHEMELESS_PATH_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _SCHEMELESS_PATH_PART_QUOTE_MAP[t] + if t in _SCHEMELESS_PATH_DELIMS + else t + for t in text + ] + ) + + +def _encode_path_parts( + text_parts, # type: Sequence[Text] + rooted=False, # type: bool + has_scheme=True, # type: bool + has_authority=True, # type: bool + maximal=True, # type: bool +): + # type: (...) -> Sequence[Text] + """ + Percent-encode a tuple of path parts into a complete path. + + Setting *maximal* to False percent-encodes only the reserved + characters that are syntactically necessary for serialization, + preserving any IRI-style textual data. + + Leaving *maximal* set to its default True percent-encodes + everything required to convert a portion of an IRI to a portion of + a URI. + + RFC 3986 3.3: + + If a URI contains an authority component, then the path component + must either be empty or begin with a slash ("/") character. If a URI + does not contain an authority component, then the path cannot begin + with two slash characters ("//"). In addition, a URI reference + (Section 4.1) may be a relative-path reference, in which case the + first path segment cannot contain a colon (":") character. + """ + if not text_parts: + return () + if rooted: + text_parts = (u"",) + tuple(text_parts) + # elif has_authority and text_parts: + # raise Exception('see rfc above') # TODO: too late to fail like this? + encoded_parts = [] # type: List[Text] + if has_scheme: + encoded_parts = [ + _encode_path_part(part, maximal=maximal) if part else part + for part in text_parts + ] + else: + encoded_parts = [_encode_schemeless_path_part(text_parts[0])] + encoded_parts.extend( + [ + _encode_path_part(part, maximal=maximal) if part else part + for part in text_parts[1:] + ] + ) + return tuple(encoded_parts) + + +def _encode_query_key(text, maximal=True): + # type: (Text, bool) -> Text + """ + Percent-encode a single query string key or value. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_QUERY_KEY_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_QUERY_KEY_QUOTE_MAP[t] if t in _QUERY_KEY_DELIMS else t for t in text] + ) + + +def _encode_query_value(text, maximal=True): + # type: (Text, bool) -> Text + """ + Percent-encode a single query string key or value. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_QUERY_VALUE_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _QUERY_VALUE_QUOTE_MAP[t] if t in _QUERY_VALUE_DELIMS else t + for t in text + ] + ) + + +def _encode_fragment_part(text, maximal=True): + # type: (Text, bool) -> Text + """Quote the fragment part of the URL. 
Fragments don't have + subdelimiters, so the whole URL fragment can be passed. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_FRAGMENT_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_FRAGMENT_QUOTE_MAP[t] if t in _FRAGMENT_DELIMS else t for t in text] + ) + + +def _encode_userinfo_part(text, maximal=True): + # type: (Text, bool) -> Text + """Quote special characters in either the username or password + section of the URL. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_USERINFO_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _USERINFO_PART_QUOTE_MAP[t] if t in _USERINFO_DELIMS else t + for t in text + ] + ) + + +# This port list painstakingly curated by hand searching through +# https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml +# and +# https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml +SCHEME_PORT_MAP = { + "acap": 674, + "afp": 548, + "dict": 2628, + "dns": 53, + "file": None, + "ftp": 21, + "git": 9418, + "gopher": 70, + "http": 80, + "https": 443, + "imap": 143, + "ipp": 631, + "ipps": 631, + "irc": 194, + "ircs": 6697, + "ldap": 389, + "ldaps": 636, + "mms": 1755, + "msrp": 2855, + "msrps": None, + "mtqp": 1038, + "nfs": 111, + "nntp": 119, + "nntps": 563, + "pop": 110, + "prospero": 1525, + "redis": 6379, + "rsync": 873, + "rtsp": 554, + "rtsps": 322, + "rtspu": 5005, + "sftp": 22, + "smb": 445, + "snmp": 161, + "ssh": 22, + "steam": None, + "svn": 3690, + "telnet": 23, + "ventrilo": 3784, + "vnc": 5900, + "wais": 210, + "ws": 80, + "wss": 443, + "xmpp": None, +} + +# This list of schemes that don't use authorities is also from the link above. +NO_NETLOC_SCHEMES = set( + [ + "urn", + "about", + "bitcoin", + "blob", + "data", + "geo", + "magnet", + "mailto", + "news", + "pkcs11", + "sip", + "sips", + "tel", + ] +) +# As of Mar 11, 2017, there were 44 netloc schemes, and 13 non-netloc + +NO_QUERY_PLUS_SCHEMES = set() + + +def register_scheme( + text, uses_netloc=True, default_port=None, query_plus_is_space=True +): + # type: (Text, bool, Optional[int], bool) -> None + """Registers new scheme information, resulting in correct port and + slash behavior from the URL object. There are dozens of standard + schemes preregistered, so this function is mostly meant for + proprietary internal customizations or stopgaps on missing + standards information. If a scheme seems to be missing, please + `file an issue`_! + + Args: + text: A string representation of the scheme. + (the 'http' in 'http://hatnote.com') + uses_netloc: Does the scheme support specifying a + network host? For instance, "http" does, "mailto" does + not. Defaults to True. + default_port: The default port, if any, for + netloc-using schemes. + query_plus_is_space: If true, a "+" in the query string should be + decoded as a space by DecodedURL. + + .. 
_file an issue: https://github.com/mahmoud/hyperlink/issues + """ + text = text.lower() + if default_port is not None: + try: + default_port = int(default_port) + except (ValueError, TypeError): + raise ValueError( + "default_port expected integer or None, not %r" + % (default_port,) + ) + + if uses_netloc is True: + SCHEME_PORT_MAP[text] = default_port + elif uses_netloc is False: + if default_port is not None: + raise ValueError( + "unexpected default port while specifying" + " non-netloc scheme: %r" % default_port + ) + NO_NETLOC_SCHEMES.add(text) + else: + raise ValueError("uses_netloc expected bool, not: %r" % uses_netloc) + + if not query_plus_is_space: + NO_QUERY_PLUS_SCHEMES.add(text) + + return + + +def scheme_uses_netloc(scheme, default=None): + # type: (Text, Optional[bool]) -> Optional[bool] + """Whether or not a URL uses :code:`:` or :code:`://` to separate the + scheme from the rest of the URL depends on the scheme's own + standard definition. There is no way to infer this behavior + from other parts of the URL. A scheme either supports network + locations or it does not. + + The URL type's approach to this is to check for explicitly + registered schemes, with common schemes like HTTP + preregistered. This is the same approach taken by + :mod:`urlparse`. + + URL adds two additional heuristics if the scheme as a whole is + not registered. First, it attempts to check the subpart of the + scheme after the last ``+`` character. This adds intuitive + behavior for schemes like ``git+ssh``. Second, if a URL with + an unrecognized scheme is loaded, it will maintain the + separator it sees. + """ + if not scheme: + return False + scheme = scheme.lower() + if scheme in SCHEME_PORT_MAP: + return True + if scheme in NO_NETLOC_SCHEMES: + return False + if scheme.split("+")[-1] in SCHEME_PORT_MAP: + return True + return default + + +class URLParseError(ValueError): + """Exception inheriting from :exc:`ValueError`, raised when failing to + parse a URL. Mostly raised on invalid ports and IPv6 addresses. + """ + + pass + + +def _optional(argument, default): + # type: (Any, Any) -> Any + if argument is _UNSET: + return default + else: + return argument + + +def _typecheck(name, value, *types): + # type: (Text, T, Type[Any]) -> T + """ + Check that the given *value* is one of the given *types*, or raise an + exception describing the problem using *name*. + """ + if not types: + raise ValueError("expected one or more types, maybe use _textcheck?") + if not isinstance(value, types): + raise TypeError( + "expected %s for %s, got %r" + % (" or ".join([t.__name__ for t in types]), name, value) + ) + return value + + +def _textcheck(name, value, delims=frozenset(), nullable=False): + # type: (Text, T, Iterable[Text], bool) -> T + if not isinstance(value, Text): + if nullable and value is None: + # used by query string values + return value # type: ignore[unreachable] + else: + str_name = "unicode" if PY2 else "str" + exp = str_name + " or NoneType" if nullable else str_name + raise TypeError("expected %s for %s, got %r" % (exp, name, value)) + if delims and set(value) & set(delims): # TODO: test caching into regexes + raise ValueError( + "one or more reserved delimiters %s present in %s: %r" + % ("".join(delims), name, value) + ) + return value # type: ignore[return-value] # T vs. Text + + +def iter_pairs(iterable): + # type: (Iterable[Any]) -> Iterator[Any] + """ + Iterate over the (key, value) pairs in ``iterable``. 
+ + This handles dictionaries sensibly, and falls back to assuming the + iterable yields (key, value) pairs. This behaviour is similar to + what Python's ``dict()`` constructor does. + """ + if isinstance(iterable, MappingABC): + iterable = iterable.items() + return iter(iterable) + + +def _decode_unreserved(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_UNRESERVED_DECODE_MAP, + ) + + +def _decode_userinfo_part( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_USERINFO_DECODE_MAP, + ) + + +def _decode_path_part(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + """ + >>> _decode_path_part(u'%61%77%2f%7a') + u'aw%2fz' + >>> _decode_path_part(u'%61%77%2f%7a', normalize_case=True) + u'aw%2Fz' + """ + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_PATH_DECODE_MAP, + ) + + +def _decode_query_key(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_QUERY_KEY_DECODE_MAP, + ) + + +def _decode_query_value( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_QUERY_VALUE_DECODE_MAP, + ) + + +def _decode_fragment_part( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_FRAGMENT_DECODE_MAP, + ) + + +def _percent_decode( + text, # type: Text + normalize_case=False, # type: bool + subencoding="utf-8", # type: Text + raise_subencoding_exc=False, # type: bool + encode_stray_percents=False, # type: bool + _decode_map=_HEX_CHAR_MAP, # type: Mapping[bytes, bytes] +): + # type: (...) -> Text + """Convert percent-encoded text characters to their normal, + human-readable equivalents. + + All characters in the input text must be encodable by + *subencoding*. All special characters underlying the values in the + percent-encoding must be decodable as *subencoding*. If a + non-*subencoding*-valid string is passed, the original text is + returned with no changes applied. + + Only called by field-tailored variants, e.g., + :func:`_decode_path_part`, as every percent-encodable part of the + URL has characters which should not be percent decoded. + + >>> _percent_decode(u'abc%20def') + u'abc def' + + Args: + text: Text with percent-encoding present. + normalize_case: Whether undecoded percent segments, such as encoded + delimiters, should be uppercased, per RFC 3986 Section 2.1. + See :func:`_decode_path_part` for an example. + subencoding: The name of the encoding underlying the percent-encoding. + raise_subencoding_exc: Whether an error in decoding the bytes + underlying the percent-decoding should be raised. + + Returns: + Text: The percent-decoded version of *text*, decoded by *subencoding*. 
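    One more illustration, this time of the stray-percent handling (input
    chosen arbitrarily)::

        >>> print(_percent_decode(u'100%25 sure%', encode_stray_percents=True))
        100% sure%25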
+ """ + try: + quoted_bytes = text.encode(subencoding) + except UnicodeEncodeError: + return text + + bits = quoted_bytes.split(b"%") + if len(bits) == 1: + return text + + res = [bits[0]] + append = res.append + + for item in bits[1:]: + hexpair, rest = item[:2], item[2:] + try: + append(_decode_map[hexpair]) + append(rest) + except KeyError: + pair_is_hex = hexpair in _HEX_CHAR_MAP + if pair_is_hex or not encode_stray_percents: + append(b"%") + else: + # if it's undecodable, treat as a real percent sign, + # which is reserved (because it wasn't in the + # context-aware _decode_map passed in), and should + # stay in an encoded state. + append(b"%25") + if normalize_case and pair_is_hex: + append(hexpair.upper()) + append(rest) + else: + append(item) + + unquoted_bytes = b"".join(res) + + try: + return unquoted_bytes.decode(subencoding) + except UnicodeDecodeError: + if raise_subencoding_exc: + raise + return text + + +def _decode_host(host): + # type: (Text) -> Text + """Decode a host from ASCII-encodable text to IDNA-decoded text. If + the host text is not ASCII, it is returned unchanged, as it is + presumed that it is already IDNA-decoded. + + Some technical details: _decode_host is built on top of the "idna" + package, which has some quirks: + + Capital letters are not valid IDNA2008. The idna package will + raise an exception like this on capital letters: + + > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed + + However, if a segment of a host (i.e., something in + url.host.split('.')) is already ASCII, idna doesn't perform its + usual checks. In fact, for capital letters it automatically + lowercases them. + + This check and some other functionality can be bypassed by passing + uts46=True to idna.encode/decode. This allows a more permissive and + convenient interface. So far it seems like the balanced approach. + + Example output (from idna==2.6): + + >> idna.encode(u'mahmöud.io') + 'xn--mahmud-zxa.io' + >> idna.encode(u'Mahmöud.io') + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode + result.append(alabel(label)) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel + check_label(label) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label + raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label))) + idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed + >> idna.encode(u'Mahmoud.io') + 'Mahmoud.io' + + # Similar behavior for decodes below + >> idna.decode(u'Mahmoud.io') + u'mahmoud.io + >> idna.decode(u'Méhmoud.io', uts46=True) + u'm\xe9hmoud.io' + """ # noqa: E501 + if not host: + return u"" + try: + host_bytes = host.encode("ascii") + except UnicodeEncodeError: + host_text = host + else: + try: + host_text = idna_decode(host_bytes, uts46=True) + except ValueError: + # only reached on "narrow" (UCS-2) Python builds <3.4, see #7 + # NOTE: not going to raise here, because there's no + # ambiguity in the IDNA, and the host is still + # technically usable + host_text = host + return host_text + + +def _resolve_dot_segments(path): + # type: (Sequence[Text]) -> Sequence[Text] + """Normalize the URL path by resolving segments of '.' and '..'. For + more details, see `RFC 3986 section 5.2.4, Remove Dot Segments`_. 
+ + Args: + path: sequence of path segments in text form + + Returns: + A new sequence of path segments with the '.' and '..' elements removed + and resolved. + + .. _RFC 3986 section 5.2.4, Remove Dot Segments: https://tools.ietf.org/html/rfc3986#section-5.2.4 + """ # noqa: E501 + segs = [] # type: List[Text] + + for seg in path: + if seg == u".": + pass + elif seg == u"..": + if segs: + segs.pop() + else: + segs.append(seg) + + if list(path[-1:]) in ([u"."], [u".."]): + segs.append(u"") + + return segs + + +def parse_host(host): + # type: (Text) -> Tuple[Optional[AddressFamily], Text] + """Parse the host into a tuple of ``(family, host)``, where family + is the appropriate :mod:`socket` module constant when the host is + an IP address. Family is ``None`` when the host is not an IP. + + Will raise :class:`URLParseError` on invalid IPv6 constants. + + Returns: + family (socket constant or None), host (string) + + >>> import socket + >>> parse_host('googlewebsite.com') == (None, 'googlewebsite.com') + True + >>> parse_host('::1') == (socket.AF_INET6, '::1') + True + >>> parse_host('192.168.1.1') == (socket.AF_INET, '192.168.1.1') + True + """ + if not host: + return None, u"" + + if u":" in host: + try: + inet_pton(AF_INET6, host) + except socket.error as se: + raise URLParseError("invalid IPv6 host: %r (%r)" % (host, se)) + except UnicodeEncodeError: + pass # TODO: this can't be a real host right? + else: + family = AF_INET6 # type: Optional[AddressFamily] + else: + try: + inet_pton(AF_INET, host) + except (socket.error, UnicodeEncodeError): + family = None # not an IP + else: + family = AF_INET + + return family, host + + +class URL(object): + r"""From blogs to billboards, URLs are so common, that it's easy to + overlook their complexity and power. With hyperlink's + :class:`URL` type, working with URLs doesn't have to be hard. + + URLs are made of many parts. Most of these parts are officially + named in `RFC 3986`_ and this diagram may prove handy in identifying + them:: + + foo://user:pass@example.com:8042/over/there?name=ferret#nose + \_/ \_______/ \_________/ \__/\_________/ \_________/ \__/ + | | | | | | | + scheme userinfo host port path query fragment + + While :meth:`~URL.from_text` is used for parsing whole URLs, the + :class:`URL` constructor builds a URL from the individual + components, like so:: + + >>> from hyperlink import URL + >>> url = URL(scheme=u'https', host=u'example.com', path=[u'hello', u'world']) + >>> print(url.to_text()) + https://example.com/hello/world + + The constructor runs basic type checks. All strings are expected + to be text (:class:`str` in Python 3, :class:`unicode` in Python 2). All + arguments are optional, defaulting to appropriately empty values. A full + list of constructor arguments is below. + + Args: + scheme: The text name of the scheme. + host: The host portion of the network location + port: The port part of the network location. If ``None`` or no port is + passed, the port will default to the default port of the scheme, if + it is known. See the ``SCHEME_PORT_MAP`` and + :func:`register_default_port` for more info. + path: A tuple of strings representing the slash-separated parts of the + path, each percent-encoded. + query: The query parameters, as a dictionary or as an sequence of + percent-encoded key-value pairs. + fragment: The fragment part of the URL. + rooted: A rooted URL is one which indicates an absolute path. + This is True on any URL that includes a host, or any relative URL + that starts with a slash. 
+ userinfo: The username or colon-separated username:password pair. + uses_netloc: Indicates whether ``://`` (the "netloc separator") will + appear to separate the scheme from the *path* in cases where no + host is present. + Setting this to ``True`` is a non-spec-compliant affordance for the + common practice of having URIs that are *not* URLs (cannot have a + 'host' part) but nevertheless use the common ``://`` idiom that + most people associate with URLs; e.g. ``message:`` URIs like + ``message://message-id`` being equivalent to ``message:message-id``. + This may be inferred based on the scheme depending on whether + :func:`register_scheme` has been used to register the scheme and + should not be passed directly unless you know the scheme works like + this and you know it has not been registered. + + All of these parts are also exposed as read-only attributes of :class:`URL` + instances, along with several useful methods. + + .. _RFC 3986: https://tools.ietf.org/html/rfc3986 + .. _RFC 3987: https://tools.ietf.org/html/rfc3987 + """ # noqa: E501 + + def __init__( + self, + scheme=None, # type: Optional[Text] + host=None, # type: Optional[Text] + path=(), # type: Iterable[Text] + query=(), # type: QueryParameters + fragment=u"", # type: Text + port=None, # type: Optional[int] + rooted=None, # type: Optional[bool] + userinfo=u"", # type: Text + uses_netloc=None, # type: Optional[bool] + ): + # type: (...) -> None + if host is not None and scheme is None: + scheme = u"http" # TODO: why + if port is None and scheme is not None: + port = SCHEME_PORT_MAP.get(scheme) + if host and query and not path: + # per RFC 3986 6.2.3, "a URI that uses the generic syntax + # for authority with an empty path should be normalized to + # a path of '/'." + path = (u"",) + + # Now that we're done detecting whether they were passed, we can set + # them to their defaults: + if scheme is None: + scheme = u"" + if host is None: + host = u"" + if rooted is None: + rooted = bool(host) + + # Set attributes. + self._scheme = _textcheck("scheme", scheme) + if self._scheme: + if not _SCHEME_RE.match(self._scheme): + raise ValueError( + 'invalid scheme: %r. Only alphanumeric, "+",' + ' "-", and "." allowed. Did you meant to call' + " %s.from_text()?" % (self._scheme, self.__class__.__name__) + ) + + _, self._host = parse_host(_textcheck("host", host, "/?#@")) + if isinstance(path, Text): + raise TypeError( + "expected iterable of text for path, not: %r" % (path,) + ) + self._path = tuple( + (_textcheck("path segment", segment, "/?#") for segment in path) + ) + self._query = tuple( + ( + _textcheck("query parameter name", k, "&=#"), + _textcheck("query parameter value", v, "&#", nullable=True), + ) + for k, v in iter_pairs(query) + ) + self._fragment = _textcheck("fragment", fragment) + self._port = _typecheck("port", port, int, NoneType) + self._rooted = _typecheck("rooted", rooted, bool) + self._userinfo = _textcheck("userinfo", userinfo, "/?#@") + + if uses_netloc is None: + uses_netloc = scheme_uses_netloc(self._scheme, uses_netloc) + self._uses_netloc = _typecheck( + "uses_netloc", uses_netloc, bool, NoneType + ) + will_have_authority = self._host or ( + self._port and self._port != SCHEME_PORT_MAP.get(scheme) + ) + if will_have_authority: + # fixup for rooted consistency; if there's any 'authority' + # represented in the textual URL, then the path must be rooted, and + # we're definitely using a netloc (there must be a ://). 
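+            # For example (illustrative): URL(host=u'example.com',
+            # path=(u'a', u'b')) renders as u'http://example.com/a/b', i.e.
+            # with '//' and a rooted path, rather than gluing the first path
+            # segment onto the host.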
+ self._rooted = True + self._uses_netloc = True + if (not self._rooted) and self.path[:1] == (u"",): + self._rooted = True + self._path = self._path[1:] + if not will_have_authority and self._path and not self._rooted: + # If, after fixing up the path, there *is* a path and it *isn't* + # rooted, then we are definitely not using a netloc; if we did, it + # would make the path (erroneously) look like a hostname. + self._uses_netloc = False + + def get_decoded_url(self, lazy=False): + # type: (bool) -> DecodedURL + try: + return self._decoded_url + except AttributeError: + self._decoded_url = DecodedURL(self, lazy=lazy) # type: DecodedURL + return self._decoded_url + + @property + def scheme(self): + # type: () -> Text + """The scheme is a string, and the first part of an absolute URL, the + part before the first colon, and the part which defines the + semantics of the rest of the URL. Examples include "http", + "https", "ssh", "file", "mailto", and many others. See + :func:`~hyperlink.register_scheme()` for more info. + """ + return self._scheme + + @property + def host(self): + # type: () -> Text + """The host is a string, and the second standard part of an absolute + URL. When present, a valid host must be a domain name, or an + IP (v4 or v6). It occurs before the first slash, or the second + colon, if a :attr:`~hyperlink.URL.port` is provided. + """ + return self._host + + @property + def port(self): + # type: () -> Optional[int] + """The port is an integer that is commonly used in connecting to the + :attr:`host`, and almost never appears without it. + + When not present in the original URL, this attribute defaults + to the scheme's default port. If the scheme's default port is + not known, and the port is not provided, this attribute will + be set to None. + + >>> URL.from_text(u'http://example.com/pa/th').port + 80 + >>> URL.from_text(u'foo://example.com/pa/th').port + >>> URL.from_text(u'foo://example.com:8042/pa/th').port + 8042 + + .. note:: + + Per the standard, when the port is the same as the schemes + default port, it will be omitted in the text URL. + """ + return self._port + + @property + def path(self): + # type: () -> Sequence[Text] + """A tuple of strings, created by splitting the slash-separated + hierarchical path. Started by the first slash after the host, + terminated by a "?", which indicates the start of the + :attr:`~hyperlink.URL.query` string. + """ + return self._path + + @property + def query(self): + # type: () -> QueryPairs + """Tuple of pairs, created by splitting the ampersand-separated + mapping of keys and optional values representing + non-hierarchical data used to identify the resource. Keys are + always strings. Values are strings when present, or None when + missing. + + For more operations on the mapping, see + :meth:`~hyperlink.URL.get()`, :meth:`~hyperlink.URL.add()`, + :meth:`~hyperlink.URL.set()`, and + :meth:`~hyperlink.URL.delete()`. + """ + return self._query + + @property + def fragment(self): + # type: () -> Text + """A string, the last part of the URL, indicated by the first "#" + after the :attr:`~hyperlink.URL.path` or + :attr:`~hyperlink.URL.query`. Enables indirect identification + of a secondary resource, like an anchor within an HTML page. + """ + return self._fragment + + @property + def rooted(self): + # type: () -> bool + """Whether or not the path starts with a forward slash (``/``). 
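+
+        For example (illustrative):
+
+        >>> URL.from_text(u'http://example.com/a').rooted
+        True
+        >>> URL.from_text(u'a/b').rooted
+        False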
+ + This is taken from the terminology in the BNF grammar, + specifically the "path-rootless", rule, since "absolute path" + and "absolute URI" are somewhat ambiguous. :attr:`path` does + not contain the implicit prefixed ``"/"`` since that is + somewhat awkward to work with. + """ + return self._rooted + + @property + def userinfo(self): + # type: () -> Text + """The colon-separated string forming the username-password + combination. + """ + return self._userinfo + + @property + def uses_netloc(self): + # type: () -> Optional[bool] + """ + Indicates whether ``://`` (the "netloc separator") will appear to + separate the scheme from the *path* in cases where no host is present. + """ + return self._uses_netloc + + @property + def user(self): + # type: () -> Text + """ + The user portion of :attr:`~hyperlink.URL.userinfo`. + """ + return self.userinfo.split(u":")[0] + + def authority(self, with_password=False, **kw): + # type: (bool, Any) -> Text + """Compute and return the appropriate host/port/userinfo combination. + + >>> url = URL.from_text(u'http://user:pass@localhost:8080/a/b?x=y') + >>> url.authority() + u'user:@localhost:8080' + >>> url.authority(with_password=True) + u'user:pass@localhost:8080' + + Args: + with_password: Whether the return value of this method include the + password in the URL, if it is set. + Defaults to False. + + Returns: + Text: The authority (network location and user information) portion + of the URL. + """ + # first, a bit of twisted compat + with_password = kw.pop("includeSecrets", with_password) + if kw: + raise TypeError("got unexpected keyword arguments: %r" % kw.keys()) + host = self.host + if ":" in host: + hostport = ["[" + host + "]"] + else: + hostport = [self.host] + if self.port != SCHEME_PORT_MAP.get(self.scheme): + hostport.append(Text(self.port)) + authority = [] + if self.userinfo: + userinfo = self.userinfo + if not with_password and u":" in userinfo: + userinfo = userinfo[: userinfo.index(u":") + 1] + authority.append(userinfo) + authority.append(u":".join(hostport)) + return u"@".join(authority) + + def __eq__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + for attr in [ + "scheme", + "userinfo", + "host", + "query", + "fragment", + "port", + "uses_netloc", + "rooted", + ]: + if getattr(self, attr) != getattr(other, attr): + return False + if self.path == other.path or ( + self.path in _ROOT_PATHS and other.path in _ROOT_PATHS + ): + return True + return False + + def __ne__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return not self.__eq__(other) + + def __hash__(self): + # type: () -> int + return hash( + ( + self.__class__, + self.scheme, + self.userinfo, + self.host, + self.path, + self.query, + self.fragment, + self.port, + self.rooted, + self.uses_netloc, + ) + ) + + @property + def absolute(self): + # type: () -> bool + """Whether or not the URL is "absolute". Absolute URLs are complete + enough to resolve to a network resource without being relative + to a base URI. + + >>> URL.from_text(u'http://wikipedia.org/').absolute + True + >>> URL.from_text(u'?a=b&c=d').absolute + False + + Absolute URLs must have both a scheme and a host set. 
+ """ + return bool(self.scheme and self.host) + + def replace( + self, + scheme=_UNSET, # type: Optional[Text] + host=_UNSET, # type: Optional[Text] + path=_UNSET, # type: Iterable[Text] + query=_UNSET, # type: QueryParameters + fragment=_UNSET, # type: Text + port=_UNSET, # type: Optional[int] + rooted=_UNSET, # type: Optional[bool] + userinfo=_UNSET, # type: Text + uses_netloc=_UNSET, # type: Optional[bool] + ): + # type: (...) -> URL + """:class:`URL` objects are immutable, which means that attributes + are designed to be set only once, at construction. Instead of + modifying an existing URL, one simply creates a copy with the + desired changes. + + If any of the following arguments is omitted, it defaults to + the value on the current URL. + + Args: + scheme: The text name of the scheme. + host: The host portion of the network location. + path: A tuple of strings representing the slash-separated parts of + the path. + query: The query parameters, as a dictionary or as an sequence of + key-value pairs. + fragment: The fragment part of the URL. + port: The port part of the network location. + rooted: Whether or not the path begins with a slash. + userinfo: The username or colon-separated username:password pair. + uses_netloc: Indicates whether ``://`` (the "netloc separator") + will appear to separate the scheme from the *path* in cases + where no host is present. + Setting this to ``True`` is a non-spec-compliant affordance for + the common practice of having URIs that are *not* URLs (cannot + have a 'host' part) but nevertheless use the common ``://`` + idiom that most people associate with URLs; e.g. ``message:`` + URIs like ``message://message-id`` being equivalent to + ``message:message-id``. + This may be inferred based on the scheme depending on whether + :func:`register_scheme` has been used to register the scheme + and should not be passed directly unless you know the scheme + works like this and you know it has not been registered. + + Returns: + URL: A copy of the current :class:`URL`, with new values for + parameters passed. + """ + if scheme is not _UNSET and scheme != self.scheme: + # when changing schemes, reset the explicit uses_netloc preference + # to honor the new scheme. + uses_netloc = None + return self.__class__( + scheme=_optional(scheme, self.scheme), + host=_optional(host, self.host), + path=_optional(path, self.path), + query=_optional(query, self.query), + fragment=_optional(fragment, self.fragment), + port=_optional(port, self.port), + rooted=_optional(rooted, self.rooted), + userinfo=_optional(userinfo, self.userinfo), + uses_netloc=_optional(uses_netloc, self.uses_netloc), + ) + + @classmethod + def from_text(cls, text): + # type: (Text) -> URL + """Whereas the :class:`URL` constructor is useful for constructing + URLs from parts, :meth:`~URL.from_text` supports parsing whole + URLs from their string form:: + + >>> URL.from_text(u'http://example.com') + URL.from_text(u'http://example.com') + >>> URL.from_text(u'?a=b&x=y') + URL.from_text(u'?a=b&x=y') + + As you can see above, it's also used as the :func:`repr` of + :class:`URL` objects. The natural counterpart to + :func:`~URL.to_text()`. This method only accepts *text*, so be + sure to decode those bytestrings. + + Args: + text: A valid URL string. + + Returns: + URL: The structured object version of the parsed string. + + .. note:: + + Somewhat unexpectedly, URLs are a far more permissive + format than most would assume. Many strings which don't + look like URLs are still valid URLs. 
As a result, this + method only raises :class:`URLParseError` on invalid port + and IPv6 values in the host portion of the URL. + """ + um = _URL_RE.match(_textcheck("text", text)) + if um is None: + raise URLParseError("could not parse url: %r" % text) + gs = um.groupdict() + + au_text = gs["authority"] or u"" + au_m = _AUTHORITY_RE.match(au_text) + if au_m is None: + raise URLParseError( + "invalid authority %r in url: %r" % (au_text, text) + ) + au_gs = au_m.groupdict() + if au_gs["bad_host"]: + raise URLParseError( + "invalid host %r in url: %r" % (au_gs["bad_host"], text) + ) + + userinfo = au_gs["userinfo"] or u"" + + host = au_gs["ipv6_host"] or au_gs["plain_host"] + port = au_gs["port"] + if port is not None: + try: + port = int(port) # type: ignore[assignment] # FIXME, see below + except ValueError: + if not port: # TODO: excessive? + raise URLParseError("port must not be empty: %r" % au_text) + raise URLParseError("expected integer for port, not %r" % port) + + scheme = gs["scheme"] or u"" + fragment = gs["fragment"] or u"" + uses_netloc = bool(gs["_netloc_sep"]) + + if gs["path"]: + path = tuple(gs["path"].split(u"/")) + if not path[0]: + path = path[1:] + rooted = True + else: + rooted = False + else: + path = () + rooted = bool(au_text) + if gs["query"]: + query = tuple( + ( + qe.split(u"=", 1) # type: ignore[misc] + if u"=" in qe + else (qe, None) + ) + for qe in gs["query"].split(u"&") + ) # type: QueryPairs + else: + query = () + return cls( + scheme, + host, + path, + query, + fragment, + port, # type: ignore[arg-type] # FIXME, see above + rooted, + userinfo, + uses_netloc, + ) + + def normalize( + self, + scheme=True, + host=True, + path=True, + query=True, + fragment=True, + userinfo=True, + percents=True, + ): + # type: (bool, bool, bool, bool, bool, bool, bool) -> URL + """Return a new URL object with several standard normalizations + applied: + + * Decode unreserved characters (`RFC 3986 2.3`_) + * Uppercase remaining percent-encoded octets (`RFC 3986 2.1`_) + * Convert scheme and host casing to lowercase (`RFC 3986 3.2.2`_) + * Resolve any "." and ".." references in the path (`RFC 3986 6.2.2.3`_) + * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_) + * Encode any stray percent signs (`%`) in percent-encoded + fields (path, query, fragment, userinfo) (`RFC 3986 2.4`_) + + All are applied by default, but normalizations can be disabled + per-part by passing `False` for that part's corresponding + name. + + Args: + scheme: Convert the scheme to lowercase + host: Convert the host to lowercase + path: Normalize the path (see above for details) + query: Normalize the query string + fragment: Normalize the fragment + userinfo: Normalize the userinfo + percents: Encode isolated percent signs for any percent-encoded + fields which are being normalized (defaults to `True`). + + >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61%') + >>> print(url.normalize().to_text()) + http://example.com/b/c%2F?a%25 + + .. _RFC 3986 3.2.2: https://tools.ietf.org/html/rfc3986#section-3.2.2 + .. _RFC 3986 2.3: https://tools.ietf.org/html/rfc3986#section-2.3 + .. _RFC 3986 2.1: https://tools.ietf.org/html/rfc3986#section-2.1 + .. _RFC 3986 6.2.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.2.3 + .. _RFC 3986 6.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.3 + .. 
_RFC 3986 2.4: https://tools.ietf.org/html/rfc3986#section-2.4 + """ # noqa: E501 + kw = {} # type: Dict[str, Any] + if scheme: + kw["scheme"] = self.scheme.lower() + if host: + kw["host"] = self.host.lower() + + def _dec_unres(target): + # type: (Text) -> Text + return _decode_unreserved( + target, normalize_case=True, encode_stray_percents=percents + ) + + if path: + if self.path: + kw["path"] = [ + _dec_unres(p) for p in _resolve_dot_segments(self.path) + ] + else: + kw["path"] = (u"",) + if query: + kw["query"] = [ + (_dec_unres(k), _dec_unres(v) if v else v) + for k, v in self.query + ] + if fragment: + kw["fragment"] = _dec_unres(self.fragment) + if userinfo: + kw["userinfo"] = u":".join( + [_dec_unres(p) for p in self.userinfo.split(":", 1)] + ) + + return self.replace(**kw) + + def child(self, *segments): + # type: (Text) -> URL + """Make a new :class:`URL` where the given path segments are a child + of this URL, preserving other parts of the URL, including the + query string and fragment. + + For example:: + + >>> url = URL.from_text(u'http://localhost/a/b?x=y') + >>> child_url = url.child(u"c", u"d") + >>> child_url.to_text() + u'http://localhost/a/b/c/d?x=y' + + Args: + segments: Additional parts to be joined and added to the path, like + :func:`os.path.join`. Special characters in segments will be + percent encoded. + + Returns: + URL: A copy of the current URL with the extra path segments. + """ + if not segments: + return self + + segments = [ # type: ignore[assignment] # variable is tuple + _textcheck("path segment", s) for s in segments + ] + new_path = tuple(self.path) + if self.path and self.path[-1] == u"": + new_path = new_path[:-1] + new_path += tuple(_encode_path_parts(segments, maximal=False)) + return self.replace(path=new_path) + + def sibling(self, segment): + # type: (Text) -> URL + """Make a new :class:`URL` with a single path segment that is a + sibling of this URL path. + + Args: + segment: A single path segment. + + Returns: + URL: A copy of the current URL with the last path segment + replaced by *segment*. Special characters such as + ``/?#`` will be percent encoded. + """ + _textcheck("path segment", segment) + new_path = tuple(self.path)[:-1] + (_encode_path_part(segment),) + return self.replace(path=new_path) + + def click(self, href=u""): + # type: (Union[Text, URL]) -> URL + """Resolve the given URL relative to this URL. + + The resulting URI should match what a web browser would + generate if you visited the current URL and clicked on *href*. + + >>> url = URL.from_text(u'http://blog.hatnote.com/') + >>> url.click(u'/post/155074058790').to_text() + u'http://blog.hatnote.com/post/155074058790' + >>> url = URL.from_text(u'http://localhost/a/b/c/') + >>> url.click(u'../d/./e').to_text() + u'http://localhost/a/b/d/e' + + Args (Text): + href: A string representing a clicked URL. + + Return: + A copy of the current URL with navigation logic applied. + + For more information, see `RFC 3986 section 5`_. + + .. _RFC 3986 section 5: https://tools.ietf.org/html/rfc3986#section-5 + """ + if href: + if isinstance(href, URL): + clicked = href + else: + # TODO: This error message is not completely accurate, + # as URL objects are now also valid, but Twisted's + # test suite (wrongly) relies on this exact message. + _textcheck("relative URL", href) + clicked = URL.from_text(href) + if clicked.absolute: + return clicked + else: + clicked = self + + query = clicked.query + if clicked.scheme and not clicked.rooted: + # Schemes with relative paths are not well-defined. 
RFC 3986 calls + # them a "loophole in prior specifications" that should be avoided, + # or supported only for backwards compatibility. + raise NotImplementedError( + "absolute URI with rootless path: %r" % (href,) + ) + else: + if clicked.rooted: + path = clicked.path + elif clicked.path: + path = tuple(self.path)[:-1] + tuple(clicked.path) + else: + path = self.path + if not query: + query = self.query + return self.replace( + scheme=clicked.scheme or self.scheme, + host=clicked.host or self.host, + port=clicked.port or self.port, + path=_resolve_dot_segments(path), + query=query, + fragment=clicked.fragment, + ) + + def to_uri(self): + # type: () -> URL + u"""Make a new :class:`URL` instance with all non-ASCII characters + appropriately percent-encoded. This is useful to do in preparation + for sending a :class:`URL` over a network protocol. + + For example:: + + >>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri() + URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/') + + Returns: + URL: A new instance with its path segments, query parameters, and + hostname encoded, so that they are all in the standard + US-ASCII range. + """ + new_userinfo = u":".join( + [_encode_userinfo_part(p) for p in self.userinfo.split(":", 1)] + ) + new_path = _encode_path_parts( + self.path, has_scheme=bool(self.scheme), rooted=False, maximal=True + ) + new_host = ( + self.host + if not self.host + else idna_encode(self.host, uts46=True).decode("ascii") + ) + return self.replace( + userinfo=new_userinfo, + host=new_host, + path=new_path, + query=tuple( + [ + ( + _encode_query_key(k, maximal=True), + _encode_query_value(v, maximal=True) + if v is not None + else None, + ) + for k, v in self.query + ] + ), + fragment=_encode_fragment_part(self.fragment, maximal=True), + ) + + def to_iri(self): + # type: () -> URL + u"""Make a new :class:`URL` instance with all but a few reserved + characters decoded into human-readable format. + + Percent-encoded Unicode and IDNA-encoded hostnames are + decoded, like so:: + + >>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/') + >>> print(url.to_iri().to_text()) + https://ايران.example.com/foo⇧bar/ + + .. note:: + + As a general Python issue, "narrow" (UCS-2) builds of + Python may not be able to fully decode certain URLs, and + the in those cases, this method will return a best-effort, + partially-decoded, URL which is still valid. This issue + does not affect any Python builds 3.4+. + + Returns: + URL: A new instance with its path segments, query parameters, and + hostname decoded for display purposes. + """ # noqa: E501 + new_userinfo = u":".join( + [_decode_userinfo_part(p) for p in self.userinfo.split(":", 1)] + ) + host_text = _decode_host(self.host) + + return self.replace( + userinfo=new_userinfo, + host=host_text, + path=[_decode_path_part(segment) for segment in self.path], + query=tuple( + ( + _decode_query_key(k), + _decode_query_value(v) if v is not None else None, + ) + for k, v in self.query + ), + fragment=_decode_fragment_part(self.fragment), + ) + + def to_text(self, with_password=False): + # type: (bool) -> Text + """Render this URL to its textual representation. + + By default, the URL text will *not* include a password, if one + is set. RFC 3986 considers using URLs to represent such + sensitive information as deprecated. 
Quoting from RFC 3986, + `section 3.2.1`: + + "Applications should not render as clear text any data after the + first colon (":") character found within a userinfo subcomponent + unless the data after the colon is the empty string (indicating no + password)." + + Args (bool): + with_password: Whether or not to include the password in the URL + text. Defaults to False. + + Returns: + Text: The serialized textual representation of this URL, such as + ``u"http://example.com/some/path?some=query"``. + + The natural counterpart to :class:`URL.from_text()`. + + .. _section 3.2.1: https://tools.ietf.org/html/rfc3986#section-3.2.1 + """ + scheme = self.scheme + authority = self.authority(with_password) + path = "/".join( + _encode_path_parts( + self.path, + rooted=self.rooted, + has_scheme=bool(scheme), + has_authority=bool(authority), + maximal=False, + ) + ) + query_parts = [] + for k, v in self.query: + if v is None: + query_parts.append(_encode_query_key(k, maximal=False)) + else: + query_parts.append( + u"=".join( + ( + _encode_query_key(k, maximal=False), + _encode_query_value(v, maximal=False), + ) + ) + ) + query_string = u"&".join(query_parts) + + fragment = self.fragment + + parts = [] # type: List[Text] + _add = parts.append + if scheme: + _add(scheme) + _add(":") + if authority: + _add("//") + _add(authority) + elif scheme and path[:2] != "//" and self.uses_netloc: + _add("//") + if path: + if scheme and authority and path[:1] != "/": + _add("/") # relpaths with abs authorities auto get '/' + _add(path) + if query_string: + _add("?") + _add(query_string) + if fragment: + _add("#") + _add(fragment) + return u"".join(parts) + + def __repr__(self): + # type: () -> str + """Convert this URL to an representation that shows all of its + constituent parts, as well as being a valid argument to + :func:`eval`. + """ + return "%s.from_text(%r)" % (self.__class__.__name__, self.to_text()) + + def _to_bytes(self): + # type: () -> bytes + """ + Allows for direct usage of URL objects with libraries like + requests, which automatically stringify URL parameters. See + issue #49. + """ + return self.to_uri().to_text().encode("ascii") + + if PY2: + __str__ = _to_bytes + __unicode__ = to_text + else: + __bytes__ = _to_bytes + __str__ = to_text + + # # Begin Twisted Compat Code + asURI = to_uri + asIRI = to_iri + + @classmethod + def fromText(cls, s): + # type: (Text) -> URL + return cls.from_text(s) + + def asText(self, includeSecrets=False): + # type: (bool) -> Text + return self.to_text(with_password=includeSecrets) + + def __dir__(self): + # type: () -> Sequence[Text] + try: + ret = object.__dir__(self) + except AttributeError: + # object.__dir__ == AttributeError # pdw for py2 + ret = dir(self.__class__) + list(self.__dict__.keys()) + ret = sorted(set(ret) - set(["fromText", "asURI", "asIRI", "asText"])) + return ret + + # # End Twisted Compat Code + + def add(self, name, value=None): + # type: (Text, Optional[Text]) -> URL + """Make a new :class:`URL` instance with a given query argument, + *name*, added to it with the value *value*, like so:: + + >>> URL.from_text(u'https://example.com/?x=y').add(u'x') + URL.from_text(u'https://example.com/?x=y&x') + >>> URL.from_text(u'https://example.com/?x=y').add(u'x', u'z') + URL.from_text(u'https://example.com/?x=y&x=z') + + Args: + name: The name of the query parameter to add. + The part before the ``=``. + value: The value of the query parameter to add. + The part after the ``=``. + Defaults to ``None``, meaning no value. 
+ + Returns: + URL: A new :class:`URL` instance with the parameter added. + """ + return self.replace(query=self.query + ((name, value),)) + + def set(self, name, value=None): + # type: (Text, Optional[Text]) -> URL + """Make a new :class:`URL` instance with the query parameter *name* + set to *value*. All existing occurences, if any are replaced + by the single name-value pair. + + >>> URL.from_text(u'https://example.com/?x=y').set(u'x') + URL.from_text(u'https://example.com/?x') + >>> URL.from_text(u'https://example.com/?x=y').set(u'x', u'z') + URL.from_text(u'https://example.com/?x=z') + + Args: + name: The name of the query parameter to set. + The part before the ``=``. + value: The value of the query parameter to set. + The part after the ``=``. + Defaults to ``None``, meaning no value. + + Returns: + URL: A new :class:`URL` instance with the parameter set. + """ + # Preserve the original position of the query key in the list + q = [(k, v) for (k, v) in self.query if k != name] + idx = next( + (i for (i, (k, v)) in enumerate(self.query) if k == name), -1 + ) + q[idx:idx] = [(name, value)] + return self.replace(query=q) + + def get(self, name): + # type: (Text) -> List[Optional[Text]] + """Get a list of values for the given query parameter, *name*:: + + >>> url = URL.from_text(u'?x=1&x=2') + >>> url.get('x') + [u'1', u'2'] + >>> url.get('y') + [] + + If the given *name* is not set, an empty list is returned. A + list is always returned, and this method raises no exceptions. + + Args: + name: The name of the query parameter to get. + + Returns: + List[Optional[Text]]: A list of all the values associated with the + key, in string form. + """ + return [value for (key, value) in self.query if name == key] + + def remove( + self, + name, # type: Text + value=_UNSET, # type: Text + limit=None, # type: Optional[int] + ): + # type: (...) -> URL + """Make a new :class:`URL` instance with occurrences of the query + parameter *name* removed, or, if *value* is set, parameters + matching *name* and *value*. No exception is raised if the + parameter is not already set. + + Args: + name: The name of the query parameter to remove. + value: Optional value to additionally filter on. + Setting this removes query parameters which match both name + and value. + limit: Optional maximum number of parameters to remove. + + Returns: + URL: A new :class:`URL` instance with the parameter removed. + """ + if limit is None: + if value is _UNSET: + nq = [(k, v) for (k, v) in self.query if k != name] + else: + nq = [ + (k, v) + for (k, v) in self.query + if not (k == name and v == value) + ] + else: + nq, removed_count = [], 0 + + for k, v in self.query: + if ( + k == name + and (value is _UNSET or v == value) + and removed_count < limit + ): + removed_count += 1 # drop it + else: + nq.append((k, v)) # keep it + + return self.replace(query=nq) + + +EncodedURL = URL # An alias better describing what the URL really is + +_EMPTY_URL = URL() + + +def _replace_plus(text): + # type: (Text) -> Text + return text.replace("+", "%20") + + +def _no_op(text): + # type: (Text) -> Text + return text + + +class DecodedURL(object): + """ + :class:`DecodedURL` is a type designed to act as a higher-level + interface to :class:`URL` and the recommended type for most + operations. By analogy, :class:`DecodedURL` is the + :class:`unicode` to URL's :class:`bytes`. + + :class:`DecodedURL` automatically handles encoding and decoding + all its components, such that all inputs and outputs are in a + maximally-decoded state. 
Note that this means, for some special + cases, a URL may not "roundtrip" character-for-character, but this + is considered a good tradeoff for the safety of automatic + encoding. + + Otherwise, :class:`DecodedURL` has almost exactly the same API as + :class:`URL`. + + Where applicable, a UTF-8 encoding is presumed. Be advised that + some interactions can raise :exc:`UnicodeEncodeErrors` and + :exc:`UnicodeDecodeErrors`, just like when working with + bytestrings. Examples of such interactions include handling query + strings encoding binary data, and paths containing segments with + special characters encoded with codecs other than UTF-8. + + Args: + url: A :class:`URL` object to wrap. + lazy: Set to True to avoid pre-decode all parts of the URL to check for + validity. + Defaults to False. + query_plus_is_space: + characters in the query string should be treated + as spaces when decoding. If unspecified, the default is taken from + the scheme. + + .. note:: + + The :class:`DecodedURL` initializer takes a :class:`URL` object, + not URL components, like :class:`URL`. To programmatically + construct a :class:`DecodedURL`, you can use this pattern: + + >>> print(DecodedURL().replace(scheme=u'https', + ... host=u'pypi.org', path=(u'projects', u'hyperlink')).to_text()) + https://pypi.org/projects/hyperlink + + .. versionadded:: 18.0.0 + """ + + def __init__(self, url=_EMPTY_URL, lazy=False, query_plus_is_space=None): + # type: (URL, bool, Optional[bool]) -> None + self._url = url + if query_plus_is_space is None: + query_plus_is_space = url.scheme not in NO_QUERY_PLUS_SCHEMES + self._query_plus_is_space = query_plus_is_space + if not lazy: + # cache the following, while triggering any decoding + # issues with decodable fields + self.host, self.userinfo, self.path, self.query, self.fragment + return + + @classmethod + def from_text(cls, text, lazy=False, query_plus_is_space=None): + # type: (Text, bool, Optional[bool]) -> DecodedURL + """\ + Make a `DecodedURL` instance from any text string containing a URL. + + Args: + text: Text containing the URL + lazy: Whether to pre-decode all parts of the URL to check for + validity. + Defaults to True. + """ + _url = URL.from_text(text) + return cls(_url, lazy=lazy, query_plus_is_space=query_plus_is_space) + + @property + def encoded_url(self): + # type: () -> URL + """Access the underlying :class:`URL` object, which has any special + characters encoded. + """ + return self._url + + def to_text(self, with_password=False): + # type: (bool) -> Text + "Passthrough to :meth:`~hyperlink.URL.to_text()`" + return self._url.to_text(with_password) + + def to_uri(self): + # type: () -> URL + "Passthrough to :meth:`~hyperlink.URL.to_uri()`" + return self._url.to_uri() + + def to_iri(self): + # type: () -> URL + "Passthrough to :meth:`~hyperlink.URL.to_iri()`" + return self._url.to_iri() + + def _clone(self, url): + # type: (URL) -> DecodedURL + return self.__class__( + url, + # TODO: propagate laziness? 
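+            # Note: ``lazy`` is not forwarded here, so clones produced by
+            # this helper always decode their parts eagerly.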
+ query_plus_is_space=self._query_plus_is_space, + ) + + def click(self, href=u""): + # type: (Union[Text, URL, DecodedURL]) -> DecodedURL + """Return a new DecodedURL wrapping the result of + :meth:`~hyperlink.URL.click()` + """ + if isinstance(href, DecodedURL): + href = href._url + return self._clone( + self._url.click(href=href), + ) + + def sibling(self, segment): + # type: (Text) -> DecodedURL + """Automatically encode any reserved characters in *segment* and + return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.sibling()` + """ + return self._clone( + self._url.sibling(_encode_reserved(segment)), + ) + + def child(self, *segments): + # type: (Text) -> DecodedURL + """Automatically encode any reserved characters in *segments* and + return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.child()`. + """ + if not segments: + return self + new_segs = [_encode_reserved(s) for s in segments] + return self._clone(self._url.child(*new_segs)) + + def normalize( + self, + scheme=True, + host=True, + path=True, + query=True, + fragment=True, + userinfo=True, + percents=True, + ): + # type: (bool, bool, bool, bool, bool, bool, bool) -> DecodedURL + """Return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.normalize()` + """ + return self._clone( + self._url.normalize( + scheme, host, path, query, fragment, userinfo, percents + ) + ) + + @property + def absolute(self): + # type: () -> bool + return self._url.absolute + + @property + def scheme(self): + # type: () -> Text + return self._url.scheme + + @property + def host(self): + # type: () -> Text + return _decode_host(self._url.host) + + @property + def port(self): + # type: () -> Optional[int] + return self._url.port + + @property + def rooted(self): + # type: () -> bool + return self._url.rooted + + @property + def path(self): + # type: () -> Sequence[Text] + if not hasattr(self, "_path"): + self._path = tuple( + [ + _percent_decode(p, raise_subencoding_exc=True) + for p in self._url.path + ] + ) + return self._path + + @property + def query(self): + # type: () -> QueryPairs + if not hasattr(self, "_query"): + if self._query_plus_is_space: + predecode = _replace_plus + else: + predecode = _no_op + + self._query = cast( + QueryPairs, + tuple( + tuple( + _percent_decode( + predecode(x), raise_subencoding_exc=True + ) + if x is not None + else None + for x in (k, v) + ) + for k, v in self._url.query + ), + ) + return self._query + + @property + def fragment(self): + # type: () -> Text + if not hasattr(self, "_fragment"): + frag = self._url.fragment + self._fragment = _percent_decode(frag, raise_subencoding_exc=True) + return self._fragment + + @property + def userinfo(self): + # type: () -> Union[Tuple[str], Tuple[str, str]] + if not hasattr(self, "_userinfo"): + self._userinfo = cast( + Union[Tuple[str], Tuple[str, str]], + tuple( + tuple( + _percent_decode(p, raise_subencoding_exc=True) + for p in self._url.userinfo.split(":", 1) + ) + ), + ) + return self._userinfo + + @property + def user(self): + # type: () -> Text + return self.userinfo[0] + + @property + def uses_netloc(self): + # type: () -> Optional[bool] + return self._url.uses_netloc + + def replace( + self, + scheme=_UNSET, # type: Optional[Text] + host=_UNSET, # type: Optional[Text] + path=_UNSET, # type: Iterable[Text] + query=_UNSET, # type: QueryParameters + fragment=_UNSET, # type: Text + port=_UNSET, # type: Optional[int] + rooted=_UNSET, # type: Optional[bool] + userinfo=_UNSET, # type: Union[Tuple[str], Tuple[str, str]] 
+ uses_netloc=_UNSET, # type: Optional[bool] + ): + # type: (...) -> DecodedURL + """While the signature is the same, this `replace()` differs a little + from URL.replace. For instance, it accepts userinfo as a + tuple, not as a string, handling the case of having a username + containing a `:`. As with the rest of the methods on + DecodedURL, if you pass a reserved character, it will be + automatically encoded instead of an error being raised. + """ + if path is not _UNSET: + path = tuple(_encode_reserved(p) for p in path) + if query is not _UNSET: + query = cast( + QueryPairs, + tuple( + tuple( + _encode_reserved(x) if x is not None else None + for x in (k, v) + ) + for k, v in iter_pairs(query) + ), + ) + if userinfo is not _UNSET: + if len(userinfo) > 2: + raise ValueError( + 'userinfo expected sequence of ["user"] or' + ' ["user", "password"], got %r' % (userinfo,) + ) + userinfo_text = u":".join([_encode_reserved(p) for p in userinfo]) + else: + userinfo_text = _UNSET + new_url = self._url.replace( + scheme=scheme, + host=host, + path=path, + query=query, + fragment=fragment, + port=port, + rooted=rooted, + userinfo=userinfo_text, + uses_netloc=uses_netloc, + ) + return self._clone(url=new_url) + + def get(self, name): + # type: (Text) -> List[Optional[Text]] + "Get the value of all query parameters whose name matches *name*" + return [v for (k, v) in self.query if name == k] + + def add(self, name, value=None): + # type: (Text, Optional[Text]) -> DecodedURL + """Return a new DecodedURL with the query parameter *name* and *value* + added.""" + return self.replace(query=self.query + ((name, value),)) + + def set(self, name, value=None): + # type: (Text, Optional[Text]) -> DecodedURL + "Return a new DecodedURL with query parameter *name* set to *value*" + query = self.query + q = [(k, v) for (k, v) in query if k != name] + idx = next((i for (i, (k, v)) in enumerate(query) if k == name), -1) + q[idx:idx] = [(name, value)] + return self.replace(query=q) + + def remove( + self, + name, # type: Text + value=_UNSET, # type: Text + limit=None, # type: Optional[int] + ): + # type: (...) -> DecodedURL + """Return a new DecodedURL with query parameter *name* removed. + + Optionally also filter for *value*, as well as cap the number + of parameters removed with *limit*. 
+ """ + if limit is None: + if value is _UNSET: + nq = [(k, v) for (k, v) in self.query if k != name] + else: + nq = [ + (k, v) + for (k, v) in self.query + if not (k == name and v == value) + ] + else: + nq, removed_count = [], 0 + for k, v in self.query: + if ( + k == name + and (value is _UNSET or v == value) + and removed_count < limit + ): + removed_count += 1 # drop it + else: + nq.append((k, v)) # keep it + + return self.replace(query=nq) + + def __repr__(self): + # type: () -> str + cn = self.__class__.__name__ + return "%s(url=%r)" % (cn, self._url) + + def __str__(self): + # type: () -> str + # TODO: the underlying URL's __str__ needs to change to make + # this work as the URL, see #55 + return str(self._url) + + def __eq__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return self.normalize().to_uri() == other.normalize().to_uri() + + def __ne__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return not self.__eq__(other) + + def __hash__(self): + # type: () -> int + return hash( + ( + self.__class__, + self.scheme, + self.userinfo, + self.host, + self.path, + self.query, + self.fragment, + self.port, + self.rooted, + self.uses_netloc, + ) + ) + + # # Begin Twisted Compat Code + asURI = to_uri + asIRI = to_iri + + @classmethod + def fromText(cls, s, lazy=False): + # type: (Text, bool) -> DecodedURL + return cls.from_text(s, lazy=lazy) + + def asText(self, includeSecrets=False): + # type: (bool) -> Text + return self.to_text(with_password=includeSecrets) + + def __dir__(self): + # type: () -> Sequence[Text] + try: + ret = object.__dir__(self) + except AttributeError: + # object.__dir__ == AttributeError # pdw for py2 + ret = dir(self.__class__) + list(self.__dict__.keys()) + ret = sorted(set(ret) - set(["fromText", "asURI", "asIRI", "asText"])) + return ret + + # # End Twisted Compat Code + + +def parse(url, decoded=True, lazy=False): + # type: (Text, bool, bool) -> Union[URL, DecodedURL] + """ + Automatically turn text into a structured URL object. + + >>> url = parse(u"https://github.com/python-hyper/hyperlink") + >>> print(url.to_text()) + https://github.com/python-hyper/hyperlink + + Args: + url: A text string representation of a URL. + + decoded: Whether or not to return a :class:`DecodedURL`, + which automatically handles all + encoding/decoding/quoting/unquoting for all the various + accessors of parts of the URL, or a :class:`URL`, + which has the same API, but requires handling of special + characters for different parts of the URL. + + lazy: In the case of `decoded=True`, this controls + whether the URL is decoded immediately or as accessed. The + default, `lazy=False`, checks all encoded parts of the URL + for decodability. + + .. versionadded:: 18.0.0 + """ + enc_url = EncodedURL.from_text(url) + if not decoded: + return enc_url + dec_url = DecodedURL(enc_url, lazy=lazy) + return dec_url diff --git a/contrib/python/hyperlink/py3/hyperlink/hypothesis.py b/contrib/python/hyperlink/py3/hyperlink/hypothesis.py new file mode 100644 index 0000000000..45fd9a9956 --- /dev/null +++ b/contrib/python/hyperlink/py3/hyperlink/hypothesis.py @@ -0,0 +1,324 @@ +# -*- coding: utf-8 -*- +""" +Hypothesis strategies. +""" +from __future__ import absolute_import + +try: + import hypothesis + + del hypothesis +except ImportError: + from typing import Tuple + + __all__ = () # type: Tuple[str, ...] 
+else: + import io + import pkgutil + from csv import reader as csv_reader + from os.path import dirname, join + from string import ascii_letters, digits + from sys import maxunicode + from typing import ( + Callable, + Iterable, + List, + Optional, + Sequence, + Text, + TypeVar, + cast, + ) + from gzip import open as open_gzip + + from . import DecodedURL, EncodedURL + + from hypothesis import assume + from hypothesis.strategies import ( + composite, + integers, + lists, + sampled_from, + text, + ) + + from idna import IDNAError, check_label, encode as idna_encode + + __all__ = ( + "decoded_urls", + "encoded_urls", + "hostname_labels", + "hostnames", + "idna_text", + "paths", + "port_numbers", + ) + + T = TypeVar("T") + DrawCallable = Callable[[Callable[..., T]], T] + + try: + unichr + except NameError: # Py3 + unichr = chr # type: Callable[[int], Text] + + def idna_characters(): + # type: () -> Text + """ + Returns a string containing IDNA characters. + """ + global _idnaCharacters + + if not _idnaCharacters: + result = [] + + # Data source "IDNA Derived Properties": + # https://www.iana.org/assignments/idna-tables-6.3.0/ + # idna-tables-6.3.0.xhtml#idna-tables-properties + dataFileName = join( + dirname(__file__), "idna-tables-properties.csv.gz" + ) + data = io.BytesIO(pkgutil.get_data(__name__, "idna-tables-properties.csv.gz")) + with open_gzip(data) as dataFile: + reader = csv_reader( + (line.decode("utf-8") for line in dataFile), + delimiter=",", + ) + next(reader) # Skip header row + for row in reader: + codes, prop, description = row + + if prop != "PVALID": + # CONTEXTO or CONTEXTJ are also allowed, but they come + # with rules, so we're punting on those here. + # See: https://tools.ietf.org/html/rfc5892 + continue + + startEnd = row[0].split("-", 1) + if len(startEnd) == 1: + # No end of range given; use start + startEnd.append(startEnd[0]) + start, end = (int(i, 16) for i in startEnd) + + for i in range(start, end + 1): + if i > maxunicode: # Happens using Py2 on Windows + break + result.append(unichr(i)) + + _idnaCharacters = u"".join(result) + + return _idnaCharacters + + _idnaCharacters = "" # type: Text + + @composite + def idna_text(draw, min_size=1, max_size=None): + # type: (DrawCallable, int, Optional[int]) -> Text + """ + A strategy which generates IDNA-encodable text. + + @param min_size: The minimum number of characters in the text. + C{None} is treated as C{0}. + + @param max_size: The maximum number of characters in the text. + Use C{None} for an unbounded size. + """ + alphabet = idna_characters() + + assert min_size >= 1 + + if max_size is not None: + assert max_size >= 1 + + result = cast( + Text, + draw(text(min_size=min_size, max_size=max_size, alphabet=alphabet)), + ) + + # FIXME: There should be a more efficient way to ensure we produce + # valid IDNA text. + try: + idna_encode(result) + except IDNAError: + assume(False) + + return result + + @composite + def port_numbers(draw, allow_zero=False): + # type: (DrawCallable, bool) -> int + """ + A strategy which generates port numbers. + + @param allow_zero: Whether to allow port C{0} as a possible value. + """ + if allow_zero: + min_value = 0 + else: + min_value = 1 + + return cast(int, draw(integers(min_value=min_value, max_value=65535))) + + @composite + def hostname_labels(draw, allow_idn=True): + # type: (DrawCallable, bool) -> Text + """ + A strategy which generates host name labels. + + @param allow_idn: Whether to allow non-ASCII characters as allowed by + internationalized domain names (IDNs). 
+ """ + if allow_idn: + label = cast(Text, draw(idna_text(min_size=1, max_size=63))) + + try: + label.encode("ascii") + except UnicodeEncodeError: + # If the label doesn't encode to ASCII, then we need to check + # the length of the label after encoding to punycode and adding + # the xn-- prefix. + while len(label.encode("punycode")) > 63 - len("xn--"): + # Rather than bombing out, just trim from the end until it + # is short enough, so hypothesis doesn't have to generate + # new data. + label = label[:-1] + + else: + label = cast( + Text, + draw( + text( + min_size=1, + max_size=63, + alphabet=Text(ascii_letters + digits + u"-"), + ) + ), + ) + + # Filter invalid labels. + # It would be better to reliably avoid generation of bogus labels in + # the first place, but it's hard... + try: + check_label(label) + except UnicodeError: # pragma: no cover (not always drawn) + assume(False) + + return label + + @composite + def hostnames(draw, allow_leading_digit=True, allow_idn=True): + # type: (DrawCallable, bool, bool) -> Text + """ + A strategy which generates host names. + + @param allow_leading_digit: Whether to allow a leading digit in host + names; they were not allowed prior to RFC 1123. + + @param allow_idn: Whether to allow non-ASCII characters as allowed by + internationalized domain names (IDNs). + """ + # Draw first label, filtering out labels with leading digits if needed + labels = [ + cast( + Text, + draw( + hostname_labels(allow_idn=allow_idn).filter( + lambda l: ( + True if allow_leading_digit else l[0] not in digits + ) + ) + ), + ) + ] + # Draw remaining labels + labels += cast( + List[Text], + draw( + lists( + hostname_labels(allow_idn=allow_idn), + min_size=1, + max_size=4, + ) + ), + ) + + # Trim off labels until the total host name length fits in 252 + # characters. This avoids having to filter the data. + while sum(len(label) for label in labels) + len(labels) - 1 > 252: + labels = labels[:-1] + + return u".".join(labels) + + def path_characters(): + # type: () -> str + """ + Returns a string containing valid URL path characters. + """ + global _path_characters + + if _path_characters is None: + + def chars(): + # type: () -> Iterable[Text] + for i in range(maxunicode): + c = unichr(i) + + # Exclude reserved characters + if c in "#/?": + continue + + # Exclude anything not UTF-8 compatible + try: + c.encode("utf-8") + except UnicodeEncodeError: + continue + + yield c + + _path_characters = "".join(chars()) + + return _path_characters + + _path_characters = None # type: Optional[str] + + @composite + def paths(draw): + # type: (DrawCallable) -> Sequence[Text] + return cast( + List[Text], + draw( + lists(text(min_size=1, alphabet=path_characters()), max_size=10) + ), + ) + + @composite + def encoded_urls(draw): + # type: (DrawCallable) -> EncodedURL + """ + A strategy which generates L{EncodedURL}s. + Call the L{EncodedURL.to_uri} method on each URL to get an HTTP + protocol-friendly URI. + """ + port = cast(Optional[int], draw(port_numbers(allow_zero=True))) + host = cast(Text, draw(hostnames())) + path = cast(Sequence[Text], draw(paths())) + + if port == 0: + port = None + + return EncodedURL( + scheme=cast(Text, draw(sampled_from((u"http", u"https")))), + host=host, + port=port, + path=path, + ) + + @composite + def decoded_urls(draw): + # type: (DrawCallable) -> DecodedURL + """ + A strategy which generates L{DecodedURL}s. + Call the L{EncodedURL.to_uri} method on each URL to get an HTTP + protocol-friendly URI. 
+ """ + return DecodedURL(draw(encoded_urls())) diff --git a/contrib/python/hyperlink/py3/hyperlink/idna-tables-properties.csv.gz b/contrib/python/hyperlink/py3/hyperlink/idna-tables-properties.csv.gz Binary files differnew file mode 100644 index 0000000000..48e9f06742 --- /dev/null +++ b/contrib/python/hyperlink/py3/hyperlink/idna-tables-properties.csv.gz diff --git a/contrib/python/hyperlink/py3/hyperlink/py.typed b/contrib/python/hyperlink/py3/hyperlink/py.typed new file mode 100644 index 0000000000..d2dfd5e491 --- /dev/null +++ b/contrib/python/hyperlink/py3/hyperlink/py.typed @@ -0,0 +1 @@ +# See: https://www.python.org/dev/peps/pep-0561/ diff --git a/contrib/python/hyperlink/py3/ya.make b/contrib/python/hyperlink/py3/ya.make new file mode 100644 index 0000000000..2ada924679 --- /dev/null +++ b/contrib/python/hyperlink/py3/ya.make @@ -0,0 +1,35 @@ +# Generated by devtools/yamaker (pypi). + +PY3_LIBRARY() + +VERSION(21.0.0) + +LICENSE(MIT) + +PEERDIR( + contrib/python/idna +) + +NO_LINT() + +PY_SRCS( + TOP_LEVEL + hyperlink/__init__.py + hyperlink/_socket.py + hyperlink/_url.py + hyperlink/hypothesis.py +) + +RESOURCE_FILES( + PREFIX contrib/python/hyperlink/py3/ + .dist-info/METADATA + .dist-info/top_level.txt + hyperlink/idna-tables-properties.csv.gz + hyperlink/py.typed +) + +END() + +RECURSE_FOR_TESTS( + tests +) diff --git a/contrib/python/hyperlink/ya.make b/contrib/python/hyperlink/ya.make new file mode 100644 index 0000000000..64a73ff34e --- /dev/null +++ b/contrib/python/hyperlink/ya.make @@ -0,0 +1,18 @@ +PY23_LIBRARY() + +LICENSE(Service-Py23-Proxy) + +IF (PYTHON2) + PEERDIR(contrib/python/hyperlink/py2) +ELSE() + PEERDIR(contrib/python/hyperlink/py3) +ENDIF() + +NO_LINT() + +END() + +RECURSE( + py2 + py3 +) |