Intermediate changes

author: robot-piglet <robot-piglet@yandex-team.com> 2023-11-12 21:25:31 +0300
committer: robot-piglet <robot-piglet@yandex-team.com> 2023-11-12 21:39:54 +0300
commit: d28c55ab25cc8cedab8a5f4736c0d66e88b3da95 (patch)
tree: 73d373709b74fa2baaa4fe02a40a77c0a5baf6b7 /contrib/python/h11/h11/_headers.py
parent: 35b17f4f3b6e0ed855e7e47d3f1eb57470388a2c (diff)
download: ydb-d28c55ab25cc8cedab8a5f4736c0d66e88b3da95.tar.gz
1 files changed, 278 insertions, 0 deletions
diff --git a/contrib/python/h11/h11/_headers.py b/contrib/python/h11/h11/_headers.py
new file mode 100644
index 00000000000..b97d020b634
--- /dev/null
+++ b/contrib/python/h11/h11/_headers.py
@@ -0,0 +1,278 @@
+import re
+from typing import AnyStr, cast, List, overload, Sequence, Tuple, TYPE_CHECKING, Union
+
+from ._abnf import field_name, field_value
+from ._util import bytesify, LocalProtocolError, validate
+
+if TYPE_CHECKING:
+    from ._events import Request
+
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal  # type: ignore
+
+
+# Facts
+# -----
+#
+# Headers are:
+#   keys: case-insensitive ascii
+#   values: mixture of ascii and raw bytes
+#
+# "Historically, HTTP has allowed field content with text in the ISO-8859-1
+# charset [ISO-8859-1], supporting other charsets only through use of
+# [RFC2047] encoding.  In practice, most HTTP header field values use only a
+# subset of the US-ASCII charset [USASCII]. Newly defined header fields SHOULD
+# limit their field values to US-ASCII octets.  A recipient SHOULD treat other
+# octets in field content (obs-text) as opaque data."
+# And it deprecates all non-ascii values
+#
+# Leading/trailing whitespace in header names is forbidden
+#
+# Values get leading/trailing whitespace stripped
+#
+# Content-Disposition actually needs to contain unicode semantically; to
+# accomplish this it has a terrifically weird way of encoding the filename
+# itself as ascii (and even this still has lots of cross-browser
+# incompatibilities)
+#
+# Order is important:
+# "a proxy MUST NOT change the order of these field values when forwarding a
+# message"
+# (and there are several headers where the order indicates a preference)
+#
+# Multiple occurrences of the same header:
+# "A sender MUST NOT generate multiple header fields with the same field name
+# in a message unless either the entire field value for that header field is
+# defined as a comma-separated list [or the header is Set-Cookie which gets a
+# special exception]" - RFC 7230. (cookies are in RFC 6265)
+#
+# So every header aside from Set-Cookie can be merged by b", ".join if it
+# occurs repeatedly. But, of course, they can't necessarily be split by
+# .split(b","), because quoting.
+#
+# Given all this mess (case insensitive, duplicates allowed, order is
+# important, ...), there doesn't appear to be any standard way to handle
+# headers in Python -- they're almost like dicts, but... actually just
+# aren't. For now we punt and just use a super simple representation: headers
+# are a list of pairs
+#
+#   [(name1, value1), (name2, value2), ...]
+#
+# where all entries are bytestrings, names are lowercase and have no
+# leading/trailing whitespace, and values are bytestrings with no
+# leading/trailing whitespace. Searching and updating are done via naive O(n)
+# methods.
+#
+# Maybe a dict-of-lists would be better?
+
+_content_length_re = re.compile(rb"[0-9]+")
+_field_name_re = re.compile(field_name.encode("ascii"))
+_field_value_re = re.compile(field_value.encode("ascii"))
+
+
+class Headers(Sequence[Tuple[bytes, bytes]]):
+    """
+    A list-like interface that allows iterating over headers as byte-pairs
+    of (lowercased-name, value).
+
+    Internally we actually store the representation as three-tuples,
+    including both the raw original casing, in order to preserve casing
+    over-the-wire, and the lowercased name, for case-insensitive comparisons.
+
+    r = Request(
+        method="GET",
+        target="/",
+        headers=[("Host", "example.org"), ("Connection", "keep-alive")],
+        http_version="1.1",
+    )
+    assert r.headers == [
+        (b"host", b"example.org"),
+        (b"connection", b"keep-alive")
+    ]
+    assert r.headers.raw_items() == [
+        (b"Host", b"example.org"),
+        (b"Connection", b"keep-alive")
+    ]
+    """
+
+    __slots__ = "_full_items"
+
+    def __init__(self, full_items: List[Tuple[bytes, bytes, bytes]]) -> None:
+        self._full_items = full_items
+
+    def __bool__(self) -> bool:
+        return bool(self._full_items)
+
+    def __eq__(self, other: object) -> bool:
+        return list(self) == list(other)  # type: ignore
+
+    def __len__(self) -> int:
+        return len(self._full_items)
+
+    def __repr__(self) -> str:
+        return "<Headers(%s)>" % repr(list(self))
+
+    def __getitem__(self, idx: int) -> Tuple[bytes, bytes]:  # type: ignore[override]
+        _, name, value = self._full_items[idx]
+        return (name, value)
+
+    def raw_items(self) -> List[Tuple[bytes, bytes]]:
+        return [(raw_name, value) for raw_name, _, value in self._full_items]
+
+
+HeaderTypes = Union[
+    List[Tuple[bytes, bytes]],
+    List[Tuple[bytes, str]],
+    List[Tuple[str, bytes]],
+    List[Tuple[str, str]],
+]
+
+
+@overload
+def normalize_and_validate(headers: Headers, _parsed: Literal[True]) -> Headers:
+    ...
+
+
+@overload
+def normalize_and_validate(headers: HeaderTypes, _parsed: Literal[False]) -> Headers:
+    ...
+
+
+@overload
+def normalize_and_validate(
+    headers: Union[Headers, HeaderTypes], _parsed: bool = False
+) -> Headers:
+    ...
+
+
+def normalize_and_validate(
+    headers: Union[Headers, HeaderTypes], _parsed: bool = False
+) -> Headers:
+    new_headers = []
+    seen_content_length = None
+    saw_transfer_encoding = False
+    for name, value in headers:
+        # For headers coming out of the parser, we can safely skip some steps,
+        # because it always returns bytes and has already run these regexes
+        # over the data:
+        if not _parsed:
+            name = bytesify(name)
+            value = bytesify(value)
+            validate(_field_name_re, name, "Illegal header name {!r}", name)
+            validate(_field_value_re, value, "Illegal header value {!r}", value)
+        assert isinstance(name, bytes)
+        assert isinstance(value, bytes)
+
+        raw_name = name
+        name = name.lower()
+        if name == b"content-length":
+            lengths = {length.strip() for length in value.split(b",")}
+            if len(lengths) != 1:
+                raise LocalProtocolError("conflicting Content-Length headers")
+            value = lengths.pop()
+            validate(_content_length_re, value, "bad Content-Length")
+            if seen_content_length is None:
+                seen_content_length = value
+                new_headers.append((raw_name, name, value))
+            elif seen_content_length != value:
+                raise LocalProtocolError("conflicting Content-Length headers")
+        elif name == b"transfer-encoding":
+            # "A server that receives a request message with a transfer coding
+            # it does not understand SHOULD respond with 501 (Not
+            # Implemented)."
+            # https://tools.ietf.org/html/rfc7230#section-3.3.1
+            if saw_transfer_encoding:
+                raise LocalProtocolError(
+                    "multiple Transfer-Encoding headers", error_status_hint=501
+                )
+            # "All transfer-coding names are case-insensitive"
+            # -- https://tools.ietf.org/html/rfc7230#section-4
+            value = value.lower()
+            if value != b"chunked":
+                raise LocalProtocolError(
+                    "Only Transfer-Encoding: chunked is supported",
+                    error_status_hint=501,
+                )
+            saw_transfer_encoding = True
+            new_headers.append((raw_name, name, value))
+        else:
+            new_headers.append((raw_name, name, value))
+    return Headers(new_headers)
+
+
+def get_comma_header(headers: Headers, name: bytes) -> List[bytes]:
+    # Should only be used for headers whose value is a list of
+    # comma-separated, case-insensitive values.
+    #
+    # The header name `name` is expected to be lower-case bytes.
+    #
+    # Connection: meets these criteria (including case insensitivity).
+    #
+    # Content-Length: technically is just a single value (1*DIGIT), but the
+    # standard makes reference to implementations that do multiple values, and
+    # using this doesn't hurt. Ditto, case insensitivity doesn't change things
+    # either way.
+    #
+    # Transfer-Encoding: is more complex (allows for quoted strings), so
+    # splitting on , is actually wrong. For example, this is legal:
+    #
+    #    Transfer-Encoding: foo; options="1,2", chunked
+    #
+    # and should be parsed as
+    #
+    #    foo; options="1,2"
+    #    chunked
+    #
+    # but this naive function will parse it as
+    #
+    #    foo; options="1
+    #    2"
+    #    chunked
+    #
+    # However, this is okay because the only thing we are going to do with
+    # any Transfer-Encoding is reject ones that aren't just "chunked", so
+    # both of these will be treated the same anyway.
+    #
+    # Expect: the only legal value is the literal string
+    # "100-continue". Splitting on commas is harmless. Case insensitive.
+    #
+    out: List[bytes] = []
+    for _, found_name, found_raw_value in headers._full_items:
+        if found_name == name:
+            found_raw_value = found_raw_value.lower()
+            for found_split_value in found_raw_value.split(b","):
+                found_split_value = found_split_value.strip()
+                if found_split_value:
+                    out.append(found_split_value)
+    return out
+
+
+def set_comma_header(headers: Headers, name: bytes, new_values: List[bytes]) -> Headers:
+    # The header name `name` is expected to be lower-case bytes.
+    #
+    # Note that when we store the header we use title casing for the header
+    # names, in order to match the conventional HTTP header style.
+    #
+    # Simply calling `.title()` is a blunt approach, but it's correct
+    # here given the cases where we're using `set_comma_header`...
+    #
+    # Connection, Content-Length, Transfer-Encoding.
+    new_headers: List[Tuple[bytes, bytes]] = []
+    for found_raw_name, found_name, found_raw_value in headers._full_items:
+        if found_name != name:
+            new_headers.append((found_raw_name, found_raw_value))
+    for new_value in new_values:
+        new_headers.append((name.title(), new_value))
+    return normalize_and_validate(new_headers)
+
+
+def has_expect_100_continue(request: "Request") -> bool:
+    # https://tools.ietf.org/html/rfc7231#section-5.1.1
+    # "A server that receives a 100-continue expectation in an HTTP/1.0 request
+    # MUST ignore that expectation."
+    if request.http_version < b"1.1":
+        return False
+    expect = get_comma_header(request.headers, b"expect")
+    return b"100-continue" in expect
author	robot-piglet <robot-piglet@yandex-team.com>	2023-11-12 21:25:31 +0300
committer	robot-piglet <robot-piglet@yandex-team.com>	2023-11-12 21:39:54 +0300
commit	d28c55ab25cc8cedab8a5f4736c0d66e88b3da95 (patch)
tree	73d373709b74fa2baaa4fe02a40a77c0a5baf6b7 /contrib/python/h11/h11/_headers.py
parent	35b17f4f3b6e0ed855e7e47d3f1eb57470388a2c (diff)
download	ydb-d28c55ab25cc8cedab8a5f4736c0d66e88b3da95.tar.gz