author     nkozlovskiy <nmk@ydb.tech>                2023-09-29 12:24:06 +0300
committer  nkozlovskiy <nmk@ydb.tech>                2023-09-29 12:41:34 +0300
commit     e0e3e1717e3d33762ce61950504f9637a6e669ed (patch)
tree       bca3ff6939b10ed60c3d5c12439963a1146b9711 /contrib/python/Pygments/py3/pygments/lexers/mime.py
parent     38f2c5852db84c7b4d83adfcb009eb61541d1ccd (diff)
download   ydb-e0e3e1717e3d33762ce61950504f9637a6e669ed.tar.gz
add ydb deps
Diffstat (limited to 'contrib/python/Pygments/py3/pygments/lexers/mime.py')
-rw-r--r--  contrib/python/Pygments/py3/pygments/lexers/mime.py  210
1 file changed, 210 insertions(+), 0 deletions(-)
diff --git a/contrib/python/Pygments/py3/pygments/lexers/mime.py b/contrib/python/Pygments/py3/pygments/lexers/mime.py
new file mode 100644
index 0000000000..8bf16f74fd
--- /dev/null
+++ b/contrib/python/Pygments/py3/pygments/lexers/mime.py
@@ -0,0 +1,210 @@
+"""
+    pygments.lexers.mime
+    ~~~~~~~~~~~~~~~~~~~~
+
+    Lexer for Multipurpose Internet Mail Extensions (MIME) data.
+
+    :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import re
+
+from pygments.lexer import RegexLexer, include
+from pygments.lexers import get_lexer_for_mimetype
+from pygments.token import Text, Name, String, Operator, Comment, Other
+from pygments.util import get_int_opt, ClassNotFound
+
+__all__ = ["MIMELexer"]
+
+
+class MIMELexer(RegexLexer):
+    """
+    Lexer for Multipurpose Internet Mail Extensions (MIME) data. This lexer is
+    designed to process nested multipart data.
+
+    It assumes that the given data contains both header and body (and is
+    split at an empty line). If no valid header is found, then the entire data
+    will be treated as body.
+
+    Additional options accepted:
+
+    `MIME-max-level`
+        Max recursion level for nested MIME structure. Any negative number
+        is treated as unlimited. (default: -1)
+
+    `Content-Type`
+        Treat the data as a specific content type. Useful when the header is
+        missing; otherwise this lexer parses the content type from the
+        header. (default: `text/plain`)
+
+    `Multipart-Boundary`
+        Set the default multipart boundary delimiter. This option is only
+        used when `Content-Type` is `multipart` and the header is missing;
+        otherwise this lexer parses the boundary from the header.
+        (default: None)
+
+    `Content-Transfer-Encoding`
+        Treat the data as a specific encoding; otherwise this lexer parses
+        it from the header. (default: None)
+
+    .. versionadded:: 2.5
+    """
+
+    name = "MIME"
+    aliases = ["mime"]
+    mimetypes = ["multipart/mixed",
+                 "multipart/related",
+                 "multipart/alternative"]
+
+    def __init__(self, **options):
+        super().__init__(**options)
+        self.boundary = options.get("Multipart-Boundary")
+        self.content_transfer_encoding = options.get("Content_Transfer_Encoding")
+        self.content_type = options.get("Content_Type", "text/plain")
+        self.max_nested_level = get_int_opt(options, "MIME-max-level", -1)
+
+    def get_header_tokens(self, match):
+        field = match.group(1)
+
+        if field.lower() in self.attention_headers:
+            yield match.start(1), Name.Tag, field + ":"
+            yield match.start(2), Text.Whitespace, match.group(2)
+
+            pos = match.end(2)
+            body = match.group(3)
+            for i, t, v in self.get_tokens_unprocessed(body, ("root", field.lower())):
+                yield pos + i, t, v
+
+        else:
+            yield match.start(), Comment, match.group()
+
+    def get_body_tokens(self, match):
+        pos_body_start = match.start()
+        entire_body = match.group()
+
+        # skip first newline
+        if entire_body[0] == '\n':
+            yield pos_body_start, Text.Whitespace, '\n'
+            pos_body_start = pos_body_start + 1
+            entire_body = entire_body[1:]
+
+        # if it is not a multipart
+        if not self.content_type.startswith("multipart") or not self.boundary:
+            for i, t, v in self.get_bodypart_tokens(entire_body):
+                yield pos_body_start + i, t, v
+            return
+
+        # find boundary
+        bdry_pattern = r"^--%s(--)?\n" % re.escape(self.boundary)
+        bdry_matcher = re.compile(bdry_pattern, re.MULTILINE)
+
+        # some data has prefix text before the first boundary
+        m = bdry_matcher.search(entire_body)
+        if m:
+            pos_part_start = pos_body_start + m.end()
+            pos_iter_start = lpos_end = m.end()
+            yield pos_body_start, Text, entire_body[:m.start()]
+            yield pos_body_start + lpos_end, String.Delimiter, m.group()
+        else:
+            pos_part_start = pos_body_start
+            pos_iter_start = 0
+
+        # process tokens of each body part
+        for m in bdry_matcher.finditer(entire_body, pos_iter_start):
+            # bodypart
+            lpos_start = pos_part_start - pos_body_start
+            lpos_end = m.start()
+            part = entire_body[lpos_start:lpos_end]
+            for i, t, v in self.get_bodypart_tokens(part):
+                yield pos_part_start + i, t, v
+
+            # boundary
+            yield pos_body_start + lpos_end, String.Delimiter, m.group()
+            pos_part_start = pos_body_start + m.end()
+
+        # some data has suffix text after the last boundary
+        lpos_start = pos_part_start - pos_body_start
+        if lpos_start != len(entire_body):
+            yield pos_part_start, Text, entire_body[lpos_start:]
+
+    def get_bodypart_tokens(self, text):
+        # return the text unlexed if:
+        #  * there is no content
+        #  * no content type is specified
+        #  * the content encoding is not readable
+        #  * the max recursion level is exceeded
+        if not text.strip() or not self.content_type:
+            return [(0, Other, text)]
+
+        cte = self.content_transfer_encoding
+        if cte and cte not in {"8bit", "7bit", "quoted-printable"}:
+            return [(0, Other, text)]
+
+        if self.max_nested_level == 0:
+            return [(0, Other, text)]
+
+        # get lexer
+        try:
+            lexer = get_lexer_for_mimetype(self.content_type)
+        except ClassNotFound:
+            return [(0, Other, text)]
+
+        if isinstance(lexer, type(self)):
+            lexer.max_nested_level = self.max_nested_level - 1
+
+        return lexer.get_tokens_unprocessed(text)
+
+    def store_content_type(self, match):
+        self.content_type = match.group(1)
+
+        prefix_len = match.start(1) - match.start(0)
+        yield match.start(0), Text.Whitespace, match.group(0)[:prefix_len]
+        yield match.start(1), Name.Label, match.group(2)
+        yield match.end(2), String.Delimiter, '/'
+        yield match.start(3), Name.Label, match.group(3)
+
+    def get_content_type_subtokens(self, match):
+        yield match.start(1), Text, match.group(1)
+        yield match.start(2), Text.Whitespace, match.group(2)
+        yield match.start(3), Name.Attribute, match.group(3)
+        yield match.start(4), Operator, match.group(4)
+        yield match.start(5), String, match.group(5)
+
+        if match.group(3).lower() == "boundary":
+            boundary = match.group(5).strip()
+            if boundary[0] == '"' and boundary[-1] == '"':
+                boundary = boundary[1:-1]
+            self.boundary = boundary
+
+    def store_content_transfer_encoding(self, match):
+        self.content_transfer_encoding = match.group(0).lower()
+        yield match.start(0), Name.Constant, match.group(0)
+
+    attention_headers = {"content-type", "content-transfer-encoding"}
+
+    tokens = {
+        "root": [
+            (r"^([\w-]+):( *)([\s\S]*?\n)(?![ \t])", get_header_tokens),
+            (r"^$[\s\S]+", get_body_tokens),
+        ],
+        "header": [
+            # folding
+            (r"\n[ \t]", Text.Whitespace),
+            (r"\n(?![ \t])", Text.Whitespace, "#pop"),
+        ],
+        "content-type": [
+            include("header"),
+            (
+                r"^\s*((multipart|application|audio|font|image|model|text|video"
+                r"|message)/([\w-]+))",
+                store_content_type,
+            ),
+            (r'(;)((?:[ \t]|\n[ \t])*)([\w:-]+)(=)([\s\S]*?)(?=;|\n(?![ \t]))',
+             get_content_type_subtokens),
+            (r';[ \t]*\n(?![ \t])', Text, '#pop'),
+        ],
+        "content-transfer-encoding": [
+            include("header"),
+            (r"([\w-]+)", store_content_transfer_encoding),
+        ],
+    }
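
For reference, a minimal usage sketch (not part of the commit above): MIMELexer reads the multipart boundary from the Content-Type header, splits the body on it, and dispatches each part to the lexer registered for that part's declared content type. The sample message and its boundary string are invented for illustration.

    from pygments import highlight
    from pygments.formatters import TerminalFormatter
    from pygments.lexers.mime import MIMELexer

    # An invented two-part message; the boundary "XXXX" is declared in the
    # Content-Type header, so get_content_type_subtokens() stores it and
    # get_body_tokens() later uses it to split the parts.
    message = """\
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="XXXX"

    --XXXX
    Content-Type: text/plain

    Hello, plain part.
    --XXXX
    Content-Type: text/html

    <p>Hello, HTML part.</p>
    --XXXX--
    """

    # Each part is handed to the lexer registered for its content type
    # (text/plain, text/html, ...) via get_lexer_for_mimetype().
    print(highlight(message, MIMELexer(), TerminalFormatter()))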
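
A second sketch covering the option path for data that carries no headers of its own, assuming the option spellings the constructor actually reads: the class docstring advertises hyphenated `Content-Type` and `Content-Transfer-Encoding`, but __init__ above looks them up as Content_Type and Content_Transfer_Encoding with underscores, while Multipart-Boundary and MIME-max-level are hyphenated. The sample data and the "frontier" boundary are invented.

    from pygments.lexers.mime import MIMELexer

    # Invented headerless data; the body rule (r"^$[\s\S]+") only fires at an
    # empty line, so the sample deliberately starts with one.
    body_only = """\

    --frontier
    Content-Type: text/plain

    Part one.
    --frontier
    Content-Type: text/plain

    Part two.
    --frontier--
    """

    lexer = MIMELexer(
        Content_Type="multipart/mixed",        # underscore spelling, as read by __init__
        **{
            "Multipart-Boundary": "frontier",  # hyphenated keys need the dict form
            "MIME-max-level": 1,               # stop recursing after one nesting level
        },
    )
    for token_type, value in lexer.get_tokens(body_only):
        print(token_type, repr(value))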