diff options
author | nkozlovskiy <nmk@ydb.tech> | 2023-10-02 18:57:38 +0300 |
---|---|---|
committer | nkozlovskiy <nmk@ydb.tech> | 2023-10-02 19:39:06 +0300 |
commit | 6295ef4d23465c11296e898b9dc4524ad9592b5d (patch) | |
tree | fc0c852877b2c52f365a1f6ed0710955844338c2 /contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py | |
parent | de63c80b75948ecc13894854514d147840ff8430 (diff) | |
download | ydb-6295ef4d23465c11296e898b9dc4524ad9592b5d.tar.gz |
oss ydb: fix dstool building and test run
Diffstat (limited to 'contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py')
-rw-r--r-- | contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py | 162 |
1 files changed, 162 insertions, 0 deletions
diff --git a/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py b/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py new file mode 100644 index 0000000000..aa583dfa5f --- /dev/null +++ b/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py @@ -0,0 +1,162 @@ +"""Backport of tokenize.open from Python 3.5 + +This is the exact Python 3.5 with the following differences: + - detect_encoding_ex is detect_encoding from Python 3.5 returning also a bool whether a cookie was found + - detect_encoding calls detect_encoding_ex, so that its signature is the same as in Python 3.5 + - function read_source_lines was added +""" + +from codecs import lookup, BOM_UTF8 +from io import TextIOWrapper, open as _builtin_open +import re + +re_ASCII = 256 # not present in Python 2 +cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re_ASCII) +blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re_ASCII) + + +def _get_normal_name(orig_enc): + """Imitates get_normal_name in tokenizer.c.""" + # Only care about the first 12 characters. + enc = orig_enc[:12].lower().replace("_", "-") + if enc == "utf-8" or enc.startswith("utf-8-"): + return "utf-8" + if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ + enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): + return "iso-8859-1" + return orig_enc + + +def detect_encoding(readline): + """ + The detect_encoding() function is used to detect the encoding that should + be used to decode a Python source file. It requires one argument, readline, + in the same way as the tokenize() generator. + + It will call readline a maximum of twice, and return the encoding used + (as a string) and a list of any lines (left as bytes) it has read in. + + It detects the encoding from the presence of a utf-8 bom or an encoding + cookie as specified in pep-0263. If both a bom and a cookie are present, + but disagree, a SyntaxError will be raised. If the encoding cookie is an + invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, + 'utf-8-sig' is returned. + + If no encoding is specified, then the default of 'utf-8' will be returned. + """ + + return detect_encoding_ex(readline)[:2] + + +def detect_encoding_ex(readline): + try: + filename = readline.__self__.name + except AttributeError: + filename = None + bom_found = False + encoding = None + default = 'utf-8' + def read_or_stop(): + try: + return readline() + except StopIteration: + return b'' + + def find_cookie(line): + try: + # Decode as UTF-8. Either the line is an encoding declaration, + # in which case it should be pure ASCII, or it must be UTF-8 + # per default encoding. + line_string = line.decode('utf-8') + except UnicodeDecodeError: + msg = "invalid or missing encoding declaration" + if filename is not None: + msg = '{} for {!r}'.format(msg, filename) + raise SyntaxError(msg) + + match = cookie_re.match(line_string) + if not match: + return None + encoding = _get_normal_name(match.group(1)) + try: + codec = lookup(encoding) + except LookupError: + # This behaviour mimics the Python interpreter + if filename is None: + msg = "unknown encoding: " + encoding + else: + msg = "unknown encoding for {!r}: {}".format(filename, + encoding) + raise SyntaxError(msg) + + if bom_found: + if encoding != 'utf-8': + # This behaviour mimics the Python interpreter + if filename is None: + msg = 'encoding problem: utf-8' + else: + msg = 'encoding problem for {!r}: utf-8'.format(filename) + raise SyntaxError(msg) + encoding += '-sig' + return encoding + + first = read_or_stop() + if first.startswith(BOM_UTF8): + bom_found = True + first = first[3:] + default = 'utf-8-sig' + if not first: + return default, [], False + + encoding = find_cookie(first) + if encoding: + return encoding, [first], True + if not blank_re.match(first): + return default, [first], False + + second = read_or_stop() + if not second: + return default, [first], False + + encoding = find_cookie(second) + if encoding: + return encoding, [first, second], True + + return default, [first, second], False + + +def open(filename): + """Open a file in read only mode using the encoding detected by + detect_encoding(). + """ + buffer = _builtin_open(filename, 'rb') + try: + encoding, lines = detect_encoding(buffer.readline) + buffer.seek(0) + text = TextIOWrapper(buffer, encoding, line_buffering=True) + text.mode = 'r' + return text + except: + buffer.close() + raise + +def read_source_lines(filename): + buffer = _builtin_open(filename, 'rb') + try: + encoding, lines, cookie_present = detect_encoding_ex(buffer.readline) + buffer.seek(0) + text = TextIOWrapper(buffer, encoding, line_buffering=True) + text.mode = 'r' + except: + buffer.close() + raise + + with text: + if cookie_present: + for i in lines: + yield text.readline().replace("coding", "Coding") + # so compile() won't complain about encoding declatation in a Unicode string + # see 2.7/Python/ast.c:228 + + for line in text: + yield line |