aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py
diff options
context:
space:
mode:
authornkozlovskiy <nmk@ydb.tech>2023-10-02 18:57:38 +0300
committernkozlovskiy <nmk@ydb.tech>2023-10-02 19:39:06 +0300
commit6295ef4d23465c11296e898b9dc4524ad9592b5d (patch)
treefc0c852877b2c52f365a1f6ed0710955844338c2 /contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py
parentde63c80b75948ecc13894854514d147840ff8430 (diff)
downloadydb-6295ef4d23465c11296e898b9dc4524ad9592b5d.tar.gz
oss ydb: fix dstool building and test run
Diffstat (limited to 'contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py')
-rw-r--r--contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py162
1 files changed, 162 insertions, 0 deletions
diff --git a/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py b/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py
new file mode 100644
index 0000000000..aa583dfa5f
--- /dev/null
+++ b/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py
@@ -0,0 +1,162 @@
+"""Backport of tokenize.open from Python 3.5
+
+This is the exact Python 3.5 with the following differences:
+ - detect_encoding_ex is detect_encoding from Python 3.5 returning also a bool whether a cookie was found
+ - detect_encoding calls detect_encoding_ex, so that its signature is the same as in Python 3.5
+ - function read_source_lines was added
+"""
+
+from codecs import lookup, BOM_UTF8
+from io import TextIOWrapper, open as _builtin_open
+import re
+
+re_ASCII = 256 # not present in Python 2
+cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re_ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re_ASCII)
+
+
+def _get_normal_name(orig_enc):
+ """Imitates get_normal_name in tokenizer.c."""
+ # Only care about the first 12 characters.
+ enc = orig_enc[:12].lower().replace("_", "-")
+ if enc == "utf-8" or enc.startswith("utf-8-"):
+ return "utf-8"
+ if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+ enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+ return "iso-8859-1"
+ return orig_enc
+
+
+def detect_encoding(readline):
+ """
+ The detect_encoding() function is used to detect the encoding that should
+ be used to decode a Python source file. It requires one argument, readline,
+ in the same way as the tokenize() generator.
+
+ It will call readline a maximum of twice, and return the encoding used
+ (as a string) and a list of any lines (left as bytes) it has read in.
+
+ It detects the encoding from the presence of a utf-8 bom or an encoding
+ cookie as specified in pep-0263. If both a bom and a cookie are present,
+ but disagree, a SyntaxError will be raised. If the encoding cookie is an
+ invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+ 'utf-8-sig' is returned.
+
+ If no encoding is specified, then the default of 'utf-8' will be returned.
+ """
+
+ return detect_encoding_ex(readline)[:2]
+
+
+def detect_encoding_ex(readline):
+ try:
+ filename = readline.__self__.name
+ except AttributeError:
+ filename = None
+ bom_found = False
+ encoding = None
+ default = 'utf-8'
+ def read_or_stop():
+ try:
+ return readline()
+ except StopIteration:
+ return b''
+
+ def find_cookie(line):
+ try:
+ # Decode as UTF-8. Either the line is an encoding declaration,
+ # in which case it should be pure ASCII, or it must be UTF-8
+ # per default encoding.
+ line_string = line.decode('utf-8')
+ except UnicodeDecodeError:
+ msg = "invalid or missing encoding declaration"
+ if filename is not None:
+ msg = '{} for {!r}'.format(msg, filename)
+ raise SyntaxError(msg)
+
+ match = cookie_re.match(line_string)
+ if not match:
+ return None
+ encoding = _get_normal_name(match.group(1))
+ try:
+ codec = lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ if filename is None:
+ msg = "unknown encoding: " + encoding
+ else:
+ msg = "unknown encoding for {!r}: {}".format(filename,
+ encoding)
+ raise SyntaxError(msg)
+
+ if bom_found:
+ if encoding != 'utf-8':
+ # This behaviour mimics the Python interpreter
+ if filename is None:
+ msg = 'encoding problem: utf-8'
+ else:
+ msg = 'encoding problem for {!r}: utf-8'.format(filename)
+ raise SyntaxError(msg)
+ encoding += '-sig'
+ return encoding
+
+ first = read_or_stop()
+ if first.startswith(BOM_UTF8):
+ bom_found = True
+ first = first[3:]
+ default = 'utf-8-sig'
+ if not first:
+ return default, [], False
+
+ encoding = find_cookie(first)
+ if encoding:
+ return encoding, [first], True
+ if not blank_re.match(first):
+ return default, [first], False
+
+ second = read_or_stop()
+ if not second:
+ return default, [first], False
+
+ encoding = find_cookie(second)
+ if encoding:
+ return encoding, [first, second], True
+
+ return default, [first, second], False
+
+
+def open(filename):
+ """Open a file in read only mode using the encoding detected by
+ detect_encoding().
+ """
+ buffer = _builtin_open(filename, 'rb')
+ try:
+ encoding, lines = detect_encoding(buffer.readline)
+ buffer.seek(0)
+ text = TextIOWrapper(buffer, encoding, line_buffering=True)
+ text.mode = 'r'
+ return text
+ except:
+ buffer.close()
+ raise
+
+def read_source_lines(filename):
+ buffer = _builtin_open(filename, 'rb')
+ try:
+ encoding, lines, cookie_present = detect_encoding_ex(buffer.readline)
+ buffer.seek(0)
+ text = TextIOWrapper(buffer, encoding, line_buffering=True)
+ text.mode = 'r'
+ except:
+ buffer.close()
+ raise
+
+ with text:
+ if cookie_present:
+ for i in lines:
+ yield text.readline().replace("coding", "Coding")
+ # so compile() won't complain about encoding declatation in a Unicode string
+ # see 2.7/Python/ast.c:228
+
+ for line in text:
+ yield line