oss ydb: fix dstool building and test run

author: nkozlovskiy <nmk@ydb.tech> 2023-10-02 18:57:38 +0300
committer: nkozlovskiy <nmk@ydb.tech> 2023-10-02 19:39:06 +0300
commit: 6295ef4d23465c11296e898b9dc4524ad9592b5d (patch)
tree: fc0c852877b2c52f365a1f6ed0710955844338c2 /contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py
parent: de63c80b75948ecc13894854514d147840ff8430 (diff)
download: ydb-6295ef4d23465c11296e898b9dc4524ad9592b5d.tar.gz
1 files changed, 162 insertions, 0 deletions
diff --git a/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py b/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py
new file mode 100644
index 0000000000..aa583dfa5f
--- /dev/null
+++ b/contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py
@@ -0,0 +1,162 @@
+"""Backport of tokenize.open from Python 3.5
+
+This is the exact Python 3.5 implementation, with the following differences:
+ - detect_encoding_ex is detect_encoding from Python 3.5, additionally returning a bool indicating whether a cookie was found
+ - detect_encoding calls detect_encoding_ex, so that its signature is the same as in Python 3.5
+ - function read_source_lines was added
+"""
+
+from codecs import lookup, BOM_UTF8
+from io import TextIOWrapper, open as _builtin_open
+import re
+
+re_ASCII = 256 # not present in Python 2
+cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re_ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re_ASCII)
+
+
+def _get_normal_name(orig_enc):
+	"""Imitates get_normal_name in tokenizer.c."""
+	# Only care about the first 12 characters.
+	enc = orig_enc[:12].lower().replace("_", "-")
+	if enc == "utf-8" or enc.startswith("utf-8-"):
+		return "utf-8"
+	if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+			enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+		return "iso-8859-1"
+	return orig_enc
+
+
+def detect_encoding(readline):
+	"""
+	The detect_encoding() function is used to detect the encoding that should
+	be used to decode a Python source file.  It requires one argument, readline,
+	in the same way as the tokenize() generator.
+	
+	It will call readline a maximum of twice, and return the encoding used
+	(as a string) and a list of any lines (left as bytes) it has read in.
+	
+	It detects the encoding from the presence of a utf-8 bom or an encoding
+	cookie as specified in pep-0263.  If both a bom and a cookie are present,
+	but disagree, a SyntaxError will be raised.  If the encoding cookie is an
+	invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
+	'utf-8-sig' is returned.
+	
+	If no encoding is specified, then the default of 'utf-8' will be returned.
+	"""
+	
+	return detect_encoding_ex(readline)[:2]
+
+
+def detect_encoding_ex(readline):
+	try:
+		filename = readline.__self__.name
+	except AttributeError:
+		filename = None
+	bom_found = False
+	encoding = None
+	default = 'utf-8'
+	def read_or_stop():
+		try:
+			return readline()
+		except StopIteration:
+			return b''
+	
+	def find_cookie(line):
+		try:
+			# Decode as UTF-8. Either the line is an encoding declaration,
+			# in which case it should be pure ASCII, or it must be UTF-8
+			# per default encoding.
+			line_string = line.decode('utf-8')
+		except UnicodeDecodeError:
+			msg = "invalid or missing encoding declaration"
+			if filename is not None:
+				msg = '{} for {!r}'.format(msg, filename)
+			raise SyntaxError(msg)
+		
+		match = cookie_re.match(line_string)
+		if not match:
+			return None
+		encoding = _get_normal_name(match.group(1))
+		try:
+			codec = lookup(encoding)
+		except LookupError:
+			# This behaviour mimics the Python interpreter
+			if filename is None:
+				msg = "unknown encoding: " + encoding
+			else:
+				msg = "unknown encoding for {!r}: {}".format(filename,
+						encoding)
+			raise SyntaxError(msg)
+		
+		if bom_found:
+			if encoding != 'utf-8':
+				# This behaviour mimics the Python interpreter
+				if filename is None:
+					msg = 'encoding problem: utf-8'
+				else:
+					msg = 'encoding problem for {!r}: utf-8'.format(filename)
+				raise SyntaxError(msg)
+			encoding += '-sig'
+		return encoding
+	
+	first = read_or_stop()
+	if first.startswith(BOM_UTF8):
+		bom_found = True
+		first = first[3:]
+		default = 'utf-8-sig'
+	if not first:
+		return default, [], False
+	
+	encoding = find_cookie(first)
+	if encoding:
+		return encoding, [first], True
+	if not blank_re.match(first):
+		return default, [first], False
+	
+	second = read_or_stop()
+	if not second:
+		return default, [first], False
+	
+	encoding = find_cookie(second)
+	if encoding:
+		return encoding, [first, second], True
+	
+	return default, [first, second], False
+
+
+def open(filename):
+	"""Open a file in read only mode using the encoding detected by
+	detect_encoding().
+	"""
+	buffer = _builtin_open(filename, 'rb')
+	try:
+		encoding, lines = detect_encoding(buffer.readline)
+		buffer.seek(0)
+		text = TextIOWrapper(buffer, encoding, line_buffering=True)
+		text.mode = 'r'
+		return text
+	except:
+		buffer.close()
+		raise
+
+def read_source_lines(filename):
+	buffer = _builtin_open(filename, 'rb')
+	try:
+		encoding, lines, cookie_present = detect_encoding_ex(buffer.readline)
+		buffer.seek(0)
+		text = TextIOWrapper(buffer, encoding, line_buffering=True)
+		text.mode = 'r'
+	except:
+		buffer.close()
+		raise
+	
+	with text:
+		if cookie_present:
+			for i in lines:
+				yield text.readline().replace("coding", "Coding")
+				# so compile() won't complain about encoding declaration in a Unicode string
+				# see 2.7/Python/ast.c:228
+		
+		for line in text:
+			yield line
author	nkozlovskiy <nmk@ydb.tech>	2023-10-02 18:57:38 +0300
committer	nkozlovskiy <nmk@ydb.tech>	2023-10-02 19:39:06 +0300
commit	6295ef4d23465c11296e898b9dc4524ad9592b5d (patch)
tree	fc0c852877b2c52f365a1f6ed0710955844338c2 /contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py
parent	de63c80b75948ecc13894854514d147840ff8430 (diff)
download	ydb-6295ef4d23465c11296e898b9dc4524ad9592b5d.tar.gz