contrib/deprecated/python/win-unicode-console/win_unicode_console/tokenize_open.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162

"""Backport of tokenize.open from Python 3.5

This is the exact Python 3.5 with the following differences:
 - detect_encoding_ex is detect_encoding from Python 3.5 returning also a bool whether a cookie was found
 - detect_encoding calls detect_encoding_ex, so that its signature is the same as in Python 3.5
 - function read_source_lines was added
"""

from codecs import lookup, BOM_UTF8
from io import TextIOWrapper, open as _builtin_open
import re

re_ASCII = 256 # not present in Python 2
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re_ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re_ASCII)


def _get_normal_name(orig_enc):
	"""Imitates get_normal_name in tokenizer.c."""
	# Only care about the first 12 characters.
	enc = orig_enc[:12].lower().replace("_", "-")
	if enc == "utf-8" or enc.startswith("utf-8-"):
		return "utf-8"
	if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
			enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
		return "iso-8859-1"
	return orig_enc


def detect_encoding(readline):
	"""
	The detect_encoding() function is used to detect the encoding that should
	be used to decode a Python source file.  It requires one argument, readline,
	in the same way as the tokenize() generator.
	
	It will call readline a maximum of twice, and return the encoding used
	(as a string) and a list of any lines (left as bytes) it has read in.
	
	It detects the encoding from the presence of a utf-8 bom or an encoding
	cookie as specified in pep-0263.  If both a bom and a cookie are present,
	but disagree, a SyntaxError will be raised.  If the encoding cookie is an
	invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
	'utf-8-sig' is returned.
	
	If no encoding is specified, then the default of 'utf-8' will be returned.
	"""
	
	return detect_encoding_ex(readline)[:2]


def detect_encoding_ex(readline):
	try:
		filename = readline.__self__.name
	except AttributeError:
		filename = None
	bom_found = False
	encoding = None
	default = 'utf-8'
	def read_or_stop():
		try:
			return readline()
		except StopIteration:
			return b''
	
	def find_cookie(line):
		try:
			# Decode as UTF-8. Either the line is an encoding declaration,
			# in which case it should be pure ASCII, or it must be UTF-8
			# per default encoding.
			line_string = line.decode('utf-8')
		except UnicodeDecodeError:
			msg = "invalid or missing encoding declaration"
			if filename is not None:
				msg = '{} for {!r}'.format(msg, filename)
			raise SyntaxError(msg)
		
		match = cookie_re.match(line_string)
		if not match:
			return None
		encoding = _get_normal_name(match.group(1))
		try:
			codec = lookup(encoding)
		except LookupError:
			# This behaviour mimics the Python interpreter
			if filename is None:
				msg = "unknown encoding: " + encoding
			else:
				msg = "unknown encoding for {!r}: {}".format(filename,
						encoding)
			raise SyntaxError(msg)
		
		if bom_found:
			if encoding != 'utf-8':
				# This behaviour mimics the Python interpreter
				if filename is None:
					msg = 'encoding problem: utf-8'
				else:
					msg = 'encoding problem for {!r}: utf-8'.format(filename)
				raise SyntaxError(msg)
			encoding += '-sig'
		return encoding
	
	first = read_or_stop()
	if first.startswith(BOM_UTF8):
		bom_found = True
		first = first[3:]
		default = 'utf-8-sig'
	if not first:
		return default, [], False
	
	encoding = find_cookie(first)
	if encoding:
		return encoding, [first], True
	if not blank_re.match(first):
		return default, [first], False
	
	second = read_or_stop()
	if not second:
		return default, [first], False
	
	encoding = find_cookie(second)
	if encoding:
		return encoding, [first, second], True
	
	return default, [first, second], False


def open(filename):
	"""Open a file in read only mode using the encoding detected by
	detect_encoding().
	"""
	buffer = _builtin_open(filename, 'rb')
	try:
		encoding, lines = detect_encoding(buffer.readline)
		buffer.seek(0)
		text = TextIOWrapper(buffer, encoding, line_buffering=True)
		text.mode = 'r'
		return text
	except:
		buffer.close()
		raise

def read_source_lines(filename):
	buffer = _builtin_open(filename, 'rb')
	try:
		encoding, lines, cookie_present = detect_encoding_ex(buffer.readline)
		buffer.seek(0)
		text = TextIOWrapper(buffer, encoding, line_buffering=True)
		text.mode = 'r'
	except:
		buffer.close()
		raise
	
	with text:
		if cookie_present:
			for i in lines:
				yield text.readline().replace("coding", "Coding")
				# so compile() won't complain about encoding declatation in a Unicode string
				# see 2.7/Python/ast.c:228
		
		for line in text:
			yield line