summary refs log tree commit diff stats
path: root/contrib/tools/python3/Lib/html/parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/tools/python3/Lib/html/parser.py')
-rw-r--r-- contrib/tools/python3/Lib/html/parser.py 315
1 files changed, 206 insertions, 109 deletions
diff --git a/contrib/tools/python3/Lib/html/parser.py b/contrib/tools/python3/Lib/html/parser.py
index 13c95c34e50..3aa86f8f30e 100644
--- a/contrib/tools/python3/Lib/html/parser.py
+++ b/contrib/tools/python3/Lib/html/parser.py
@@ -12,6 +12,7 @@ import re
import _markupbase
from html import unescape
+from html.entities import html5 as html5_entities
__all__ = ['HTMLParser']
@@ -23,20 +24,52 @@ incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
+attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
starttagopen = re.compile('<[a-zA-Z]')
+endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
-commentclose = re.compile(r'--\s*>')
+commentclose = re.compile(r'--!?>')
+commentabruptclose = re.compile(r'-?>')
# Note:
-# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
-# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
+# 1) if you change tagfind/attrfind remember to update locatetagend too;
+# 2) if you change tagfind/attrfind and/or locatetagend the parser will
# explode, so don't do it.
-# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
-# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
-tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
-attrfind_tolerant = re.compile(
- r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
+# see the HTML5 specs section "13.2.5.6 Tag open state",
+# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
+# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
+# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
+tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
+attrfind_tolerant = re.compile(r"""
+ (
+ (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
+ )
+ ([\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
+ ('[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^>\t\n\r\f ]* # bare value
+ )
+ )?
+ (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
+""", re.VERBOSE)
+locatetagend = re.compile(r"""
+ [a-zA-Z][^\t\n\r\f />]* # tag name
+ [\t\n\r\f /]* # optional whitespace before attribute name
+ (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
+ (?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^>\t\n\r\f ]* # bare value
+ )
+ )?
+ [\t\n\r\f /]* # possibly followed by a space
+ )*
+ >?
+""", re.VERBOSE)
+# The following variables are not used, but are temporarily left for
+# backward compatibility.
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
(?:[\s/]* # optional whitespace before attribute name
@@ -53,10 +86,24 @@ locatestarttagend_tolerant = re.compile(r"""
\s* # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
-# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
-# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
+# Character reference processing logic specific to attribute values
+# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+def _replace_attr_charref(match):
+ ref = match.group(0)
+ # Numeric / hex char refs must always be unescaped
+ if ref.startswith('&#'):
+ return unescape(ref)
+ # Named character / entity references must only be unescaped
+ # if they are an exact match, and they are not followed by an equals sign
+ if not ref.endswith('=') and ref[1:] in html5_entities:
+ return unescape(ref)
+ # Otherwise do not unescape
+ return ref
+
+def _unescape_attrvalue(s):
+ return attr_charref.sub(_replace_attr_charref, s)
class HTMLParser(_markupbase.ParserBase):
@@ -81,16 +128,25 @@ class HTMLParser(_markupbase.ParserBase):
argument.
"""
- CDATA_CONTENT_ELEMENTS = ("script", "style")
+ # See the HTML5 specs section "13.4 Parsing HTML fragments".
+ # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+ # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
+ CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
+ RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
- def __init__(self, *, convert_charrefs=True):
+ def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.
- If convert_charrefs is True (the default), all character references
+ If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.
+
+ If *scripting* is false (the default), the content of the
+ ``noscript`` element is parsed normally; if it's true,
+ it's returned as is without being parsed.
"""
super().__init__()
self.convert_charrefs = convert_charrefs
+ self.scripting = scripting
self.reset()
def reset(self):
@@ -99,6 +155,8 @@ class HTMLParser(_markupbase.ParserBase):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
+ self._support_cdata = True
+ self._escapable = True
super().reset()
def feed(self, data):
@@ -120,13 +178,35 @@ class HTMLParser(_markupbase.ParserBase):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text
- def set_cdata_mode(self, elem):
+ def set_cdata_mode(self, elem, *, escapable=False):
self.cdata_elem = elem.lower()
- self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
+ self._escapable = escapable
+ if self.cdata_elem == 'plaintext':
+ self.interesting = re.compile(r'\Z')
+ elif escapable and not self.convert_charrefs:
+ self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
+ re.IGNORECASE|re.ASCII)
+ else:
+ self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
+ re.IGNORECASE|re.ASCII)
def clear_cdata_mode(self):
self.interesting = interesting_normal
self.cdata_elem = None
+ self._escapable = True
+
+ def _set_support_cdata(self, flag=True):
+ """Enable or disable support of the CDATA sections.
+ If enabled, "<[CDATA[" starts a CDATA section which ends with "]]>".
+ If disabled, "<[CDATA[" starts a bogus comment which ends with ">".
+
+ This method is not called by default. Its purpose is to be called
+ in custom handle_starttag() and handle_endtag() methods, with
+ a value that depends on the adjusted current node.
+ See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+ for details.
+ """
+ self._support_cdata = flag
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
@@ -147,7 +227,7 @@ class HTMLParser(_markupbase.ParserBase):
# & near the end and see if it's followed by a space or ;.
amppos = rawdata.rfind('&', max(i, n-34))
if (amppos >= 0 and
- not re.compile(r'[\s;]').search(rawdata, amppos)):
+ not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
break # wait till we get all the text
j = n
else:
@@ -159,7 +239,7 @@ class HTMLParser(_markupbase.ParserBase):
break
j = n
if i < j:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
@@ -177,7 +257,7 @@ class HTMLParser(_markupbase.ParserBase):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
- elif (i + 1) < n:
+ elif (i + 1) < n or end:
self.handle_data("<")
k = i + 1
else:
@@ -185,17 +265,35 @@ class HTMLParser(_markupbase.ParserBase):
if k < 0:
if not end:
break
- k = rawdata.find('>', i + 1)
- if k < 0:
- k = rawdata.find('<', i + 1)
- if k < 0:
- k = i + 1
- else:
- k += 1
- if self.convert_charrefs and not self.cdata_elem:
- self.handle_data(unescape(rawdata[i:k]))
+ if starttagopen.match(rawdata, i): # < + letter
+ pass
+ elif startswith("</", i):
+ if i + 2 == n:
+ self.handle_data("</")
+ elif endtagopen.match(rawdata, i): # </ + letter
+ pass
+ else:
+ # bogus comment
+ self.handle_comment(rawdata[i+2:])
+ elif startswith("<!--", i):
+ j = n
+ for suffix in ("--!", "--", "-"):
+ if rawdata.endswith(suffix, i+4):
+ j -= len(suffix)
+ break
+ self.handle_comment(rawdata[i+4:j])
+ elif startswith("<![CDATA[", i) and self._support_cdata:
+ self.unknown_decl(rawdata[i+3:])
+ elif rawdata[i:i+9].lower() == '<!doctype':
+ self.handle_decl(rawdata[i+2:])
+ elif startswith("<!", i):
+ # bogus comment
+ self.handle_comment(rawdata[i+2:])
+ elif startswith("<?", i):
+ self.handle_pi(rawdata[i+2:])
else:
- self.handle_data(rawdata[i:k])
+ raise AssertionError("we should not get here!")
+ k = n
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
@@ -207,10 +305,20 @@ class HTMLParser(_markupbase.ParserBase):
k = k - 1
i = self.updatepos(i, k)
continue
+ match = incomplete_charref.match(rawdata, i)
+ if match:
+ if end:
+ self.handle_charref(rawdata[i+2:])
+ i = self.updatepos(i, n)
+ break
+ # incomplete
+ break
+ elif i + 3 < n: # larger than "&#x"
+ # not the end of the buffer, and can't be confused
+ # with some other construct
+ self.handle_data("&#")
+ i = self.updatepos(i, i + 2)
else:
- if ";" in rawdata[i:]: # bail by consuming &#
- self.handle_data(rawdata[i:i+2])
- i = self.updatepos(i, i+2)
break
elif startswith('&', i):
match = entityref.match(rawdata, i)
@@ -224,15 +332,13 @@ class HTMLParser(_markupbase.ParserBase):
continue
match = incomplete.match(rawdata, i)
if match:
- # match.group() will contain at least 2 chars
- if end and match.group() == rawdata[i:]:
- k = match.end()
- if k <= i:
- k = n
- i = self.updatepos(i, i + 1)
+ if end:
+ self.handle_entityref(rawdata[i+1:])
+ i = self.updatepos(i, n)
+ break
# incomplete
break
- elif (i + 1) < n:
+ elif i + 1 < n:
# not the end of the buffer, and can't be confused
# with some other construct
self.handle_data("&")
@@ -242,8 +348,8 @@ class HTMLParser(_markupbase.ParserBase):
else:
assert 0, "interesting.search() lied"
# end while
- if end and i < n and not self.cdata_elem:
- if self.convert_charrefs and not self.cdata_elem:
+ if end and i < n:
+ if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
@@ -260,8 +366,12 @@ class HTMLParser(_markupbase.ParserBase):
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
- elif rawdata[i:i+3] == '<![':
- return self.parse_marked_section(i)
+ elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
+ j = rawdata.find(']]>', i+9)
+ if j < 0:
+ return -1
+ self.unknown_decl(rawdata[i+3: j])
+ return j + 3
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)
@@ -272,12 +382,27 @@ class HTMLParser(_markupbase.ParserBase):
else:
return self.parse_bogus_comment(i)
+ # Internal -- parse comment, return length or -1 if not terminated
+ # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
+ def parse_comment(self, i, report=True):
+ rawdata = self.rawdata
+ assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
+ match = commentclose.search(rawdata, i+4)
+ if not match:
+ match = commentabruptclose.match(rawdata, i+4)
+ if not match:
+ return -1
+ if report:
+ j = match.start()
+ self.handle_comment(rawdata[i+4: j])
+ return match.end()
+
# Internal -- parse bogus comment, return length or -1 if not terminated
- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
- 'parse_comment()')
+ 'parse_bogus_comment()')
pos = rawdata.find('>', i+2)
if pos == -1:
return -1
@@ -299,6 +424,8 @@ class HTMLParser(_markupbase.ParserBase):
# Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
+ # See the HTML5 specs section "13.2.5.8 Tag name state"
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
@@ -323,7 +450,7 @@ class HTMLParser(_markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
- attrvalue = unescape(attrvalue)
+ attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
@@ -336,84 +463,54 @@ class HTMLParser(_markupbase.ParserBase):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
+ if (tag in self.CDATA_CONTENT_ELEMENTS or
+ (self.scripting and tag == "noscript") or
+ tag == "plaintext"):
+ self.set_cdata_mode(tag, escapable=False)
+ elif tag in self.RCDATA_CONTENT_ELEMENTS:
+ self.set_cdata_mode(tag, escapable=True)
return endpos
# Internal -- check to see if we have a complete starttag; return end
# or -1 if incomplete.
def check_for_whole_start_tag(self, i):
rawdata = self.rawdata
- m = locatestarttagend_tolerant.match(rawdata, i)
- if m:
- j = m.end()
- next = rawdata[j:j+1]
- if next == ">":
- return j + 1
- if next == "/":
- if rawdata.startswith("/>", j):
- return j + 2
- if rawdata.startswith("/", j):
- # buffer boundary
- return -1
- # else bogus input
- if j > i:
- return j
- else:
- return i + 1
- if next == "":
- # end of input
- return -1
- if next in ("abcdefghijklmnopqrstuvwxyz=/"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
- # end of input in or before attribute value, or we have the
- # '/' from a '/>' ending
- return -1
- if j > i:
- return j
- else:
- return i + 1
- raise AssertionError("we should not get here!")
+ match = locatetagend.match(rawdata, i+1)
+ assert match
+ j = match.end()
+ if rawdata[j-1] != ">":
+ return -1
+ return j
# Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
+ # See the HTML5 specs section "13.2.5.7 End tag open state"
+ # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
rawdata = self.rawdata
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
- match = endendtag.search(rawdata, i+1) # >
- if not match:
+ if rawdata.find('>', i+2) < 0: # fast check
return -1
- gtpos = match.end()
- match = endtagfind.match(rawdata, i) # </ + tag + >
- if not match:
- if self.cdata_elem is not None:
- self.handle_data(rawdata[i:gtpos])
- return gtpos
- # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
- namematch = tagfind_tolerant.match(rawdata, i+2)
- if not namematch:
- # w3.org/TR/html5/tokenization.html#end-tag-open-state
- if rawdata[i:i+3] == '</>':
- return i+3
- else:
- return self.parse_bogus_comment(i)
- tagname = namematch.group(1).lower()
- # consume and ignore other stuff between the name and the >
- # Note: this is not 100% correct, since we might have things like
- # </tag attr=">">, but looking for > after the name should cover
- # most of the cases and is much simpler
- gtpos = rawdata.find('>', namematch.end())
- self.handle_endtag(tagname)
- return gtpos+1
+ if not endtagopen.match(rawdata, i): # </ + letter
+ if rawdata[i+2:i+3] == '>': # </> is ignored
+ # "missing-end-tag-name" parser error
+ return i+3
+ else:
+ return self.parse_bogus_comment(i)
- elem = match.group(1).lower() # script or style
- if self.cdata_elem is not None:
- if elem != self.cdata_elem:
- self.handle_data(rawdata[i:gtpos])
- return gtpos
+ match = locatetagend.match(rawdata, i+2)
+ assert match
+ j = match.end()
+ if rawdata[j-1] != ">":
+ return -1
- self.handle_endtag(elem)
+ # find the name: "13.2.5.8 Tag name state"
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
+ match = tagfind_tolerant.match(rawdata, i+2)
+ assert match
+ tag = match.group(1).lower()
+ self.handle_endtag(tag)
self.clear_cdata_mode()
- return gtpos
+ return j
# Overridable -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs):