diff options
author | shmel1k <shmel1k@ydb.tech> | 2023-11-26 18:16:14 +0300 |
---|---|---|
committer | shmel1k <shmel1k@ydb.tech> | 2023-11-26 18:43:30 +0300 |
commit | b8cf9e88f4c5c64d9406af533d8948deb050d695 (patch) | |
tree | 218eb61fb3c3b96ec08b4d8cdfef383104a87d63 /contrib/python/Twisted/py3/twisted/web/sux.py | |
parent | 523f645a83a0ec97a0332dbc3863bb354c92a328 (diff) | |
download | ydb-b8cf9e88f4c5c64d9406af533d8948deb050d695.tar.gz |
add kikimr_configure
Diffstat (limited to 'contrib/python/Twisted/py3/twisted/web/sux.py')
-rw-r--r-- | contrib/python/Twisted/py3/twisted/web/sux.py | 644 |
1 files changed, 644 insertions, 0 deletions
# -*- test-case-name: twisted.web.test.test_xml -*-
#
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.


"""
*S*mall, *U*ncomplicated *X*ML.

This is a very simple implementation of XML/HTML as a network
protocol. It is not at all clever. Its main features are that it
does not:

    - support namespaces
    - mung mnemonic entity references
    - validate
    - perform *any* external actions (such as fetching URLs or writing files)
      under *any* circumstances
    - has lots and lots of horrible hacks for supporting broken HTML (as an
      option, they're not on by default).
"""


from twisted.internet.protocol import Protocol
from twisted.python.reflect import prefixedMethodNames

# Positions of the three handlers inside each state-table tuple.
BEGIN_HANDLER = 0
DO_HANDLER = 1
END_HANDLER = 2

# Non-alphanumeric characters accepted in tag and attribute names; the
# lenient set adds a few more characters on top of the strict one.
identChars = ".-_:"
lenientIdentChars = identChars + ";+#/%~"


def nop(*args, **kw):
    """Do nothing."""
    return None
def unionlist(*args):
    """
    Collect the distinct items of several iterables.

    @param args: any number of iterables of hashable items.
    @return: a dictionary keys view containing every distinct item once, in
        the order in which it was first seen.
    """
    seen = {}
    for items in args:
        seen.update(dict.fromkeys(items, 1))
    return seen.keys()


def zipfndict(*args, **kw):
    """
    Zip several name -> function dictionaries into one dictionary of tuples.

    @param args: dictionaries mapping a name to a callable.
    @param kw: may contain C{default}, the callable used where a dictionary
        has no entry for a name (L{nop} when not given).
    @return: a dictionary mapping every name found in any input dictionary to
        a tuple holding one callable per input dictionary, in argument order.
    """
    default = kw.get("default", nop)
    return {
        key: tuple(fndict.get(key, default) for fndict in args)
        for key in unionlist(*(fndict.keys() for fndict in args))
    }


def prefixedMethodClassDict(clazz, prefix):
    """
    Map un-prefixed names to the attributes of C{clazz} that start with
    C{prefix}, as reported by L{prefixedMethodNames}.
    """
    return {
        name: getattr(clazz, prefix + name)
        for name in prefixedMethodNames(clazz, prefix)
    }


def prefixedMethodObjDict(obj, prefix):
    """
    Map un-prefixed names to the attributes of C{obj} that start with
    C{prefix}; the names are discovered on C{obj.__class__}, the values are
    looked up on C{obj} itself.
    """
    return {
        name: getattr(obj, prefix + name)
        for name in prefixedMethodNames(obj.__class__, prefix)
    }


class ParseError(Exception):
    """
    A parse failure at a known position of the input.

    @ivar filename: name of the document being parsed.
    @ivar line: line number of the error.
    @ivar col: column number of the error.
    @ivar message: human readable description of the problem.
    """

    def __init__(self, filename, line, col, message):
        # Hand the arguments to Exception too, so that ``args`` is populated
        # and the exception can be re-created from it (copy / pickle).
        super().__init__(filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self) -> str:
        return f"{self.filename}:{self.line}:{self.col}: {self.message}"


class XMLParser(Protocol):
    """
    A character-at-a-time XML/HTML state machine.

    Every state C{name} is implemented by up to three methods:
    C{begin_name(byte)}, C{do_name(byte)} and C{end_name()}.  C{do_name}
    returns the name of the next state, or L{None} to stay in the current
    one.  Parse events are reported through the C{got*} methods.
    """

    state = None
    encodings = None
    filename = "<xml />"
    beExtremelyLenient = 0
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    def connectionMade(self):
        """Reset the position counters and the list of encodings."""
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        """Get the line number and column of the last character parsed"""
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """
        Raise a L{ParseError} for the current file name and position.

        @raise ParseError: always.
        """
        raise ParseError(*((self.filename,) + self.saveMark() + (message,)))

    def _buildStateTable(self):
        """Return a dictionary of begin, do, end state function tuples"""
        # The table holds methods bound to this instance, so it is cached on
        # the instance.  The previous class-level cache was looked up under
        # the literal name "__stateTable" but stored under the name-mangled
        # "_XMLParser__stateTable": it never hit, the table was rebuilt on
        # every call, and the class kept the last parser instance alive.
        stateTable = getattr(self, "_cachedStateTable", None)
        if stateTable is None:
            stateTable = self._cachedStateTable = zipfndict(
                *(
                    prefixedMethodObjDict(self, prefix)
                    for prefix in ("begin_", "do_", "end_")
                )
            )
        return stateTable

    def _decode(self, data):
        """
        Decode a chunk of input by applying every entry of C{self.encodings}
        in order, after re-attaching the saved byte order mark (if any).
        """
        if "UTF-16" in self.encodings or "UCS-2" in self.encodings:
            assert not len(data) & 1, "UTF-16 must come in pairs for now"
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = str(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Pick the state that follows a completed tag.

        @return: C{"waitforendscript"} after an opening C{script} tag without
            a C{src} attribute, C{"bodydata"} otherwise.
        """
        if self.endtag:
            return "bodydata"

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == "script" and "src" not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return "waitforendscript"
        return "bodydata"

    def dataReceived(self, data):
        """
        Decode a chunk of bytes and run every character through the state
        machine.

        Each chunk is decoded on its own (UTF-8 unless a UTF-16 byte order
        mark was seen at the very start), so a chunk must not end in the
        middle of a multi-byte character.

        @param data: the bytes received.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith((b"\xff\xfe", b"\xfe\xff")):
                self._prepend = data[0:2]
                self.encodings.append("UTF-16")
                data = data[2:]
            self.state = "begin"
        if self.encodings:
            data = self._decode(data)
        else:
            data = data.decode("utf-8")
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function
        _saveMark = self.saveMark

        def saveMark():
            return (lineno, colno)

        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == "\n":
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.
        """
        if self.state is None:
            # No data was ever received, so no state was entered and there
            # is nothing to end (looking up None would raise KeyError).
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        """Skip leading whitespace and expect the first C{<}."""
        if byte.isspace():
            return
        if byte != "<":
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return "bodydata"
            self._parseError(f"First char of document [{byte!r}] wasn't <")
        return "tagstart"

    def begin_comment(self, byte):
        """Start collecting a comment."""
        self.commentbuf = ""

    def do_comment(self, byte):
        """Collect comment text until C{-->} and report it."""
        self.commentbuf += byte
        if self.commentbuf.endswith("-->"):
            self.gotComment(self.commentbuf[:-3])
            return "bodydata"

    def begin_tagstart(self, byte):
        """Reset the per-tag bookkeeping."""
        self.tagName = ""  # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0  # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """Collect the tag name and decide what kind of tag this is."""
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == "!--":
                return "comment"
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return "waitforgt"
                return "attrs"
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == ">":
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            else:
                self.gotTagStart(self.tagName, {})
                if not self.beExtremelyLenient:
                    return "bodydata"
                return self.maybeBodyData()
        elif byte == "/":
            if self.tagName:
                return "afterslash"
            else:
                self.endtag = 1
        elif byte in "!?":
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == "[":
            if self.tagName == "!":
                return "expectcdata"
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                self.bodydata = "<"
                return "unentity"
            self._parseError("Invalid tag character: %r" % byte)

    def begin_unentity(self, byte):
        """Keep the character that turned out not to start a tag."""
        self.bodydata += byte

    def do_unentity(self, byte):
        """Keep one more character, then go back to body data."""
        self.bodydata += byte
        return "bodydata"

    def end_unentity(self):
        """Report the text collected for the stray C{<}."""
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        """Start matching the C{[CDATA[} header."""
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """Match the input against the C{[CDATA[} header."""
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = "[CDATA["
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return "waitforgt"
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ""
            return "cdata"
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """Collect CDATA content until C{]]>}."""
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return "bodydata"

    def end_cdata(self):
        """Report the collected CDATA section."""
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ""

    def do_attrs(self, byte):
        """Between attributes: find the next attribute or the tag end."""
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == "!DOCTYPE":
                return "doctype"
            if self.tagName[0] in "!?":
                return "waitforgt"
            return "attrname"
        elif byte.isspace():
            return
        elif byte == ">":
            self.gotTagStart(self.tagName, self.tagAttributes)
            if not self.beExtremelyLenient:
                return "bodydata"
            return self.maybeBodyData()
        elif byte == "/":
            return "afterslash"
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        """Start collecting the DOCTYPE declaration."""
        self.doctype = byte

    def do_doctype(self, byte):
        """Collect DOCTYPE text up to the closing C{>}."""
        if byte == ">":
            return "bodydata"
        self.doctype += byte

    def end_doctype(self):
        """Report the collected DOCTYPE declaration."""
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """Ignore everything up to the closing C{>}."""
        if byte == ">":
            if self.endtag or not self.beExtremelyLenient:
                return "bodydata"
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        """Start a new attribute name with its first character."""
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """Collect an attribute name."""
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return "beforeeq"
        elif self.beExtremelyLenient:
            if byte in "\"'":
                return "attrval"
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            # NOTE(review): "/" is part of lenientIdentChars, so it is
            # consumed by the branch above and this one looks unreachable.
            # Left as is because changing it would alter parsing results.
            if byte == "/":
                self._attrname_termtag = 1
                return
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError(f"Invalid attribute name: {self.attrname!r} {byte!r}")

    def do_beforeattrval(self, byte):
        """After C{=}: expect the opening quote of the attribute value."""
        if byte in "\"'":
            return "attrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return "messyattr"
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == "\\":
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError(
            "Invalid initial attribute value: %r; Attribute values must be quoted."
            % byte
        )

    attrname = ""
    attrval = ""

    def begin_beforeeq(self, byte):
        """Whitespace followed an attribute name; no C{/} seen yet."""
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """Between an attribute name and its C{=}."""
        if byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                # A value-less attribute followed by another attribute.
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                return "attrname"
            elif byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            elif byte == "/":
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        """Remember the opening quote and start an empty value."""
        self.quotetype = byte
        self.attrval = ""

    def do_attrval(self, byte):
        """Collect a quoted attribute value up to the matching quote."""
        if byte == self.quotetype:
            return "attrs"
        self.attrval += byte

    def end_attrval(self):
        """Store the finished attribute and reset the buffers."""
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ""

    def begin_messyattr(self, byte):
        """Start an unquoted attribute value (lenient mode)."""
        self.attrval = byte

    def do_messyattr(self, byte):
        """Collect an unquoted attribute value up to whitespace or C{>}."""
        if byte.isspace():
            return "attrs"
        elif byte == ">":
            endTag = 0
            if self.attrval.endswith("/"):
                # A trailing slash makes the tag self-terminating.
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        """Store the unquoted value if it is not empty."""
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        """A C{/} was seen inside a tag; the tag is not closed yet."""
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        """Expect the C{>} that completes a self-terminating tag."""
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != ">":
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return "bodydata"

    def begin_bodydata(self, byte):
        """Start a text run, seeded with any byte read ahead of time."""
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # Drop the instance value; the class-level None shows through.
            del self._leadingBodyData
        else:
            self.bodydata = ""

    def do_bodydata(self, byte):
        """Collect text until a tag or an entity reference starts."""
        if byte == "<":
            return "tagstart"
        if byte == "&":
            return "entityref"
        self.bodydata += byte

    def end_bodydata(self):
        """Report the collected text."""
        self.gotText(self.bodydata)
        self.bodydata = ""

    def do_waitforendscript(self, byte):
        """Collect script text; any C{<} may start the closing tag."""
        if byte == "<":
            return "waitscriptendtag"
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        """Start checking whether a C{</script>} follows."""
        self.temptagdata = ""
        self.tagName = ""
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        """Recognise C{</script>}, or give the characters back as text."""
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == "/":
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not "script".startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return "waitforendscript"
            elif self.tagName == "script":
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return "waitforgt"
        # 3
        elif byte.isspace():
            return "waitscriptendtag"
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"

    def begin_entityref(self, byte):
        """Start collecting an entity reference."""
        self.erefbuf = ""
        self.erefextra = ""  # extra bit for lenient mode

    def do_entityref(self, byte):
        """Collect an entity name up to the terminating C{;}."""
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return "spacebodydata"
            self._parseError("Bad entity reference")
        elif byte != ";":
            self.erefbuf += byte
        else:
            return "bodydata"

    def end_entityref(self):
        """Report the collected entity reference."""
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        """Start a text run with the characters that followed a bare C{&}."""
        self.bodydata = self.erefextra
        self.erefextra = None

    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        """Encountered an opening tag.

        Default behaviour is to print."""
        print("begin", name, attributes)

    def gotText(self, data):
        """Encountered text

        Default behaviour is to print."""
        print("text:", repr(data))

    def gotEntityReference(self, entityRef):
        """Encountered mnemonic entity reference

        Default behaviour is to print."""
        print("entityRef: &%s;" % entityRef)

    def gotComment(self, comment):
        """Encountered comment.

        Default behaviour is to ignore."""
        pass

    def gotCData(self, cdata):
        """Encountered CDATA

        Default behaviour is to call the gotText method"""
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print("!DOCTYPE", repr(doctype))

    def gotTagEnd(self, name):
        """Encountered closing tag

        Default behaviour is to print."""
        print("end", name)