aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/python/Twisted/py3/twisted/web/sux.py
diff options
context:
space:
mode:
authorshmel1k <shmel1k@ydb.tech>2023-11-26 18:16:14 +0300
committershmel1k <shmel1k@ydb.tech>2023-11-26 18:43:30 +0300
commitb8cf9e88f4c5c64d9406af533d8948deb050d695 (patch)
tree218eb61fb3c3b96ec08b4d8cdfef383104a87d63 /contrib/python/Twisted/py3/twisted/web/sux.py
parent523f645a83a0ec97a0332dbc3863bb354c92a328 (diff)
downloadydb-b8cf9e88f4c5c64d9406af533d8948deb050d695.tar.gz
add kikimr_configure
Diffstat (limited to 'contrib/python/Twisted/py3/twisted/web/sux.py')
-rw-r--r--contrib/python/Twisted/py3/twisted/web/sux.py644
1 files changed, 644 insertions, 0 deletions
diff --git a/contrib/python/Twisted/py3/twisted/web/sux.py b/contrib/python/Twisted/py3/twisted/web/sux.py
new file mode 100644
index 0000000000..69ad4dff95
--- /dev/null
+++ b/contrib/python/Twisted/py3/twisted/web/sux.py
@@ -0,0 +1,644 @@
+# -*- test-case-name: twisted.web.test.test_xml -*-
+#
+# Copyright (c) Twisted Matrix Laboratories.
+# See LICENSE for details.
+
+
+"""
+*S*mall, *U*ncomplicated *X*ML.
+
+This is a very simple implementation of XML/HTML as a network
+protocol. It is not at all clever. Its main features are that it
+does not:
+
+ - support namespaces
+ - mung mnemonic entity references
+ - validate
+ - perform *any* external actions (such as fetching URLs or writing files)
+ under *any* circumstances
+
+It does, however, have lots and lots of horrible hacks for supporting broken
+HTML (as an option; they're not on by default).
+"""
+
+
+from twisted.internet.protocol import Protocol
+from twisted.python.reflect import prefixedMethodNames
+
# Index of each handler kind inside a state-table entry, which is a
# three-tuple of (begin, do, end) callables.
BEGIN_HANDLER, DO_HANDLER, END_HANDLER = range(3)

# Punctuation accepted, alongside alphanumerics, in tag and attribute names.
identChars = ".-_:"
# Extra punctuation tolerated in attribute names and unquoted attribute values
# when the parser runs with beExtremelyLenient set.
lenientIdentChars = identChars + ";+#/%~"
+
+
def nop(*args, **kw):
    """
    Accept any positional and keyword arguments and do nothing.
    """
    return None
+
+
def unionlist(*args):
    """
    Collect the distinct items found in any of the given iterables.

    The result is a dictionary keys view, so each item appears once, in the
    order it was first seen. Items must be hashable.
    """
    seen = {}
    for items in args:
        for item in items:
            seen[item] = 1
    return seen.keys()
+
+
def zipfndict(*args, **kw):
    """
    Zip several dictionaries together key by key.

    Every key present in at least one of the dictionaries maps to a tuple
    holding, for each dictionary in argument order, that dictionary's value
    for the key. Where a dictionary lacks the key, the C{default} keyword
    argument is used instead (L{nop} when not given).
    """
    fallback = kw.get("default", nop)
    names = unionlist(*(table.keys() for table in args))
    return {
        name: tuple(table.get(name, fallback) for table in args) for name in names
    }
+
+
def prefixedMethodClassDict(clazz, prefix):
    """
    Map un-prefixed method names to the attributes found on a class.

    For each name reported by C{prefixedMethodNames(clazz, prefix)}, the
    attribute C{prefix + name} is looked up on C{clazz} itself.
    """
    methods = {}
    for name in prefixedMethodNames(clazz, prefix):
        methods[name] = getattr(clazz, prefix + name)
    return methods
+
+
def prefixedMethodObjDict(obj, prefix):
    """
    Map un-prefixed method names to the attributes found on an instance.

    The names come from C{prefixedMethodNames} applied to the object's class;
    each attribute C{prefix + name} is then looked up on C{obj}, so the values
    are bound to that instance.
    """
    methods = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        methods[name] = getattr(obj, prefix + name)
    return methods
+
+
class ParseError(Exception):
    """
    Error raised for input the parser refuses to accept.

    The instance records where the problem was found (C{filename}, C{line},
    C{col}) and a human readable C{message}; its string form is
    C{filename:line:col: message}.
    """

    def __init__(self, filename, line, col, message):
        self.filename, self.line = filename, line
        self.col, self.message = col, message

    def __str__(self) -> str:
        return "{}:{}:{}: {}".format(
            self.filename, self.line, self.col, self.message
        )
+
+
class XMLParser(Protocol):
    """
    A push-style XML/HTML tokenizer driven one character at a time.

    Parsing is a table-driven state machine. A state named C{foo} may define
    up to three methods:

      - C{begin_foo(byte)}: called when the state is entered, with the
        character that caused the transition;
      - C{do_foo(byte)}: called for every following character; it returns the
        name of the state to switch to, or L{None} to stay in C{foo};
      - C{end_foo()}: called when the state is left.

    Handlers a state does not define default to L{nop}. Parse results are
    reported through the C{got*} methods at the bottom of the class, which
    subclasses are expected to override.
    """

    # Name of the current state; None until the first data arrives.
    state = None
    # Codec names applied to incoming data; replaced by a fresh list in
    # connectionMade.
    encodings = None
    # Name reported in ParseError positions.
    filename = "<xml />"
    # When true, many kinds of broken HTML are tolerated instead of raising
    # ParseError.
    beExtremelyLenient = 0
    # Byte-order mark stripped from the first chunk; re-attached to every
    # chunk before decoding.
    _prepend = None
    # Per-instance cache of the state table (see _buildStateTable).
    _stateTable = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    def connectionMade(self):
        """
        Reset the position counters and the list of encodings.
        """
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        """Get the line number and column of the last character parsed"""
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """
        Raise L{ParseError} for C{message} at the current file position.
        """
        raise ParseError(*((self.filename,) + self.saveMark() + (message,)))

    def _buildStateTable(self):
        """
        Return a dictionary of begin, do, end state function tuples.

        Keys are state names; values are C{(begin, do, end)} three-tuples of
        methods bound to this instance, with L{nop} filling in for handlers
        the state does not define.

        The table is built once per instance and then reused. The previous
        implementation tried to cache on the class, but read the attribute
        under the literal name C{"__stateTable"} while writing it under the
        name-mangled C{_XMLParser__stateTable}. The cache therefore never hit,
        and the table was rebuilt on every call. A class-wide cache would be
        wrong in any case, because the entries are bound to one instance.
        """
        stateTable = self._stateTable
        if stateTable is None:
            stateTable = self._stateTable = zipfndict(
                *(
                    prefixedMethodObjDict(self, prefix)
                    for prefix in ("begin_", "do_", "end_")
                )
            )
        return stateTable

    def _decode(self, data):
        """
        Decode a chunk of bytes using the encodings recorded so far.

        The saved byte-order mark, if any, is put back in front of the chunk
        first.
        """
        if "UTF-16" in self.encodings or "UCS-2" in self.encodings:
            assert not len(data) & 1, "UTF-16 must come in pairs for now"
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = str(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Pick the state that follows a completed tag in lenient mode.

        Returns C{"waitforendscript"} after an opening C{<script>} tag without
        a C{src} attribute, and C{"bodydata"} otherwise.
        """
        if self.endtag:
            return "bodydata"

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == "script" and "src" not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return "waitforendscript"
        return "bodydata"

    def dataReceived(self, data):
        """
        Decode a chunk of bytes and run every character through the state
        machine.

        On the first chunk a UTF-16 byte-order mark, if present, is stripped
        and remembered. Line and column counters are kept in local variables
        while the chunk is processed and written back afterwards.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith((b"\xff\xfe", b"\xfe\xff")):
                self._prepend = data[0:2]
                self.encodings.append("UTF-16")
                data = data[2:]
            self.state = "begin"
        if self.encodings:
            data = self._decode(data)
        else:
            # NOTE(review): each chunk is decoded on its own, so a multi-byte
            # UTF-8 sequence split across two calls raises UnicodeDecodeError.
            data = data.decode("utf-8")
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function
        _saveMark = self.saveMark

        def saveMark():
            return (lineno, colno)

        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == "\n":
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.

        If no data was ever received there is no current state, so there is
        nothing to end. Previously this case raised C{KeyError}.
        """
        if not self.state:
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        """
        Skip leading whitespace and expect the document to open with C{<}.
        """
        if byte.isspace():
            return
        if byte != "<":
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return "bodydata"
            self._parseError(f"First char of document [{byte!r}] wasn't <")
        return "tagstart"

    def begin_comment(self, byte):
        """Start collecting the text of a comment."""
        self.commentbuf = ""

    def do_comment(self, byte):
        """Accumulate comment text until the closing C{-->} is seen."""
        self.commentbuf += byte
        if self.commentbuf.endswith("-->"):
            self.gotComment(self.commentbuf[:-3])
            return "bodydata"

    def begin_tagstart(self, byte):
        """Reset the per-tag bookkeeping when a C{<} is seen."""
        self.tagName = ""  # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0  # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """
        Read the tag name and decide what kind of construct follows the C{<}.
        """
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == "!--":
                return "comment"
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return "waitforgt"
                return "attrs"
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == ">":
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            else:
                self.gotTagStart(self.tagName, {})
                return (
                    self.maybeBodyData() if self.beExtremelyLenient else "bodydata"
                )
        elif byte == "/":
            if self.tagName:
                return "afterslash"
            else:
                self.endtag = 1
        elif byte in "!?":
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == "[":
            if self.tagName == "!":
                return "expectcdata"
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                self.bodydata = "<"
                return "unentity"
            self._parseError("Invalid tag character: %r" % byte)

    def begin_unentity(self, byte):
        """Keep the character that turned out not to start a tag."""
        self.bodydata += byte

    def do_unentity(self, byte):
        """Take one more character as text and go back to body data."""
        self.bodydata += byte
        return "bodydata"

    def end_unentity(self):
        """Report the stray C{<} and the characters after it as text."""
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        """Start matching the C{[CDATA[} header."""
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """
        Match the C{[CDATA[} header character by character.
        """
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = "[CDATA["
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return "waitforgt"
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ""
            return "cdata"
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """Accumulate CDATA content until the closing C{]]>} is seen."""
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return "bodydata"

    def end_cdata(self):
        """Report the collected CDATA section."""
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ""

    def do_attrs(self, byte):
        """
        Between attributes: find the next attribute name or the end of the
        tag.
        """
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == "!DOCTYPE":
                return "doctype"
            if self.tagName[0] in "!?":
                return "waitforgt"
            return "attrname"
        elif byte.isspace():
            return
        elif byte == ">":
            self.gotTagStart(self.tagName, self.tagAttributes)
            return self.maybeBodyData() if self.beExtremelyLenient else "bodydata"
        elif byte == "/":
            return "afterslash"
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        """Start collecting the DOCTYPE text."""
        self.doctype = byte

    def do_doctype(self, byte):
        """Accumulate DOCTYPE text up to the closing C{>}."""
        if byte == ">":
            return "bodydata"
        self.doctype += byte

    def end_doctype(self):
        """Report the collected DOCTYPE text."""
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """Ignore everything up to the next C{>}."""
        if byte == ">":
            if self.endtag or not self.beExtremelyLenient:
                return "bodydata"
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        """Start a new attribute name with its first character."""
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """
        Accumulate an attribute name until C{=}, whitespace or, in lenient
        mode, one of several malformed continuations.
        """
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return "beforeeq"
        elif self.beExtremelyLenient:
            if byte in "\"'":
                return "attrval"
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            if byte == "/":
                self._attrname_termtag = 1
                return
            if byte == ">":
                # a value-less attribute is recorded with the value "True"
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError(f"Invalid attribute name: {self.attrname!r} {byte!r}")

    def do_beforeattrval(self, byte):
        """
        After C{=}: expect the opening quote of the attribute value.
        """
        if byte in "\"'":
            return "attrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return "messyattr"
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == "\\":
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError(
            "Invalid initial attribute value: %r; Attribute values must be quoted."
            % byte
        )

    attrname = ""
    attrval = ""

    def begin_beforeeq(self, byte):
        """Whitespace followed an attribute name; no C{/} seen yet."""
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """
        After an attribute name and whitespace: expect C{=}.
        """
        if byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                return "attrname"
            elif byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            elif byte == "/":
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        """Remember which quote character opened the value."""
        self.quotetype = byte
        self.attrval = ""

    def do_attrval(self, byte):
        """Accumulate the value until the matching quote is seen."""
        if byte == self.quotetype:
            return "attrs"
        self.attrval += byte

    def end_attrval(self):
        """Store the finished attribute and reset the buffers."""
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ""

    def begin_messyattr(self, byte):
        """Start an unquoted attribute value (lenient mode)."""
        self.attrval = byte

    def do_messyattr(self, byte):
        """
        Accumulate an unquoted attribute value up to whitespace or C{>}.
        """
        if byte.isspace():
            return "attrs"
        elif byte == ">":
            endTag = 0
            if self.attrval.endswith("/"):
                # a trailing slash makes the tag self-terminating
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        """Store the unquoted value if anything was collected."""
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        """A C{/} was seen inside a tag; the tag is not closed yet."""
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        """
        Expect the C{>} that completes a self-terminating tag.
        """
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != ">":
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return "bodydata"

    def begin_bodydata(self, byte):
        """
        Start a run of text, seeded with any character read ahead earlier.
        """
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # drop the instance attribute so the class default (None) shows
            # through again
            del self._leadingBodyData
        else:
            self.bodydata = ""

    def do_bodydata(self, byte):
        """Accumulate text until a tag or entity reference starts."""
        if byte == "<":
            return "tagstart"
        if byte == "&":
            return "entityref"
        self.bodydata += byte

    def end_bodydata(self):
        """Report the collected text."""
        self.gotText(self.bodydata)
        self.bodydata = ""

    def do_waitforendscript(self, byte):
        """Accumulate script content until a possible end tag starts."""
        if byte == "<":
            return "waitscriptendtag"
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        """Reset the buffers used to recognise C{</script>}."""
        self.temptagdata = ""
        self.tagName = ""
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        """
        Decide whether the C{<} just seen inside a script starts C{</script>}.
        """
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == "/":
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not "script".startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return "waitforendscript"
            elif self.tagName == "script":
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return "waitforgt"
        # 3
        elif byte.isspace():
            return "waitscriptendtag"
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"

    def begin_entityref(self, byte):
        """Start collecting the name of an entity reference."""
        self.erefbuf = ""
        self.erefextra = ""  # extra bit for lenient mode

    def do_entityref(self, byte):
        """
        Accumulate an entity name up to the terminating C{;}.
        """
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return "spacebodydata"
            self._parseError("Bad entity reference")
        elif byte != ";":
            self.erefbuf += byte
        else:
            return "bodydata"

    def end_entityref(self):
        """Report the collected entity reference."""
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        """Resume body text with the characters that followed a bare C{&}."""
        self.bodydata = self.erefextra
        self.erefextra = None

    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        """Encountered an opening tag.

        Default behaviour is to print."""
        print("begin", name, attributes)

    def gotText(self, data):
        """Encountered text

        Default behaviour is to print."""
        print("text:", repr(data))

    def gotEntityReference(self, entityRef):
        """Encountered mnemonic entity reference

        Default behaviour is to print."""
        print("entityRef: &%s;" % entityRef)

    def gotComment(self, comment):
        """Encountered comment.

        Default behaviour is to ignore."""
        pass

    def gotCData(self, cdata):
        """Encountered CDATA

        Default behaviour is to call the gotText method"""
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print("!DOCTYPE", repr(doctype))

    def gotTagEnd(self, name):
        """Encountered closing tag

        Default behaviour is to print."""
        print("end", name)