Update libxml to 2.9.13

ref:f572491d236694e847142c36f0f5546c649e05d7
author: setser <setser@yandex-team.ru> 2022-05-09 00:13:37 +0300
committer: setser <setser@yandex-team.ru> 2022-05-09 00:13:37 +0300
commit: e87e3fc8d0e04eb7ba3eee221bb91613b527ad85 (patch)
tree: 5279c128bdbdf902b9a08d9fae8e55b91910a553 /contrib/libs/libxml/HTMLparser.c
parent: f4f3e4024a1f32bd0bc3fa20239025a1b179e42d (diff)
download: ydb-e87e3fc8d0e04eb7ba3eee221bb91613b527ad85.tar.gz
1 files changed, 657 insertions, 551 deletions
diff --git a/contrib/libs/libxml/HTMLparser.c b/contrib/libs/libxml/HTMLparser.c
index 7b6d68961c..3e8a165740 100644
--- a/contrib/libs/libxml/HTMLparser.c
+++ b/contrib/libs/libxml/HTMLparser.c
@@ -69,7 +69,7 @@ static void htmlParseComment(htmlParserCtxtPtr ctxt);
 /**
  * htmlErrMemory:
  * @ctxt:  an HTML parser context
- * @extra:  extra informations
+ * @extra:  extra information
  *
  * Handle a redefinition of attribute error
  */
@@ -296,7 +296,7 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 
 #define UPPER (toupper(*ctxt->input->cur))
 
-#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
+#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
 
 #define NXT(val) ctxt->input->cur[(val)]
 
@@ -330,7 +330,7 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
     if (*(ctxt->input->cur) == '\n') {					\
 	ctxt->input->line++; ctxt->input->col = 1;			\
     } else ctxt->input->col++;						\
-    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
+    ctxt->token = 0; ctxt->input->cur += l;				\
   } while (0)
 
 /************
@@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 
 static int
 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
+    const unsigned char *cur;
+    unsigned char c;
+    unsigned int val;
+
     if (ctxt->instate == XML_PARSER_EOF)
 	return(0);
 
@@ -421,99 +425,29 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 	*len = 0;
 	return(ctxt->token);
     }
-    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
-	/*
-	 * We are supposed to handle UTF8, check it's valid
-	 * From rfc2044: encoding of the Unicode values on UTF-8:
-	 *
-	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
-	 * 0000 0000-0000 007F   0xxxxxxx
-	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
-	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
-	 *
-	 * Check for the 0x110000 limit too
-	 */
-	const unsigned char *cur = ctxt->input->cur;
-	unsigned char c;
-	unsigned int val;
-
-	c = *cur;
-	if (c & 0x80) {
-	    if (cur[1] == 0) {
-		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-                cur = ctxt->input->cur;
-            }
-	    if ((cur[1] & 0xc0) != 0x80)
-		goto encoding_error;
-	    if ((c & 0xe0) == 0xe0) {
+    if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
+        xmlChar * guess;
+        xmlCharEncodingHandlerPtr handler;
 
-		if (cur[2] == 0) {
-		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-                    cur = ctxt->input->cur;
-                }
-		if ((cur[2] & 0xc0) != 0x80)
-		    goto encoding_error;
-		if ((c & 0xf0) == 0xf0) {
-		    if (cur[3] == 0) {
-			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-                        cur = ctxt->input->cur;
-                    }
-		    if (((c & 0xf8) != 0xf0) ||
-			((cur[3] & 0xc0) != 0x80))
-			goto encoding_error;
-		    /* 4-byte code */
-		    *len = 4;
-		    val = (cur[0] & 0x7) << 18;
-		    val |= (cur[1] & 0x3f) << 12;
-		    val |= (cur[2] & 0x3f) << 6;
-		    val |= cur[3] & 0x3f;
-		} else {
-		  /* 3-byte code */
-		    *len = 3;
-		    val = (cur[0] & 0xf) << 12;
-		    val |= (cur[1] & 0x3f) << 6;
-		    val |= cur[2] & 0x3f;
-		}
-	    } else {
-	      /* 2-byte code */
-		*len = 2;
-		val = (cur[0] & 0x1f) << 6;
-		val |= cur[1] & 0x3f;
-	    }
-	    if (!IS_CHAR(val)) {
-	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
-				"Char 0x%X out of allowed range\n", val);
-	    }
-	    return(val);
-	} else {
+        /*
+         * Assume it's a fixed length encoding (1) with
+         * a compatible encoding for the ASCII set, since
+         * HTML constructs only use < 128 chars
+         */
+        if ((int) *ctxt->input->cur < 0x80) {
+            *len = 1;
             if ((*ctxt->input->cur == 0) &&
                 (ctxt->input->cur < ctxt->input->end)) {
-                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
-				"Char 0x%X out of allowed range\n", 0);
-                *len = 1;
+                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                                "Char 0x%X out of allowed range\n", 0);
                 return(' ');
             }
-	    /* 1-byte code */
-	    *len = 1;
-	    return((int) *ctxt->input->cur);
-	}
-    }
-    /*
-     * Assume it's a fixed length encoding (1) with
-     * a compatible encoding for the ASCII set, since
-     * XML constructs only use < 128 chars
-     */
-    *len = 1;
-    if ((int) *ctxt->input->cur < 0x80)
-	return((int) *ctxt->input->cur);
-
-    /*
-     * Humm this is bad, do an automatic flow conversion
-     */
-    {
-        xmlChar * guess;
-        xmlCharEncodingHandlerPtr handler;
+            return((int) *ctxt->input->cur);
+        }
 
+        /*
+         * Humm this is bad, do an automatic flow conversion
+         */
         guess = htmlFindEncoding(ctxt);
         if (guess == NULL) {
             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
@@ -523,7 +457,12 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
             ctxt->input->encoding = guess;
             handler = xmlFindCharEncodingHandler((const char *) guess);
             if (handler != NULL) {
-                xmlSwitchToEncoding(ctxt, handler);
+                /*
+                 * Don't use UTF-8 encoder which isn't required and
+                 * can produce invalid UTF-8.
+                 */
+                if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
+                    xmlSwitchToEncoding(ctxt, handler);
             } else {
                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                              "Unsupported encoding %s", guess, NULL);
@@ -532,7 +471,86 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
         ctxt->charset = XML_CHAR_ENCODING_UTF8;
     }
 
-    return(xmlCurrentChar(ctxt, len));
+    /*
+     * We are supposed to handle UTF8, check it's valid
+     * From rfc2044: encoding of the Unicode values on UTF-8:
+     *
+     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
+     * 0000 0000-0000 007F   0xxxxxxx
+     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
+     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
+     *
+     * Check for the 0x110000 limit too
+     */
+    cur = ctxt->input->cur;
+    c = *cur;
+    if (c & 0x80) {
+        if ((c & 0x40) == 0)
+            goto encoding_error;
+        if (cur[1] == 0) {
+            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+            cur = ctxt->input->cur;
+        }
+        if ((cur[1] & 0xc0) != 0x80)
+            goto encoding_error;
+        if ((c & 0xe0) == 0xe0) {
+
+            if (cur[2] == 0) {
+                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+                cur = ctxt->input->cur;
+            }
+            if ((cur[2] & 0xc0) != 0x80)
+                goto encoding_error;
+            if ((c & 0xf0) == 0xf0) {
+                if (cur[3] == 0) {
+                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+                    cur = ctxt->input->cur;
+                }
+                if (((c & 0xf8) != 0xf0) ||
+                    ((cur[3] & 0xc0) != 0x80))
+                    goto encoding_error;
+                /* 4-byte code */
+                *len = 4;
+                val = (cur[0] & 0x7) << 18;
+                val |= (cur[1] & 0x3f) << 12;
+                val |= (cur[2] & 0x3f) << 6;
+                val |= cur[3] & 0x3f;
+                if (val < 0x10000)
+                    goto encoding_error;
+            } else {
+              /* 3-byte code */
+                *len = 3;
+                val = (cur[0] & 0xf) << 12;
+                val |= (cur[1] & 0x3f) << 6;
+                val |= cur[2] & 0x3f;
+                if (val < 0x800)
+                    goto encoding_error;
+            }
+        } else {
+          /* 2-byte code */
+            *len = 2;
+            val = (cur[0] & 0x1f) << 6;
+            val |= cur[1] & 0x3f;
+            if (val < 0x80)
+                goto encoding_error;
+        }
+        if (!IS_CHAR(val)) {
+            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Char 0x%X out of allowed range\n", val);
+        }
+        return(val);
+    } else {
+        if ((*ctxt->input->cur == 0) &&
+            (ctxt->input->cur < ctxt->input->end)) {
+            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Char 0x%X out of allowed range\n", 0);
+            *len = 1;
+            return(' ');
+        }
+        /* 1-byte code */
+        *len = 1;
+        return((int) *ctxt->input->cur);
+    }
 
 encoding_error:
     /*
@@ -557,7 +575,16 @@ encoding_error:
 		     BAD_CAST buffer, NULL);
     }
 
-    ctxt->charset = XML_CHAR_ENCODING_8859_1;
+    /*
+     * Don't switch encodings twice. Note that if there's an encoder, we
+     * shouldn't receive invalid UTF-8 anyway.
+     *
+     * Note that if ctxt->input->buf == NULL, switching encodings is
+     * impossible, see Gitlab issue #34.
+     */
+    if ((ctxt->input->buf != NULL) &&
+        (ctxt->input->buf->encoder == NULL))
+        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
     *len = 1;
     return((int) *ctxt->input->cur);
 }
@@ -584,7 +611,6 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 		ctxt->input->line++; ctxt->input->col = 1;
 	    } else ctxt->input->col++;
 	    ctxt->input->cur++;
-	    ctxt->nbChars++;
 	    if (*ctxt->input->cur == 0)
 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 	}
@@ -1046,102 +1072,266 @@ html40ElementTable[] = {
 }
 };
 
+typedef struct {
+    const char *oldTag;
+    const char *newTag;
+} htmlStartCloseEntry;
+
 /*
  * start tags that imply the end of current element
  */
-static const char * const htmlStartClose[] = {
-"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
-		"dl", "ul", "ol", "menu", "dir", "address", "pre",
-		"listing", "xmp", "head", NULL,
-"head",		"p", NULL,
-"title",	"p", NULL,
-"body",		"head", "style", "link", "title", "p", NULL,
-"frameset",	"head", "style", "link", "title", "p", NULL,
-"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
-		"pre", "listing", "xmp", "head", "li", NULL,
-"hr",		"p", "head", NULL,
-"h1",		"p", "head", NULL,
-"h2",		"p", "head", NULL,
-"h3",		"p", "head", NULL,
-"h4",		"p", "head", NULL,
-"h5",		"p", "head", NULL,
-"h6",		"p", "head", NULL,
-"dir",		"p", "head", NULL,
-"address",	"p", "head", "ul", NULL,
-"pre",		"p", "head", "ul", NULL,
-"listing",	"p", "head", NULL,
-"xmp",		"p", "head", NULL,
-"blockquote",	"p", "head", NULL,
-"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
-		"xmp", "head", NULL,
-"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
-                "head", "dd", NULL,
-"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
-                "head", "dt", NULL,
-"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
-		"listing", "xmp", NULL,
-"ol",		"p", "head", "ul", NULL,
-"menu",		"p", "head", "ul", NULL,
-"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
-"div",		"p", "head", NULL,
-"noscript",	"script", NULL,
-"center",	"font", "b", "i", "p", "head", NULL,
-"a",		"a", "head", NULL,
-"caption",	"p", NULL,
-"colgroup",	"caption", "colgroup", "col", "p", NULL,
-"col",		"caption", "col", "p", NULL,
-"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
-		"listing", "xmp", "a", NULL,
-"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
-"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
-"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
-"thead",	"caption", "col", "colgroup", NULL,
-"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
-		"tbody", "p", NULL,
-"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
-		"tfoot", "tbody", "p", NULL,
-"optgroup",	"option", NULL,
-"option",	"option", NULL,
-"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
-		"pre", "listing", "xmp", "a", NULL,
-/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
-"tt",		"head", NULL,
-"i",		"head", NULL,
-"b",		"head", NULL,
-"u",		"head", NULL,
-"s",		"head", NULL,
-"strike",	"head", NULL,
-"big",		"head", NULL,
-"small",	"head", NULL,
-
-"em",		"head", NULL,
-"strong",	"head", NULL,
-"dfn",		"head", NULL,
-"code",		"head", NULL,
-"samp",		"head", NULL,
-"kbd",		"head", NULL,
-"var",		"head", NULL,
-"cite",		"head", NULL,
-"abbr",		"head", NULL,
-"acronym",	"head", NULL,
-
-/* "a" */
-"img",		"head", NULL,
-/* "applet" */
-/* "embed" */
-/* "object" */
-"font",		"head", NULL,
-/* "basefont" */
-"br",		"head", NULL,
-/* "script" */
-"map",		"head", NULL,
-"q",		"head", NULL,
-"sub",		"head", NULL,
-"sup",		"head", NULL,
-"span",		"head", NULL,
-"bdo",		"head", NULL,
-"iframe",	"head", NULL,
-NULL
+static const htmlStartCloseEntry htmlStartClose[] = {
+    { "a", "a" },
+    { "a", "fieldset" },
+    { "a", "table" },
+    { "a", "td" },
+    { "a", "th" },
+    { "address", "dd" },
+    { "address", "dl" },
+    { "address", "dt" },
+    { "address", "form" },
+    { "address", "li" },
+    { "address", "ul" },
+    { "b", "center" },
+    { "b", "p" },
+    { "b", "td" },
+    { "b", "th" },
+    { "big", "p" },
+    { "caption", "col" },
+    { "caption", "colgroup" },
+    { "caption", "tbody" },
+    { "caption", "tfoot" },
+    { "caption", "thead" },
+    { "caption", "tr" },
+    { "col", "col" },
+    { "col", "colgroup" },
+    { "col", "tbody" },
+    { "col", "tfoot" },
+    { "col", "thead" },
+    { "col", "tr" },
+    { "colgroup", "colgroup" },
+    { "colgroup", "tbody" },
+    { "colgroup", "tfoot" },
+    { "colgroup", "thead" },
+    { "colgroup", "tr" },
+    { "dd", "dt" },
+    { "dir", "dd" },
+    { "dir", "dl" },
+    { "dir", "dt" },
+    { "dir", "form" },
+    { "dir", "ul" },
+    { "dl", "form" },
+    { "dl", "li" },
+    { "dt", "dd" },
+    { "dt", "dl" },
+    { "font", "center" },
+    { "font", "td" },
+    { "font", "th" },
+    { "form", "form" },
+    { "h1", "fieldset" },
+    { "h1", "form" },
+    { "h1", "li" },
+    { "h1", "p" },
+    { "h1", "table" },
+    { "h2", "fieldset" },
+    { "h2", "form" },
+    { "h2", "li" },
+    { "h2", "p" },
+    { "h2", "table" },
+    { "h3", "fieldset" },
+    { "h3", "form" },
+    { "h3", "li" },
+    { "h3", "p" },
+    { "h3", "table" },
+    { "h4", "fieldset" },
+    { "h4", "form" },
+    { "h4", "li" },
+    { "h4", "p" },
+    { "h4", "table" },
+    { "h5", "fieldset" },
+    { "h5", "form" },
+    { "h5", "li" },
+    { "h5", "p" },
+    { "h5", "table" },
+    { "h6", "fieldset" },
+    { "h6", "form" },
+    { "h6", "li" },
+    { "h6", "p" },
+    { "h6", "table" },
+    { "head", "a" },
+    { "head", "abbr" },
+    { "head", "acronym" },
+    { "head", "address" },
+    { "head", "b" },
+    { "head", "bdo" },
+    { "head", "big" },
+    { "head", "blockquote" },
+    { "head", "body" },
+    { "head", "br" },
+    { "head", "center" },
+    { "head", "cite" },
+    { "head", "code" },
+    { "head", "dd" },
+    { "head", "dfn" },
+    { "head", "dir" },
+    { "head", "div" },
+    { "head", "dl" },
+    { "head", "dt" },
+    { "head", "em" },
+    { "head", "fieldset" },
+    { "head", "font" },
+    { "head", "form" },
+    { "head", "frameset" },
+    { "head", "h1" },
+    { "head", "h2" },
+    { "head", "h3" },
+    { "head", "h4" },
+    { "head", "h5" },
+    { "head", "h6" },
+    { "head", "hr" },
+    { "head", "i" },
+    { "head", "iframe" },
+    { "head", "img" },
+    { "head", "kbd" },
+    { "head", "li" },
+    { "head", "listing" },
+    { "head", "map" },
+    { "head", "menu" },
+    { "head", "ol" },
+    { "head", "p" },
+    { "head", "pre" },
+    { "head", "q" },
+    { "head", "s" },
+    { "head", "samp" },
+    { "head", "small" },
+    { "head", "span" },
+    { "head", "strike" },
+    { "head", "strong" },
+    { "head", "sub" },
+    { "head", "sup" },
+    { "head", "table" },
+    { "head", "tt" },
+    { "head", "u" },
+    { "head", "ul" },
+    { "head", "var" },
+    { "head", "xmp" },
+    { "hr", "form" },
+    { "i", "center" },
+    { "i", "p" },
+    { "i", "td" },
+    { "i", "th" },
+    { "legend", "fieldset" },
+    { "li", "li" },
+    { "link", "body" },
+    { "link", "frameset" },
+    { "listing", "dd" },
+    { "listing", "dl" },
+    { "listing", "dt" },
+    { "listing", "fieldset" },
+    { "listing", "form" },
+    { "listing", "li" },
+    { "listing", "table" },
+    { "listing", "ul" },
+    { "menu", "dd" },
+    { "menu", "dl" },
+    { "menu", "dt" },
+    { "menu", "form" },
+    { "menu", "ul" },
+    { "ol", "form" },
+    { "ol", "ul" },
+    { "option", "optgroup" },
+    { "option", "option" },
+    { "p", "address" },
+    { "p", "blockquote" },
+    { "p", "body" },
+    { "p", "caption" },
+    { "p", "center" },
+    { "p", "col" },
+    { "p", "colgroup" },
+    { "p", "dd" },
+    { "p", "dir" },
+    { "p", "div" },
+    { "p", "dl" },
+    { "p", "dt" },
+    { "p", "fieldset" },
+    { "p", "form" },
+    { "p", "frameset" },
+    { "p", "h1" },
+    { "p", "h2" },
+    { "p", "h3" },
+    { "p", "h4" },
+    { "p", "h5" },
+    { "p", "h6" },
+    { "p", "head" },
+    { "p", "hr" },
+    { "p", "li" },
+    { "p", "listing" },
+    { "p", "menu" },
+    { "p", "ol" },
+    { "p", "p" },
+    { "p", "pre" },
+    { "p", "table" },
+    { "p", "tbody" },
+    { "p", "td" },
+    { "p", "tfoot" },
+    { "p", "th" },
+    { "p", "title" },
+    { "p", "tr" },
+    { "p", "ul" },
+    { "p", "xmp" },
+    { "pre", "dd" },
+    { "pre", "dl" },
+    { "pre", "dt" },
+    { "pre", "fieldset" },
+    { "pre", "form" },
+    { "pre", "li" },
+    { "pre", "table" },
+    { "pre", "ul" },
+    { "s", "p" },
+    { "script", "noscript" },
+    { "small", "p" },
+    { "span", "td" },
+    { "span", "th" },
+    { "strike", "p" },
+    { "style", "body" },
+    { "style", "frameset" },
+    { "tbody", "tbody" },
+    { "tbody", "tfoot" },
+    { "td", "tbody" },
+    { "td", "td" },
+    { "td", "tfoot" },
+    { "td", "th" },
+    { "td", "tr" },
+    { "tfoot", "tbody" },
+    { "th", "tbody" },
+    { "th", "td" },
+    { "th", "tfoot" },
+    { "th", "th" },
+    { "th", "tr" },
+    { "thead", "tbody" },
+    { "thead", "tfoot" },
+    { "title", "body" },
+    { "title", "frameset" },
+    { "tr", "tbody" },
+    { "tr", "tfoot" },
+    { "tr", "tr" },
+    { "tt", "p" },
+    { "u", "p" },
+    { "u", "td" },
+    { "u", "th" },
+    { "ul", "address" },
+    { "ul", "form" },
+    { "ul", "menu" },
+    { "ul", "ol" },
+    { "ul", "pre" },
+    { "xmp", "dd" },
+    { "xmp", "dl" },
+    { "xmp", "dt" },
+    { "xmp", "fieldset" },
+    { "xmp", "form" },
+    { "xmp", "li" },
+    { "xmp", "table" },
+    { "xmp", "ul" }
 };
 
 /*
@@ -1211,9 +1401,6 @@ static const elementPriority htmlEndPriority[] = {
     {NULL,    100} /* Default priority */
 };
 
-static const char** htmlStartCloseIndex[100];
-static int htmlStartCloseIndexinitialized = 0;
-
 /************************************************************************
  *									*
  *	functions to handle HTML specific data			*
@@ -1223,24 +1410,18 @@ static int htmlStartCloseIndexinitialized = 0;
 /**
  * htmlInitAutoClose:
  *
- * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
- * This is not reentrant. Call xmlInitParser() once before processing in
- * case of use in multithreaded programs.
+ * This is a no-op now.
  */
 void
 htmlInitAutoClose(void) {
-    int indx, i = 0;
+}
 
-    if (htmlStartCloseIndexinitialized) return;
+static int
+htmlCompareTags(const void *key, const void *member) {
+    const xmlChar *tag = (const xmlChar *) key;
+    const htmlElemDesc *desc = (const htmlElemDesc *) member;
 
-    for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
-    indx = 0;
-    while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
-        htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
-	while (htmlStartClose[i] != NULL) i++;
-	i++;
-    }
-    htmlStartCloseIndexinitialized = 1;
+    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
 }
 
 /**
@@ -1253,14 +1434,12 @@ htmlInitAutoClose(void) {
  */
 const htmlElemDesc *
 htmlTagLookup(const xmlChar *tag) {
-    unsigned int i;
+    if (tag == NULL)
+        return(NULL);
 
-    for (i = 0; i < (sizeof(html40ElementTable) /
-                     sizeof(html40ElementTable[0]));i++) {
-        if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
-	    return((htmlElemDescPtr) &html40ElementTable[i]);
-    }
-    return(NULL);
+    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
+                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
+                sizeof(htmlElemDesc), htmlCompareTags));
 }
 
 /**
@@ -1281,6 +1460,19 @@ htmlGetEndPriority (const xmlChar *name) {
 }
 
 
+static int
+htmlCompareStartClose(const void *vkey, const void *member) {
+    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
+    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
+    int ret;
+
+    ret = strcmp(key->oldTag, entry->oldTag);
+    if (ret == 0)
+        ret = strcmp(key->newTag, entry->newTag);
+
+    return(ret);
+}
+
 /**
  * htmlCheckAutoClose:
  * @newtag:  The new tag name
@@ -1288,37 +1480,21 @@ htmlGetEndPriority (const xmlChar *name) {
  *
  * Checks whether the new tag is one of the registered valid tags for
  * closing old.
- * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
  *
  * Returns 0 if no, 1 if yes.
  */
 static int
 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
 {
-    int i, indx;
-    const char **closed = NULL;
-
-    if (htmlStartCloseIndexinitialized == 0)
-        htmlInitAutoClose();
-
-    /* inefficient, but not a big deal */
-    for (indx = 0; indx < 100; indx++) {
-        closed = htmlStartCloseIndex[indx];
-        if (closed == NULL)
-            return (0);
-        if (xmlStrEqual(BAD_CAST * closed, newtag))
-            break;
-    }
-
-    i = closed - htmlStartClose;
-    i++;
-    while (htmlStartClose[i] != NULL) {
-        if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
-            return (1);
-        }
-        i++;
-    }
-    return (0);
+    htmlStartCloseEntry key;
+    void *res;
+
+    key.oldTag = (const char *) oldtag;
+    key.newTag = (const char *) newtag;
+    res = bsearch(&key, htmlStartClose,
+            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
+            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
+    return(res != NULL);
 }
 
 /**
@@ -2341,6 +2517,8 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
     if ((ExternalID != NULL) ||
 	(URI != NULL))
 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
+    if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
+	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
     return(cur);
 }
 
@@ -2482,7 +2660,6 @@ htmlParseName(htmlParserCtxtPtr ctxt) {
 	    count = in - ctxt->input->cur;
 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
 	    ctxt->input->cur = in;
-	    ctxt->nbChars += count;
 	    ctxt->input->col += count;
 	    return(ret);
 	}
@@ -2789,47 +2966,39 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {
 static xmlChar *
 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
     size_t len = 0, startPosition = 0;
+    int err = 0;
+    int quote;
     xmlChar *ret = NULL;
 
-    if (CUR == '"') {
-        NEXT;
+    if ((CUR != '"') && (CUR != '\'')) {
+	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
+	             "SystemLiteral \" or ' expected\n", NULL, NULL);
+        return(NULL);
+    }
+    quote = CUR;
+    NEXT;
 
-        if (CUR_PTR < BASE_PTR)
-            return(ret);
-        startPosition = CUR_PTR - BASE_PTR;
+    if (CUR_PTR < BASE_PTR)
+        return(ret);
+    startPosition = CUR_PTR - BASE_PTR;
 
-	while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
-	    NEXT;
-	    len++;
-	}
-	if (!IS_CHAR_CH(CUR)) {
-	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
-			 "Unfinished SystemLiteral\n", NULL, NULL);
-	} else {
-	    ret = xmlStrndup((BASE_PTR+startPosition), len);
-	    NEXT;
+    while ((CUR != 0) && (CUR != quote)) {
+        /* TODO: Handle UTF-8 */
+        if (!IS_CHAR_CH(CUR)) {
+            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Invalid char in SystemLiteral 0x%X\n", CUR);
+            err = 1;
         }
-    } else if (CUR == '\'') {
         NEXT;
-
-        if (CUR_PTR < BASE_PTR)
-            return(ret);
-        startPosition = CUR_PTR - BASE_PTR;
-
-	while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
-	    NEXT;
-	    len++;
-	}
-	if (!IS_CHAR_CH(CUR)) {
-	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
-			 "Unfinished SystemLiteral\n", NULL, NULL);
-	} else {
-	    ret = xmlStrndup((BASE_PTR+startPosition), len);
-	    NEXT;
-        }
+        len++;
+    }
+    if (CUR != quote) {
+        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
+                     "Unfinished SystemLiteral\n", NULL, NULL);
     } else {
-	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
-	             " or ' expected\n", NULL, NULL);
+        NEXT;
+        if (err == 0)
+            ret = xmlStrndup((BASE_PTR+startPosition), len);
     }
 
     return(ret);
@@ -2849,51 +3018,42 @@ htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
 static xmlChar *
 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
     size_t len = 0, startPosition = 0;
+    int err = 0;
+    int quote;
     xmlChar *ret = NULL;
+
+    if ((CUR != '"') && (CUR != '\'')) {
+	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
+	             "PubidLiteral \" or ' expected\n", NULL, NULL);
+        return(NULL);
+    }
+    quote = CUR;
+    NEXT;
+
     /*
      * Name ::= (Letter | '_') (NameChar)*
      */
-    if (CUR == '"') {
-        NEXT;
-
-        if (CUR_PTR < BASE_PTR)
-            return(ret);
-        startPosition = CUR_PTR - BASE_PTR;
-
-        while (IS_PUBIDCHAR_CH(CUR)) {
-            len++;
-            NEXT;
+    if (CUR_PTR < BASE_PTR)
+        return(ret);
+    startPosition = CUR_PTR - BASE_PTR;
+
+    while ((CUR != 0) && (CUR != quote)) {
+        if (!IS_PUBIDCHAR_CH(CUR)) {
+            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Invalid char in PubidLiteral 0x%X\n", CUR);
+            err = 1;
         }
-
-	if (CUR != '"') {
-	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
-	                 "Unfinished PubidLiteral\n", NULL, NULL);
-	} else {
-	    ret = xmlStrndup((BASE_PTR + startPosition), len);
-	    NEXT;
-	}
-    } else if (CUR == '\'') {
+        len++;
         NEXT;
+    }
 
-        if (CUR_PTR < BASE_PTR)
-            return(ret);
-        startPosition = CUR_PTR - BASE_PTR;
-
-        while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
-            len++;
-            NEXT;
-        }
-
-	if (CUR != '\'') {
-	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
-	                 "Unfinished PubidLiteral\n", NULL, NULL);
-	} else {
-	    ret = xmlStrndup((BASE_PTR + startPosition), len);
-	    NEXT;
-	}
+    if (CUR != quote) {
+        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
+                     "Unfinished PubidLiteral\n", NULL, NULL);
     } else {
-	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
-	             "PubidLiteral \" or ' expected\n", NULL, NULL);
+        NEXT;
+        if (err == 0)
+            ret = xmlStrndup((BASE_PTR + startPosition), len);
     }
 
     return(ret);
@@ -2928,7 +3088,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
 
     SHRINK;
     cur = CUR_CHAR(l);
-    while (IS_CHAR_CH(cur)) {
+    while (cur != 0) {
 	if ((cur == '<') && (NXT(1) == '/')) {
             /*
              * One should break here, the specification is clear:
@@ -2959,7 +3119,12 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
                 }
             }
 	}
-	COPY_BUF(l,buf,nbchar,cur);
+        if (IS_CHAR(cur)) {
+	    COPY_BUF(l,buf,nbchar,cur);
+        } else {
+            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Invalid char in CDATA 0x%X\n", cur);
+        }
 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
             buf[nbchar] = 0;
 	    if (ctxt->sax->cdataBlock!= NULL) {
@@ -2977,14 +3142,6 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
 	cur = CUR_CHAR(l);
     }
 
-    if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
-        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
-                    "Invalid char in CDATA 0x%X\n", cur);
-        if (ctxt->input->cur < ctxt->input->end) {
-            NEXT;
-        }
-    }
-
     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
         buf[nbchar] = 0;
 	if (ctxt->sax->cdataBlock!= NULL) {
@@ -3232,7 +3389,7 @@ htmlParsePI(htmlParserCtxtPtr ctxt) {
 	    }
             SKIP_BLANKS;
 	    cur = CUR_CHAR(l);
-	    while (IS_CHAR(cur) && (cur != '>')) {
+	    while ((cur != 0) && (cur != '>')) {
 		if (len + 5 >= size) {
 		    xmlChar *tmp;
 
@@ -3251,7 +3408,13 @@ htmlParsePI(htmlParserCtxtPtr ctxt) {
 		    GROW;
 		    count = 0;
 		}
-		COPY_BUF(l,buf,len,cur);
+                if (IS_CHAR(cur)) {
+		    COPY_BUF(l,buf,len,cur);
+                } else {
+                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                                    "Invalid char in processing instruction "
+                                    "0x%X\n", cur);
+                }
 		NEXTL(l);
 		cur = CUR_CHAR(l);
 		if (cur == 0) {
@@ -3300,6 +3463,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
     int q, ql;
     int r, rl;
     int cur, l;
+    int next, nl;
     xmlParserInputState state;
 
     /*
@@ -3321,17 +3485,32 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
     len = 0;
     buf[len] = 0;
     q = CUR_CHAR(ql);
-    if (!IS_CHAR(q))
+    if (q == 0)
         goto unfinished;
     NEXTL(ql);
     r = CUR_CHAR(rl);
-    if (!IS_CHAR(r))
+    if (r == 0)
         goto unfinished;
     NEXTL(rl);
     cur = CUR_CHAR(l);
-    while (IS_CHAR(cur) &&
+    while ((cur != 0) &&
            ((cur != '>') ||
 	    (r != '-') || (q != '-'))) {
+	NEXTL(l);
+	next = CUR_CHAR(nl);
+	if (next == 0) {
+	    SHRINK;
+	    GROW;
+	    next = CUR_CHAR(nl);
+	}
+
+	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
+	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
+		       "Comment incorrectly closed by '--!>'", NULL, NULL);
+	  cur = '>';
+	  break;
+	}
+
 	if (len + 5 >= size) {
 	    xmlChar *tmp;
 
@@ -3345,21 +3524,22 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
 	    }
 	    buf = tmp;
 	}
-	COPY_BUF(ql,buf,len,q);
+        if (IS_CHAR(q)) {
+	    COPY_BUF(ql,buf,len,q);
+        } else {
+            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Invalid char in comment 0x%X\n", q);
+        }
+
 	q = r;
 	ql = rl;
 	r = cur;
 	rl = l;
-	NEXTL(l);
-	cur = CUR_CHAR(l);
-	if (cur == 0) {
-	    SHRINK;
-	    GROW;
-	    cur = CUR_CHAR(l);
-	}
+	cur = next;
+	l = nl;
     }
     buf[len] = 0;
-    if (IS_CHAR(cur)) {
+    if (cur == '>') {
         NEXT;
 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
 	    (!ctxt->disableSAX))
@@ -3400,13 +3580,16 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
         ((NXT(2) == 'x') || NXT(2) == 'X')) {
 	SKIP(3);
 	while (CUR != ';') {
-	    if ((CUR >= '0') && (CUR <= '9'))
-	        val = val * 16 + (CUR - '0');
-	    else if ((CUR >= 'a') && (CUR <= 'f'))
-	        val = val * 16 + (CUR - 'a') + 10;
-	    else if ((CUR >= 'A') && (CUR <= 'F'))
-	        val = val * 16 + (CUR - 'A') + 10;
-	    else {
+	    if ((CUR >= '0') && (CUR <= '9')) {
+                if (val < 0x110000)
+	            val = val * 16 + (CUR - '0');
+            } else if ((CUR >= 'a') && (CUR <= 'f')) {
+                if (val < 0x110000)
+	            val = val * 16 + (CUR - 'a') + 10;
+            } else if ((CUR >= 'A') && (CUR <= 'F')) {
+                if (val < 0x110000)
+	            val = val * 16 + (CUR - 'A') + 10;
+            } else {
 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
 		             "htmlParseCharRef: missing semicolon\n",
 			     NULL, NULL);
@@ -3419,9 +3602,10 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
     } else if  ((CUR == '&') && (NXT(1) == '#')) {
 	SKIP(2);
 	while (CUR != ';') {
-	    if ((CUR >= '0') && (CUR <= '9'))
-	        val = val * 10 + (CUR - '0');
-	    else {
+	    if ((CUR >= '0') && (CUR <= '9')) {
+                if (val < 0x110000)
+	            val = val * 10 + (CUR - '0');
+            } else {
 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
 		             "htmlParseCharRef: missing semicolon\n",
 			     NULL, NULL);
@@ -3440,6 +3624,9 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
      */
     if (IS_CHAR(val)) {
         return(val);
+    } else if (val >= 0x110000) {
+	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
+		     "htmlParseCharRef: value too large\n", NULL, NULL);
     } else {
 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 			"htmlParseCharRef: invalid xmlChar value %d\n",
@@ -3499,9 +3686,12 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
     if (CUR != '>') {
 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
 	             "DOCTYPE improperly terminated\n", NULL, NULL);
-        /* We shouldn't try to resynchronize ... */
+        /* Ignore bogus content */
+        while ((CUR != 0) && (CUR != '>'))
+            NEXT;
     }
-    NEXT;
+    if (CUR == '>')
+        NEXT;
 
     /*
      * Create or update the document accordingly to the DOCTYPE
@@ -3770,16 +3960,28 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
 	             "htmlParseStartTag: invalid element name\n",
 		     NULL, NULL);
+        /*
+         * The recovery code is disabled for now as it can result in
+         * quadratic behavior with the push parser. htmlParseStartTag
+         * must consume all content up to the final '>' in order to avoid
+         * rescanning for this terminator.
+         *
+         * For a proper fix in line with HTML5, htmlParseStartTag and
+         * htmlParseElement should only be called when there's an ASCII
+         * alpha character following the initial '<'. Otherwise, the '<'
+         * should be emitted as text (unless followed by '!', '/' or '?').
+         */
+#if 0
 	/* if recover preserve text on classic misconstructs */
 	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
 	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
 	    htmlParseCharDataInternal(ctxt, '<');
 	    return(-1);
 	}
-
+#endif
 
 	/* Dump the bogus tag like browsers do */
-	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
+	while ((CUR != 0) && (CUR != '>') &&
                (ctxt->instate != XML_PARSER_EOF))
 	    NEXT;
         return -1;
@@ -3835,11 +4037,9 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
      * (S Attribute)* S?
      */
     SKIP_BLANKS;
-    while ((IS_CHAR_CH(CUR)) &&
+    while ((CUR != 0) &&
            (CUR != '>') &&
 	   ((CUR != '/') || (NXT(1) != '>'))) {
-	long cons = ctxt->nbChars;
-
 	GROW;
 	attname = htmlParseAttribute(ctxt, &attvalue);
         if (attname != NULL) {
@@ -3898,7 +4098,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	        xmlFree(attvalue);
 	    /* Dump the bogus attribute string up to the next blank or
 	     * the end of the tag. */
-	    while ((IS_CHAR_CH(CUR)) &&
+	    while ((CUR != 0) &&
 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
 		   ((CUR != '/') || (NXT(1) != '>')))
 		NEXT;
@@ -3906,12 +4106,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 
 failed:
 	SKIP_BLANKS;
-        if (cons == ctxt->nbChars) {
-	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
-	                 "htmlParseStartTag: problem parsing attributes\n",
-			 NULL, NULL);
-	    break;
-	}
     }
 
     /*
@@ -3979,19 +4173,14 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
      * We should definitely be at the ending "S? '>'" part
      */
     SKIP_BLANKS;
-    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
+    if (CUR != '>') {
         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
 	             "End tag : expected '>'\n", NULL, NULL);
-	if (ctxt->recovery) {
-	    /*
-	     * We're not at the ending > !!
-	     * Error, unless in recover mode where we search forwards
-	     * until we find a >
-	     */
-	    while (CUR != '\0' && CUR != '>') NEXT;
-	    NEXT;
-	}
-    } else
+        /* Skip to next '>' */
+        while ((CUR != 0) && (CUR != '>'))
+            NEXT;
+    }
+    if (CUR == '>')
         NEXT;
 
     /*
@@ -4032,12 +4221,10 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
      * With the exception that the autoclose may have popped stuff out
      * of the stack.
      */
-    if (!xmlStrEqual(name, ctxt->name)) {
-        if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
-            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
-	                 "Opening and ending tag mismatch: %s and %s\n",
-			 name, ctxt->name);
-        }
+    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
+        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
+                     "Opening and ending tag mismatch: %s and %s\n",
+                     name, ctxt->name);
     }
 
     /*
@@ -4152,8 +4339,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
     currentNode = xmlStrdup(ctxt->name);
     depth = ctxt->nameNr;
     while (1) {
-	long cons = ctxt->nbChars;
-
         GROW;
 
         if (ctxt->instate == XML_PARSER_EOF)
@@ -4181,7 +4366,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
 			 "htmlParseStartTag: invalid element name\n",
 			 NULL, NULL);
 	        /* Dump the bogus tag like browsers do */
-        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+                while ((CUR != 0) && (CUR != '>'))
 	            NEXT;
 
 	        if (currentNode != NULL)
@@ -4273,15 +4458,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
 	    else {
 		htmlParseCharData(ctxt);
 	    }
-
-	    if (cons == ctxt->nbChars) {
-		if (ctxt->node != NULL) {
-		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
-		                 "detected an error in element content\n",
-				 NULL, NULL);
-		}
-		break;
-	    }
 	}
         GROW;
     }
@@ -4396,7 +4572,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
      */
     currentNode = xmlStrdup(ctxt->name);
     depth = ctxt->nameNr;
-    while (IS_CHAR_CH(CUR)) {
+    while (CUR != 0) {
 	oldptr = ctxt->input->cur;
 	htmlParseContent(ctxt);
 	if (oldptr==ctxt->input->cur) break;
@@ -4413,7 +4589,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
        node_info.node = ctxt->node;
        xmlParserAddNodeInfo(ctxt, &node_info);
     }
-    if (!IS_CHAR_CH(CUR)) {
+    if (CUR == 0) {
 	htmlAutoCloseOnEnd(ctxt);
     }
 
@@ -4434,7 +4610,7 @@ htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
        htmlNodeInfoPop(ctxt);
     }
-    if (!IS_CHAR_CH(CUR)) {
+    if (CUR == 0) {
        htmlAutoCloseOnEnd(ctxt);
     }
 }
@@ -4552,8 +4728,6 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
     currentNode = xmlStrdup(ctxt->name);
     depth = ctxt->nameNr;
     while (1) {
-	long cons = ctxt->nbChars;
-
         GROW;
 
         if (ctxt->instate == XML_PARSER_EOF)
@@ -4583,7 +4757,7 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
 			 "htmlParseStartTag: invalid element name\n",
 			 NULL, NULL);
 	        /* Dump the bogus tag like browsers do */
-	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+	        while ((CUR == 0) && (CUR != '>'))
 	            NEXT;
 
 	        htmlParserFinishElementParsing(ctxt);
@@ -4687,15 +4861,6 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
 	    else {
 		htmlParseCharData(ctxt);
 	    }
-
-	    if (cons == ctxt->nbChars) {
-		if (ctxt->node != NULL) {
-		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
-		                 "detected an error in element content\n",
-				 NULL, NULL);
-		}
-		break;
-	    }
 	}
         GROW;
     }
@@ -4959,7 +5124,6 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
     ctxt->vctxt.warning = xmlParserValidityWarning;
     ctxt->record_info = 0;
     ctxt->validate = 0;
-    ctxt->nbChars = 0;
     ctxt->checkIndex = 0;
     ctxt->catalogs = NULL;
     xmlInitNodeInfoSeq(&ctxt->node_seq);
@@ -5035,6 +5199,7 @@ htmlCreateMemoryParserCtxt(const char *buffer, int size) {
 
     input = xmlNewInputStream(ctxt);
     if (input == NULL) {
+	xmlFreeParserInputBuffer(buf);
 	xmlFreeParserCtxt(ctxt);
 	return(NULL);
     }
@@ -5119,7 +5284,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
  * @first:  the first char to lookup
  * @next:  the next char to lookup or zero
  * @third:  the next char to lookup or zero
- * @comment: flag to force checking inside comments
+ * @ignoreattrval: skip over attribute values
  *
  * Try to find if a sequence (first, next, third) or  just (first next) or
  * (first) is available in the input stream.
@@ -5133,13 +5298,11 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
  */
 static int
 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
-                        xmlChar next, xmlChar third, int iscomment,
-                        int ignoreattrval)
+                        xmlChar next, xmlChar third, int ignoreattrval)
 {
     int base, len;
     htmlParserInputPtr in;
     const xmlChar *buf;
-    int incomment = 0;
     int invalue = 0;
     char valdellim = 0x0;
 
@@ -5151,8 +5314,11 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
     if (base < 0)
         return (-1);
 
-    if (ctxt->checkIndex > base)
+    if (ctxt->checkIndex > base) {
         base = ctxt->checkIndex;
+        /* Abuse hasPErefs member to restore current state. */
+        invalue = ctxt->hasPErefs & 1 ? 1 : 0;
+    }
 
     if (in->buf == NULL) {
         buf = in->base;
@@ -5168,14 +5334,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
     else if (next)
         len--;
     for (; base < len; base++) {
-        if ((!incomment) && (base + 4 < len) && (!iscomment)) {
-            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
-                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
-                incomment = 1;
-                /* do not increment past <! - some people use <!--> */
-                base += 2;
-            }
-        }
         if (ignoreattrval) {
             if (buf[base] == '"' || buf[base] == '\'') {
                 if (invalue) {
@@ -5192,16 +5350,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                 continue;
             }
         }
-        if (incomment) {
-            if (base + 3 > len)
-                return (-1);
-            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
-                (buf[base + 2] == '>')) {
-                incomment = 0;
-                base += 2;
-            }
-            continue;
-        }
         if (buf[base] == first) {
             if (third != 0) {
                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
@@ -5228,8 +5376,12 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
             return (base - (in->cur - in->base));
         }
     }
-    if ((!incomment) && (!invalue))
-        ctxt->checkIndex = base;
+    ctxt->checkIndex = base;
+    /* Abuse hasPErefs member to track current state. */
+    if (invalue)
+        ctxt->hasPErefs |= 1;
+    else
+        ctxt->hasPErefs &= ~1;
 #ifdef DEBUG_PUSH
     if (next == 0)
         xmlGenericError(xmlGenericErrorContext,
@@ -5246,79 +5398,38 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
 }
 
 /**
- * htmlParseLookupChars:
+ * htmlParseLookupCommentEnd:
  * @ctxt: an HTML parser context
- * @stop: Array of chars, which stop the lookup.
- * @stopLen: Length of stop-Array
  *
- * Try to find if any char of the stop-Array is available in the input
- * stream.
+ * Try to find a comment end tag in the input stream
+ * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
+ * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  * to avoid rescanning sequences of bytes, it DOES change the state of the
  * parser, do not use liberally.
+ * This wraps to htmlParseLookupSequence()
  *
- * Returns the index to the current parsing point if a stopChar
- *      is available, -1 otherwise.
+ * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
  */
 static int
-htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
-                     int stopLen)
+htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
 {
-    int base, len;
-    htmlParserInputPtr in;
-    const xmlChar *buf;
-    int incomment = 0;
-    int i;
-
-    in = ctxt->input;
-    if (in == NULL)
-        return (-1);
-
-    base = in->cur - in->base;
-    if (base < 0)
-        return (-1);
-
-    if (ctxt->checkIndex > base)
-        base = ctxt->checkIndex;
-
-    if (in->buf == NULL) {
-        buf = in->base;
-        len = in->length;
-    } else {
-        buf = xmlBufContent(in->buf->buffer);
-        len = xmlBufUse(in->buf->buffer);
-    }
-
-    for (; base < len; base++) {
-        if (!incomment && (base + 4 < len)) {
-            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
-                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
-                incomment = 1;
-                /* do not increment past <! - some people use <!--> */
-                base += 2;
-            }
-        }
-        if (incomment) {
-            if (base + 3 > len)
-                return (-1);
-            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
-                (buf[base + 2] == '>')) {
-                incomment = 0;
-                base += 2;
-            }
-            continue;
-        }
-        for (i = 0; i < stopLen; ++i) {
-            if (buf[base] == stop[i]) {
-                ctxt->checkIndex = 0;
-                return (base - (in->cur - in->base));
-            }
-        }
+    int mark = 0;
+    int cur = CUR_PTR - BASE_PTR;
+
+    while (mark >= 0) {
+	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
+	if ((mark < 0) ||
+	    (NXT(mark+2) == '>') ||
+	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
+	    return mark;
+	}
+	ctxt->checkIndex = cur + mark + 1;
     }
-    ctxt->checkIndex = base;
-    return (-1);
+    return mark;
 }
 
+
 /**
  * htmlParseTryOrFinish:
  * @ctxt:  an HTML parser context
@@ -5332,7 +5443,7 @@ static int
 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
     int ret = 0;
     htmlParserInputPtr in;
-    int avail = 0;
+    ptrdiff_t avail = 0;
     xmlChar cur, next;
 
     htmlParserNodeInfo node_info;
@@ -5397,7 +5508,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 	if (in->buf == NULL)
 	    avail = in->length - (in->cur - in->base);
 	else
-	    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+	    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+                    (in->cur - in->base);
 	if ((avail == 0) && (terminate)) {
 	    htmlAutoCloseOnEnd(ctxt);
 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
@@ -5411,6 +5523,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 	}
         if (avail < 1)
 	    goto done;
+        /*
+         * This is done to make progress and avoid an infinite loop
+         * if a parsing attempt was aborted by hitting a NUL byte. After
+         * changing htmlCurrentChar, this probably isn't necessary anymore.
+         * We should consider removing this check.
+         */
 	cur = in->cur[0];
 	if (cur == 0) {
 	    SKIP(1);
@@ -5433,7 +5551,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    if (in->buf == NULL)
 			avail = in->length - (in->cur - in->base);
 		    else
-			avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+			avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+                                (in->cur - in->base);
 		}
 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
 		    ctxt->sax->setDocumentLocator(ctxt->userData,
@@ -5450,7 +5569,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
 		    (UPP(8) == 'E')) {
 		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5475,14 +5594,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		if (in->buf == NULL)
 		    avail = in->length - (in->cur - in->base);
 		else
-		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+                            (in->cur - in->base);
 		/*
 		 * no chars in buffer
 		 */
 		if (avail < 1)
 		    goto done;
 		/*
-		 * not enouth chars in buffer
+		 * not enough chars in buffer
 		 */
 		if (avail < 2) {
 		    if (!terminate)
@@ -5495,8 +5615,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		cur = in->cur[0];
 	        if ((cur == '<') && (next == '!') &&
 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5506,7 +5625,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    ctxt->instate = XML_PARSER_MISC;
 	        } else if ((cur == '<') && (next == '?')) {
 		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5520,7 +5639,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
 		    (UPP(8) == 'E')) {
 		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5536,7 +5655,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		           (avail < 9)) {
 		    goto done;
 		} else {
-		    ctxt->instate = XML_PARSER_START_TAG;
+		    ctxt->instate = XML_PARSER_CONTENT;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
 			    "HPP: entering START_TAG\n");
@@ -5548,15 +5667,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		if (in->buf == NULL)
 		    avail = in->length - (in->cur - in->base);
 		else
-		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+                            (in->cur - in->base);
 		if (avail < 2)
 		    goto done;
 		cur = in->cur[0];
 		next = in->cur[1];
 		if ((cur == '<') && (next == '!') &&
 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5566,7 +5685,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    ctxt->instate = XML_PARSER_PROLOG;
 	        } else if ((cur == '<') && (next == '?')) {
 		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5578,7 +5697,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		           (avail < 4)) {
 		    goto done;
 		} else {
-		    ctxt->instate = XML_PARSER_START_TAG;
+		    ctxt->instate = XML_PARSER_CONTENT;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
 			    "HPP: entering START_TAG\n");
@@ -5589,7 +5708,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		if (in->buf == NULL)
 		    avail = in->length - (in->cur - in->base);
 		else
-		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+                            (in->cur - in->base);
 		if (avail < 1)
 		    goto done;
 		cur = in->cur[0];
@@ -5602,8 +5722,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		next = in->cur[1];
 	        if ((cur == '<') && (next == '!') &&
 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5613,7 +5732,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    ctxt->instate = XML_PARSER_EPILOG;
 	        } else if ((cur == '<') && (next == '?')) {
 		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5648,7 +5767,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		if (avail < 1)
 		    goto done;
 		/*
-		 * not enouth chars in buffer
+		 * not enough chars in buffer
 		 */
 		if (avail < 2) {
 		    if (!terminate)
@@ -5677,7 +5796,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    break;
 		}
 		if ((!terminate) &&
-		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
 		    goto done;
 
                 /* Capture start position */
@@ -5769,7 +5888,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 	    }
             case XML_PARSER_CONTENT: {
 		xmlChar chr[2] = { 0, 0 };
-		long cons;
 
                 /*
 		 * Handle preparsed entities and charRef
@@ -5814,7 +5932,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    goto done;
 		cur = in->cur[0];
 		next = in->cur[1];
-		cons = ctxt->nbChars;
 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
 		    /*
@@ -5824,7 +5941,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		        int idx;
 			xmlChar val;
 
-			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
+			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
 			if (idx < 0)
 			    goto done;
 		        val = in->cur[idx + 2];
@@ -5851,7 +5968,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
 			(UPP(8) == 'E')) {
 			if ((!terminate) &&
-			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
 			    goto done;
 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
 			             "Misplaced DOCTYPE declaration\n",
@@ -5859,9 +5976,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 			htmlParseDocTypeDecl(ctxt);
 		    } else if ((cur == '<') && (next == '!') &&
 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
-			if ((!terminate) &&
-			    (htmlParseLookupSequence(
-				ctxt, '-', '-', '>', 1, 1) < 0))
+			if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
 			    goto done;
 #ifdef DEBUG_PUSH
 			xmlGenericError(xmlGenericErrorContext,
@@ -5871,7 +5986,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 			ctxt->instate = XML_PARSER_CONTENT;
 		    } else if ((cur == '<') && (next == '?')) {
 			if ((!terminate) &&
-			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
 			    goto done;
 #ifdef DEBUG_PUSH
 			xmlGenericError(xmlGenericErrorContext,
@@ -5890,24 +6005,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 #endif
 			break;
 		    } else if (cur == '<') {
-			ctxt->instate = XML_PARSER_START_TAG;
-			ctxt->checkIndex = 0;
+                        if ((!terminate) && (next == 0))
+                            goto done;
+                        ctxt->instate = XML_PARSER_START_TAG;
+                        ctxt->checkIndex = 0;
 #ifdef DEBUG_PUSH
-			xmlGenericError(xmlGenericErrorContext,
-				"HPP: entering START_TAG\n");
+                        xmlGenericError(xmlGenericErrorContext,
+                                "HPP: entering START_TAG\n");
 #endif
 			break;
-		    } else if (cur == '&') {
-			if ((!terminate) &&
-			    (htmlParseLookupChars(ctxt,
-                                                  BAD_CAST "; >/", 4) < 0))
-			    goto done;
-#ifdef DEBUG_PUSH
-			xmlGenericError(xmlGenericErrorContext,
-				"HPP: Parsing Reference\n");
-#endif
-			/* TODO: check generation of subtrees if noent !!! */
-			htmlParseReference(ctxt);
 		    } else {
 		        /*
 			 * check that the text sequence is complete
@@ -5916,24 +6022,23 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 			 * data detection.
 			 */
 			if ((!terminate) &&
-                            (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
+                            (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
 			    goto done;
 			ctxt->checkIndex = 0;
 #ifdef DEBUG_PUSH
 			xmlGenericError(xmlGenericErrorContext,
 				"HPP: Parsing char data\n");
 #endif
-			htmlParseCharData(ctxt);
-		    }
-		}
-		if (cons == ctxt->nbChars) {
-		    if (ctxt->node != NULL) {
-			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
-			             "detected an error in element content\n",
-				     NULL, NULL);
+                        while ((ctxt->instate != XML_PARSER_EOF) &&
+                               (cur != '<') && (in->cur < in->end)) {
+                            if (cur == '&') {
+			        htmlParseReference(ctxt);
+                            } else {
+			        htmlParseCharData(ctxt);
+                            }
+                            cur = in->cur[0];
+                        }
 		    }
-		    NEXT;
-		    break;
 		}
 
 		break;
@@ -5942,7 +6047,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		if (avail < 2)
 		    goto done;
 		if ((!terminate) &&
-		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
 		    goto done;
 		htmlParseEndTag(ctxt);
 		if (ctxt->nameNr == 0) {
@@ -6124,12 +6229,12 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
 	int res;
 
 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
+        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
 	if (res < 0) {
 	    ctxt->errNo = XML_PARSER_EOF;
 	    ctxt->disableSAX = 1;
 	    return (XML_PARSER_EOF);
 	}
-        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
 #ifdef DEBUG_PUSH
 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
 #endif
@@ -6148,12 +6253,12 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
 		size_t current = ctxt->input->cur - ctxt->input->base;
 
 		nbchars = xmlCharEncInput(in, terminate);
+		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
 		if (nbchars < 0) {
 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 			         "encoder error\n", NULL, NULL);
 		    return(XML_ERR_INVALID_ENCODING);
 		}
-		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
 	    }
 	}
     }
@@ -6671,7 +6776,6 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
     ctxt->vctxt.error = xmlParserValidityError;
     ctxt->vctxt.warning = xmlParserValidityWarning;
     ctxt->record_info = 0;
-    ctxt->nbChars = 0;
     ctxt->checkIndex = 0;
     ctxt->inSubset = 0;
     ctxt->errNo = XML_ERR_OK;
@@ -6890,7 +6994,9 @@ htmlReadMemory(const char *buffer, int size, const char *URL, const char *encodi
  * @encoding:  the document encoding, or NULL
  * @options:  a combination of htmlParserOption(s)
  *
- * parse an XML from a file descriptor and build a tree.
+ * parse an HTML from a file descriptor and build a tree.
+ * NOTE that the file descriptor will not be closed when the
+ *      reader is closed or reset.
  *
  * Returns the resulting document tree
  */
@@ -6899,17 +7005,17 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options)
 {
     htmlParserCtxtPtr ctxt;
     xmlParserInputBufferPtr input;
-    xmlParserInputPtr stream;
+    htmlParserInputPtr stream;
 
     if (fd < 0)
         return (NULL);
-    xmlInitParser();
 
     xmlInitParser();
     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
     if (input == NULL)
         return (NULL);
-    ctxt = xmlNewParserCtxt();
+    input->closecallback = NULL;
+    ctxt = htmlNewParserCtxt();
     if (ctxt == NULL) {
         xmlFreeParserInputBuffer(input);
         return (NULL);
@@ -6917,7 +7023,7 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options)
     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
     if (stream == NULL) {
         xmlFreeParserInputBuffer(input);
-	xmlFreeParserCtxt(ctxt);
+	htmlFreeParserCtxt(ctxt);
         return (NULL);
     }
     inputPush(ctxt, stream);
author	setser <setser@yandex-team.ru>	2022-05-09 00:13:37 +0300
committer	setser <setser@yandex-team.ru>	2022-05-09 00:13:37 +0300
commit	e87e3fc8d0e04eb7ba3eee221bb91613b527ad85 (patch)
tree	5279c128bdbdf902b9a08d9fae8e55b91910a553 /contrib/libs/libxml/HTMLparser.c
parent	f4f3e4024a1f32bd0bc3fa20239025a1b179e42d (diff)
download	ydb-e87e3fc8d0e04eb7ba3eee221bb91613b527ad85.tar.gz