about | summary | refs | log | tree | commit | diff | stats
path: root/contrib/libs/libxml/HTMLparser.c
diff options
context:
space:
mode:
author: setser <setser@yandex-team.ru> 2022-05-09 00:13:37 +0300
committer: setser <setser@yandex-team.ru> 2022-05-09 00:13:37 +0300
commit: e87e3fc8d0e04eb7ba3eee221bb91613b527ad85 (patch)
tree: 5279c128bdbdf902b9a08d9fae8e55b91910a553 /contrib/libs/libxml/HTMLparser.c
parent: f4f3e4024a1f32bd0bc3fa20239025a1b179e42d (diff)
download: ydb-e87e3fc8d0e04eb7ba3eee221bb91613b527ad85.tar.gz
Update libxml to 2.9.13
ref:f572491d236694e847142c36f0f5546c649e05d7
Diffstat (limited to 'contrib/libs/libxml/HTMLparser.c')
-rw-r--r-- contrib/libs/libxml/HTMLparser.c 1208
1 file changed, 657 insertions, 551 deletions
diff --git a/contrib/libs/libxml/HTMLparser.c b/contrib/libs/libxml/HTMLparser.c
index 7b6d68961c..3e8a165740 100644
--- a/contrib/libs/libxml/HTMLparser.c
+++ b/contrib/libs/libxml/HTMLparser.c
@@ -69,7 +69,7 @@ static void htmlParseComment(htmlParserCtxtPtr ctxt);
/**
* htmlErrMemory:
* @ctxt: an HTML parser context
- * @extra: extra informations
+ * @extra: extra information
*
* Handle a redefinition of attribute error
*/
@@ -296,7 +296,7 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
#define UPPER (toupper(*ctxt->input->cur))
-#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
+#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
#define NXT(val) ctxt->input->cur[(val)]
@@ -330,7 +330,7 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
if (*(ctxt->input->cur) == '\n') { \
ctxt->input->line++; ctxt->input->col = 1; \
} else ctxt->input->col++; \
- ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
+ ctxt->token = 0; ctxt->input->cur += l; \
} while (0)
/************
@@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {
static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
+ const unsigned char *cur;
+ unsigned char c;
+ unsigned int val;
+
if (ctxt->instate == XML_PARSER_EOF)
return(0);
@@ -421,99 +425,29 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
*len = 0;
return(ctxt->token);
}
- if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
- /*
- * We are supposed to handle UTF8, check it's valid
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.) UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F 0xxxxxxx
- * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
- *
- * Check for the 0x110000 limit too
- */
- const unsigned char *cur = ctxt->input->cur;
- unsigned char c;
- unsigned int val;
-
- c = *cur;
- if (c & 0x80) {
- if (cur[1] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if ((cur[1] & 0xc0) != 0x80)
- goto encoding_error;
- if ((c & 0xe0) == 0xe0) {
+ if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
+ xmlChar * guess;
+ xmlCharEncodingHandlerPtr handler;
- if (cur[2] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if ((cur[2] & 0xc0) != 0x80)
- goto encoding_error;
- if ((c & 0xf0) == 0xf0) {
- if (cur[3] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if (((c & 0xf8) != 0xf0) ||
- ((cur[3] & 0xc0) != 0x80))
- goto encoding_error;
- /* 4-byte code */
- *len = 4;
- val = (cur[0] & 0x7) << 18;
- val |= (cur[1] & 0x3f) << 12;
- val |= (cur[2] & 0x3f) << 6;
- val |= cur[3] & 0x3f;
- } else {
- /* 3-byte code */
- *len = 3;
- val = (cur[0] & 0xf) << 12;
- val |= (cur[1] & 0x3f) << 6;
- val |= cur[2] & 0x3f;
- }
- } else {
- /* 2-byte code */
- *len = 2;
- val = (cur[0] & 0x1f) << 6;
- val |= cur[1] & 0x3f;
- }
- if (!IS_CHAR(val)) {
- htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
- "Char 0x%X out of allowed range\n", val);
- }
- return(val);
- } else {
+ /*
+ * Assume it's a fixed length encoding (1) with
+ * a compatible encoding for the ASCII set, since
+ * HTML constructs only use < 128 chars
+ */
+ if ((int) *ctxt->input->cur < 0x80) {
+ *len = 1;
if ((*ctxt->input->cur == 0) &&
(ctxt->input->cur < ctxt->input->end)) {
- htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
- "Char 0x%X out of allowed range\n", 0);
- *len = 1;
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Char 0x%X out of allowed range\n", 0);
return(' ');
}
- /* 1-byte code */
- *len = 1;
- return((int) *ctxt->input->cur);
- }
- }
- /*
- * Assume it's a fixed length encoding (1) with
- * a compatible encoding for the ASCII set, since
- * XML constructs only use < 128 chars
- */
- *len = 1;
- if ((int) *ctxt->input->cur < 0x80)
- return((int) *ctxt->input->cur);
-
- /*
- * Humm this is bad, do an automatic flow conversion
- */
- {
- xmlChar * guess;
- xmlCharEncodingHandlerPtr handler;
+ return((int) *ctxt->input->cur);
+ }
+ /*
+ * Humm this is bad, do an automatic flow conversion
+ */
guess = htmlFindEncoding(ctxt);
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
@@ -523,7 +457,12 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
ctxt->input->encoding = guess;
handler = xmlFindCharEncodingHandler((const char *) guess);
if (handler != NULL) {
- xmlSwitchToEncoding(ctxt, handler);
+ /*
+ * Don't use UTF-8 encoder which isn't required and
+ * can produce invalid UTF-8.
+ */
+ if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
+ xmlSwitchToEncoding(ctxt, handler);
} else {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"Unsupported encoding %s", guess, NULL);
@@ -532,7 +471,86 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
ctxt->charset = XML_CHAR_ENCODING_UTF8;
}
- return(xmlCurrentChar(ctxt, len));
+ /*
+ * We are supposed to handle UTF8, check it's valid
+ * From rfc2044: encoding of the Unicode values on UTF-8:
+ *
+ * UCS-4 range (hex.) UTF-8 octet sequence (binary)
+ * 0000 0000-0000 007F 0xxxxxxx
+ * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
+ *
+ * Check for the 0x110000 limit too
+ */
+ cur = ctxt->input->cur;
+ c = *cur;
+ if (c & 0x80) {
+ if ((c & 0x40) == 0)
+ goto encoding_error;
+ if (cur[1] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if ((cur[1] & 0xc0) != 0x80)
+ goto encoding_error;
+ if ((c & 0xe0) == 0xe0) {
+
+ if (cur[2] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if ((cur[2] & 0xc0) != 0x80)
+ goto encoding_error;
+ if ((c & 0xf0) == 0xf0) {
+ if (cur[3] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if (((c & 0xf8) != 0xf0) ||
+ ((cur[3] & 0xc0) != 0x80))
+ goto encoding_error;
+ /* 4-byte code */
+ *len = 4;
+ val = (cur[0] & 0x7) << 18;
+ val |= (cur[1] & 0x3f) << 12;
+ val |= (cur[2] & 0x3f) << 6;
+ val |= cur[3] & 0x3f;
+ if (val < 0x10000)
+ goto encoding_error;
+ } else {
+ /* 3-byte code */
+ *len = 3;
+ val = (cur[0] & 0xf) << 12;
+ val |= (cur[1] & 0x3f) << 6;
+ val |= cur[2] & 0x3f;
+ if (val < 0x800)
+ goto encoding_error;
+ }
+ } else {
+ /* 2-byte code */
+ *len = 2;
+ val = (cur[0] & 0x1f) << 6;
+ val |= cur[1] & 0x3f;
+ if (val < 0x80)
+ goto encoding_error;
+ }
+ if (!IS_CHAR(val)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Char 0x%X out of allowed range\n", val);
+ }
+ return(val);
+ } else {
+ if ((*ctxt->input->cur == 0) &&
+ (ctxt->input->cur < ctxt->input->end)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Char 0x%X out of allowed range\n", 0);
+ *len = 1;
+ return(' ');
+ }
+ /* 1-byte code */
+ *len = 1;
+ return((int) *ctxt->input->cur);
+ }
encoding_error:
/*
@@ -557,7 +575,16 @@ encoding_error:
BAD_CAST buffer, NULL);
}
- ctxt->charset = XML_CHAR_ENCODING_8859_1;
+ /*
+ * Don't switch encodings twice. Note that if there's an encoder, we
+ * shouldn't receive invalid UTF-8 anyway.
+ *
+ * Note that if ctxt->input->buf == NULL, switching encodings is
+ * impossible, see Gitlab issue #34.
+ */
+ if ((ctxt->input->buf != NULL) &&
+ (ctxt->input->buf->encoder == NULL))
+ xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
*len = 1;
return((int) *ctxt->input->cur);
}
@@ -584,7 +611,6 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
ctxt->input->line++; ctxt->input->col = 1;
} else ctxt->input->col++;
ctxt->input->cur++;
- ctxt->nbChars++;
if (*ctxt->input->cur == 0)
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
}
@@ -1046,102 +1072,266 @@ html40ElementTable[] = {
}
};
+typedef struct {
+ const char *oldTag;
+ const char *newTag;
+} htmlStartCloseEntry;
+
/*
* start tags that imply the end of current element
*/
-static const char * const htmlStartClose[] = {
-"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
- "dl", "ul", "ol", "menu", "dir", "address", "pre",
- "listing", "xmp", "head", NULL,
-"head", "p", NULL,
-"title", "p", NULL,
-"body", "head", "style", "link", "title", "p", NULL,
-"frameset", "head", "style", "link", "title", "p", NULL,
-"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
- "pre", "listing", "xmp", "head", "li", NULL,
-"hr", "p", "head", NULL,
-"h1", "p", "head", NULL,
-"h2", "p", "head", NULL,
-"h3", "p", "head", NULL,
-"h4", "p", "head", NULL,
-"h5", "p", "head", NULL,
-"h6", "p", "head", NULL,
-"dir", "p", "head", NULL,
-"address", "p", "head", "ul", NULL,
-"pre", "p", "head", "ul", NULL,
-"listing", "p", "head", NULL,
-"xmp", "p", "head", NULL,
-"blockquote", "p", "head", NULL,
-"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
- "xmp", "head", NULL,
-"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
- "head", "dd", NULL,
-"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
- "head", "dt", NULL,
-"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
- "listing", "xmp", NULL,
-"ol", "p", "head", "ul", NULL,
-"menu", "p", "head", "ul", NULL,
-"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
-"div", "p", "head", NULL,
-"noscript", "script", NULL,
-"center", "font", "b", "i", "p", "head", NULL,
-"a", "a", "head", NULL,
-"caption", "p", NULL,
-"colgroup", "caption", "colgroup", "col", "p", NULL,
-"col", "caption", "col", "p", NULL,
-"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
- "listing", "xmp", "a", NULL,
-"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
-"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
-"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
-"thead", "caption", "col", "colgroup", NULL,
-"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
- "tbody", "p", NULL,
-"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
- "tfoot", "tbody", "p", NULL,
-"optgroup", "option", NULL,
-"option", "option", NULL,
-"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
- "pre", "listing", "xmp", "a", NULL,
-/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
-"tt", "head", NULL,
-"i", "head", NULL,
-"b", "head", NULL,
-"u", "head", NULL,
-"s", "head", NULL,
-"strike", "head", NULL,
-"big", "head", NULL,
-"small", "head", NULL,
-
-"em", "head", NULL,
-"strong", "head", NULL,
-"dfn", "head", NULL,
-"code", "head", NULL,
-"samp", "head", NULL,
-"kbd", "head", NULL,
-"var", "head", NULL,
-"cite", "head", NULL,
-"abbr", "head", NULL,
-"acronym", "head", NULL,
-
-/* "a" */
-"img", "head", NULL,
-/* "applet" */
-/* "embed" */
-/* "object" */
-"font", "head", NULL,
-/* "basefont" */
-"br", "head", NULL,
-/* "script" */
-"map", "head", NULL,
-"q", "head", NULL,
-"sub", "head", NULL,
-"sup", "head", NULL,
-"span", "head", NULL,
-"bdo", "head", NULL,
-"iframe", "head", NULL,
-NULL
+static const htmlStartCloseEntry htmlStartClose[] = {
+ { "a", "a" },
+ { "a", "fieldset" },
+ { "a", "table" },
+ { "a", "td" },
+ { "a", "th" },
+ { "address", "dd" },
+ { "address", "dl" },
+ { "address", "dt" },
+ { "address", "form" },
+ { "address", "li" },
+ { "address", "ul" },
+ { "b", "center" },
+ { "b", "p" },
+ { "b", "td" },
+ { "b", "th" },
+ { "big", "p" },
+ { "caption", "col" },
+ { "caption", "colgroup" },
+ { "caption", "tbody" },
+ { "caption", "tfoot" },
+ { "caption", "thead" },
+ { "caption", "tr" },
+ { "col", "col" },
+ { "col", "colgroup" },
+ { "col", "tbody" },
+ { "col", "tfoot" },
+ { "col", "thead" },
+ { "col", "tr" },
+ { "colgroup", "colgroup" },
+ { "colgroup", "tbody" },
+ { "colgroup", "tfoot" },
+ { "colgroup", "thead" },
+ { "colgroup", "tr" },
+ { "dd", "dt" },
+ { "dir", "dd" },
+ { "dir", "dl" },
+ { "dir", "dt" },
+ { "dir", "form" },
+ { "dir", "ul" },
+ { "dl", "form" },
+ { "dl", "li" },
+ { "dt", "dd" },
+ { "dt", "dl" },
+ { "font", "center" },
+ { "font", "td" },
+ { "font", "th" },
+ { "form", "form" },
+ { "h1", "fieldset" },
+ { "h1", "form" },
+ { "h1", "li" },
+ { "h1", "p" },
+ { "h1", "table" },
+ { "h2", "fieldset" },
+ { "h2", "form" },
+ { "h2", "li" },
+ { "h2", "p" },
+ { "h2", "table" },
+ { "h3", "fieldset" },
+ { "h3", "form" },
+ { "h3", "li" },
+ { "h3", "p" },
+ { "h3", "table" },
+ { "h4", "fieldset" },
+ { "h4", "form" },
+ { "h4", "li" },
+ { "h4", "p" },
+ { "h4", "table" },
+ { "h5", "fieldset" },
+ { "h5", "form" },
+ { "h5", "li" },
+ { "h5", "p" },
+ { "h5", "table" },
+ { "h6", "fieldset" },
+ { "h6", "form" },
+ { "h6", "li" },
+ { "h6", "p" },
+ { "h6", "table" },
+ { "head", "a" },
+ { "head", "abbr" },
+ { "head", "acronym" },
+ { "head", "address" },
+ { "head", "b" },
+ { "head", "bdo" },
+ { "head", "big" },
+ { "head", "blockquote" },
+ { "head", "body" },
+ { "head", "br" },
+ { "head", "center" },
+ { "head", "cite" },
+ { "head", "code" },
+ { "head", "dd" },
+ { "head", "dfn" },
+ { "head", "dir" },
+ { "head", "div" },
+ { "head", "dl" },
+ { "head", "dt" },
+ { "head", "em" },
+ { "head", "fieldset" },
+ { "head", "font" },
+ { "head", "form" },
+ { "head", "frameset" },
+ { "head", "h1" },
+ { "head", "h2" },
+ { "head", "h3" },
+ { "head", "h4" },
+ { "head", "h5" },
+ { "head", "h6" },
+ { "head", "hr" },
+ { "head", "i" },
+ { "head", "iframe" },
+ { "head", "img" },
+ { "head", "kbd" },
+ { "head", "li" },
+ { "head", "listing" },
+ { "head", "map" },
+ { "head", "menu" },
+ { "head", "ol" },
+ { "head", "p" },
+ { "head", "pre" },
+ { "head", "q" },
+ { "head", "s" },
+ { "head", "samp" },
+ { "head", "small" },
+ { "head", "span" },
+ { "head", "strike" },
+ { "head", "strong" },
+ { "head", "sub" },
+ { "head", "sup" },
+ { "head", "table" },
+ { "head", "tt" },
+ { "head", "u" },
+ { "head", "ul" },
+ { "head", "var" },
+ { "head", "xmp" },
+ { "hr", "form" },
+ { "i", "center" },
+ { "i", "p" },
+ { "i", "td" },
+ { "i", "th" },
+ { "legend", "fieldset" },
+ { "li", "li" },
+ { "link", "body" },
+ { "link", "frameset" },
+ { "listing", "dd" },
+ { "listing", "dl" },
+ { "listing", "dt" },
+ { "listing", "fieldset" },
+ { "listing", "form" },
+ { "listing", "li" },
+ { "listing", "table" },
+ { "listing", "ul" },
+ { "menu", "dd" },
+ { "menu", "dl" },
+ { "menu", "dt" },
+ { "menu", "form" },
+ { "menu", "ul" },
+ { "ol", "form" },
+ { "ol", "ul" },
+ { "option", "optgroup" },
+ { "option", "option" },
+ { "p", "address" },
+ { "p", "blockquote" },
+ { "p", "body" },
+ { "p", "caption" },
+ { "p", "center" },
+ { "p", "col" },
+ { "p", "colgroup" },
+ { "p", "dd" },
+ { "p", "dir" },
+ { "p", "div" },
+ { "p", "dl" },
+ { "p", "dt" },
+ { "p", "fieldset" },
+ { "p", "form" },
+ { "p", "frameset" },
+ { "p", "h1" },
+ { "p", "h2" },
+ { "p", "h3" },
+ { "p", "h4" },
+ { "p", "h5" },
+ { "p", "h6" },
+ { "p", "head" },
+ { "p", "hr" },
+ { "p", "li" },
+ { "p", "listing" },
+ { "p", "menu" },
+ { "p", "ol" },
+ { "p", "p" },
+ { "p", "pre" },
+ { "p", "table" },
+ { "p", "tbody" },
+ { "p", "td" },
+ { "p", "tfoot" },
+ { "p", "th" },
+ { "p", "title" },
+ { "p", "tr" },
+ { "p", "ul" },
+ { "p", "xmp" },
+ { "pre", "dd" },
+ { "pre", "dl" },
+ { "pre", "dt" },
+ { "pre", "fieldset" },
+ { "pre", "form" },
+ { "pre", "li" },
+ { "pre", "table" },
+ { "pre", "ul" },
+ { "s", "p" },
+ { "script", "noscript" },
+ { "small", "p" },
+ { "span", "td" },
+ { "span", "th" },
+ { "strike", "p" },
+ { "style", "body" },
+ { "style", "frameset" },
+ { "tbody", "tbody" },
+ { "tbody", "tfoot" },
+ { "td", "tbody" },
+ { "td", "td" },
+ { "td", "tfoot" },
+ { "td", "th" },
+ { "td", "tr" },
+ { "tfoot", "tbody" },
+ { "th", "tbody" },
+ { "th", "td" },
+ { "th", "tfoot" },
+ { "th", "th" },
+ { "th", "tr" },
+ { "thead", "tbody" },
+ { "thead", "tfoot" },
+ { "title", "body" },
+ { "title", "frameset" },
+ { "tr", "tbody" },
+ { "tr", "tfoot" },
+ { "tr", "tr" },
+ { "tt", "p" },
+ { "u", "p" },
+ { "u", "td" },
+ { "u", "th" },
+ { "ul", "address" },
+ { "ul", "form" },
+ { "ul", "menu" },
+ { "ul", "ol" },
+ { "ul", "pre" },
+ { "xmp", "dd" },
+ { "xmp", "dl" },
+ { "xmp", "dt" },
+ { "xmp", "fieldset" },
+ { "xmp", "form" },
+ { "xmp", "li" },
+ { "xmp", "table" },
+ { "xmp", "ul" }
};
/*
@@ -1211,9 +1401,6 @@ static const elementPriority htmlEndPriority[] = {
{NULL, 100} /* Default priority */
};
-static const char** htmlStartCloseIndex[100];
-static int htmlStartCloseIndexinitialized = 0;
-
/************************************************************************
* *
* functions to handle HTML specific data *
@@ -1223,24 +1410,18 @@ static int htmlStartCloseIndexinitialized = 0;
/**
* htmlInitAutoClose:
*
- * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
- * This is not reentrant. Call xmlInitParser() once before processing in
- * case of use in multithreaded programs.
+ * This is a no-op now.
*/
void
htmlInitAutoClose(void) {
- int indx, i = 0;
+}
- if (htmlStartCloseIndexinitialized) return;
+static int
+htmlCompareTags(const void *key, const void *member) {
+ const xmlChar *tag = (const xmlChar *) key;
+ const htmlElemDesc *desc = (const htmlElemDesc *) member;
- for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
- indx = 0;
- while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
- htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
- while (htmlStartClose[i] != NULL) i++;
- i++;
- }
- htmlStartCloseIndexinitialized = 1;
+ return(xmlStrcasecmp(tag, BAD_CAST desc->name));
}
/**
@@ -1253,14 +1434,12 @@ htmlInitAutoClose(void) {
*/
const htmlElemDesc *
htmlTagLookup(const xmlChar *tag) {
- unsigned int i;
+ if (tag == NULL)
+ return(NULL);
- for (i = 0; i < (sizeof(html40ElementTable) /
- sizeof(html40ElementTable[0]));i++) {
- if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
- return((htmlElemDescPtr) &html40ElementTable[i]);
- }
- return(NULL);
+ return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
+ sizeof(html40ElementTable) / sizeof(htmlElemDesc),
+ sizeof(htmlElemDesc), htmlCompareTags));
}
/**
@@ -1281,6 +1460,19 @@ htmlGetEndPriority (const xmlChar *name) {
}
+static int
+htmlCompareStartClose(const void *vkey, const void *member) {
+ const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
+ const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
+ int ret;
+
+ ret = strcmp(key->oldTag, entry->oldTag);
+ if (ret == 0)
+ ret = strcmp(key->newTag, entry->newTag);
+
+ return(ret);
+}
+
/**
* htmlCheckAutoClose:
* @newtag: The new tag name
@@ -1288,37 +1480,21 @@ htmlGetEndPriority (const xmlChar *name) {
*
* Checks whether the new tag is one of the registered valid tags for
* closing old.
- * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
*
* Returns 0 if no, 1 if yes.
*/
static int
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
{
- int i, indx;
- const char **closed = NULL;
-
- if (htmlStartCloseIndexinitialized == 0)
- htmlInitAutoClose();
-
- /* inefficient, but not a big deal */
- for (indx = 0; indx < 100; indx++) {
- closed = htmlStartCloseIndex[indx];
- if (closed == NULL)
- return (0);
- if (xmlStrEqual(BAD_CAST * closed, newtag))
- break;
- }
-
- i = closed - htmlStartClose;
- i++;
- while (htmlStartClose[i] != NULL) {
- if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
- return (1);
- }
- i++;
- }
- return (0);
+ htmlStartCloseEntry key;
+ void *res;
+
+ key.oldTag = (const char *) oldtag;
+ key.newTag = (const char *) newtag;
+ res = bsearch(&key, htmlStartClose,
+ sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
+ sizeof(htmlStartCloseEntry), htmlCompareStartClose);
+ return(res != NULL);
}
/**
@@ -2341,6 +2517,8 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
if ((ExternalID != NULL) ||
(URI != NULL))
xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
+ if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
+ xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
return(cur);
}
@@ -2482,7 +2660,6 @@ htmlParseName(htmlParserCtxtPtr ctxt) {
count = in - ctxt->input->cur;
ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
ctxt->input->cur = in;
- ctxt->nbChars += count;
ctxt->input->col += count;
return(ret);
}
@@ -2789,47 +2966,39 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {
static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
size_t len = 0, startPosition = 0;
+ int err = 0;
+ int quote;
xmlChar *ret = NULL;
- if (CUR == '"') {
- NEXT;
+ if ((CUR != '"') && (CUR != '\'')) {
+ htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
+ "SystemLiteral \" or ' expected\n", NULL, NULL);
+ return(NULL);
+ }
+ quote = CUR;
+ NEXT;
- if (CUR_PTR < BASE_PTR)
- return(ret);
- startPosition = CUR_PTR - BASE_PTR;
+ if (CUR_PTR < BASE_PTR)
+ return(ret);
+ startPosition = CUR_PTR - BASE_PTR;
- while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
- NEXT;
- len++;
- }
- if (!IS_CHAR_CH(CUR)) {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
- "Unfinished SystemLiteral\n", NULL, NULL);
- } else {
- ret = xmlStrndup((BASE_PTR+startPosition), len);
- NEXT;
+ while ((CUR != 0) && (CUR != quote)) {
+ /* TODO: Handle UTF-8 */
+ if (!IS_CHAR_CH(CUR)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in SystemLiteral 0x%X\n", CUR);
+ err = 1;
}
- } else if (CUR == '\'') {
NEXT;
-
- if (CUR_PTR < BASE_PTR)
- return(ret);
- startPosition = CUR_PTR - BASE_PTR;
-
- while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
- NEXT;
- len++;
- }
- if (!IS_CHAR_CH(CUR)) {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
- "Unfinished SystemLiteral\n", NULL, NULL);
- } else {
- ret = xmlStrndup((BASE_PTR+startPosition), len);
- NEXT;
- }
+ len++;
+ }
+ if (CUR != quote) {
+ htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
+ "Unfinished SystemLiteral\n", NULL, NULL);
} else {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
- " or ' expected\n", NULL, NULL);
+ NEXT;
+ if (err == 0)
+ ret = xmlStrndup((BASE_PTR+startPosition), len);
}
return(ret);
@@ -2849,51 +3018,42 @@ htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
size_t len = 0, startPosition = 0;
+ int err = 0;
+ int quote;
xmlChar *ret = NULL;
+
+ if ((CUR != '"') && (CUR != '\'')) {
+ htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
+ "PubidLiteral \" or ' expected\n", NULL, NULL);
+ return(NULL);
+ }
+ quote = CUR;
+ NEXT;
+
/*
* Name ::= (Letter | '_') (NameChar)*
*/
- if (CUR == '"') {
- NEXT;
-
- if (CUR_PTR < BASE_PTR)
- return(ret);
- startPosition = CUR_PTR - BASE_PTR;
-
- while (IS_PUBIDCHAR_CH(CUR)) {
- len++;
- NEXT;
+ if (CUR_PTR < BASE_PTR)
+ return(ret);
+ startPosition = CUR_PTR - BASE_PTR;
+
+ while ((CUR != 0) && (CUR != quote)) {
+ if (!IS_PUBIDCHAR_CH(CUR)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in PubidLiteral 0x%X\n", CUR);
+ err = 1;
}
-
- if (CUR != '"') {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
- "Unfinished PubidLiteral\n", NULL, NULL);
- } else {
- ret = xmlStrndup((BASE_PTR + startPosition), len);
- NEXT;
- }
- } else if (CUR == '\'') {
+ len++;
NEXT;
+ }
- if (CUR_PTR < BASE_PTR)
- return(ret);
- startPosition = CUR_PTR - BASE_PTR;
-
- while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
- len++;
- NEXT;
- }
-
- if (CUR != '\'') {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
- "Unfinished PubidLiteral\n", NULL, NULL);
- } else {
- ret = xmlStrndup((BASE_PTR + startPosition), len);
- NEXT;
- }
+ if (CUR != quote) {
+ htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
+ "Unfinished PubidLiteral\n", NULL, NULL);
} else {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
- "PubidLiteral \" or ' expected\n", NULL, NULL);
+ NEXT;
+ if (err == 0)
+ ret = xmlStrndup((BASE_PTR + startPosition), len);
}
return(ret);
@@ -2928,7 +3088,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
SHRINK;
cur = CUR_CHAR(l);
- while (IS_CHAR_CH(cur)) {
+ while (cur != 0) {
if ((cur == '<') && (NXT(1) == '/')) {
/*
* One should break here, the specification is clear:
@@ -2959,7 +3119,12 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
}
}
}
- COPY_BUF(l,buf,nbchar,cur);
+ if (IS_CHAR(cur)) {
+ COPY_BUF(l,buf,nbchar,cur);
+ } else {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in CDATA 0x%X\n", cur);
+ }
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
buf[nbchar] = 0;
if (ctxt->sax->cdataBlock!= NULL) {
@@ -2977,14 +3142,6 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
cur = CUR_CHAR(l);
}
- if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
- htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
- "Invalid char in CDATA 0x%X\n", cur);
- if (ctxt->input->cur < ctxt->input->end) {
- NEXT;
- }
- }
-
if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
buf[nbchar] = 0;
if (ctxt->sax->cdataBlock!= NULL) {
@@ -3232,7 +3389,7 @@ htmlParsePI(htmlParserCtxtPtr ctxt) {
}
SKIP_BLANKS;
cur = CUR_CHAR(l);
- while (IS_CHAR(cur) && (cur != '>')) {
+ while ((cur != 0) && (cur != '>')) {
if (len + 5 >= size) {
xmlChar *tmp;
@@ -3251,7 +3408,13 @@ htmlParsePI(htmlParserCtxtPtr ctxt) {
GROW;
count = 0;
}
- COPY_BUF(l,buf,len,cur);
+ if (IS_CHAR(cur)) {
+ COPY_BUF(l,buf,len,cur);
+ } else {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in processing instruction "
+ "0x%X\n", cur);
+ }
NEXTL(l);
cur = CUR_CHAR(l);
if (cur == 0) {
@@ -3300,6 +3463,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
int q, ql;
int r, rl;
int cur, l;
+ int next, nl;
xmlParserInputState state;
/*
@@ -3321,17 +3485,32 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
len = 0;
buf[len] = 0;
q = CUR_CHAR(ql);
- if (!IS_CHAR(q))
+ if (q == 0)
goto unfinished;
NEXTL(ql);
r = CUR_CHAR(rl);
- if (!IS_CHAR(r))
+ if (r == 0)
goto unfinished;
NEXTL(rl);
cur = CUR_CHAR(l);
- while (IS_CHAR(cur) &&
+ while ((cur != 0) &&
((cur != '>') ||
(r != '-') || (q != '-'))) {
+ NEXTL(l);
+ next = CUR_CHAR(nl);
+ if (next == 0) {
+ SHRINK;
+ GROW;
+ next = CUR_CHAR(nl);
+ }
+
+ if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
+ htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
+ "Comment incorrectly closed by '--!>'", NULL, NULL);
+ cur = '>';
+ break;
+ }
+
if (len + 5 >= size) {
xmlChar *tmp;
@@ -3345,21 +3524,22 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
}
buf = tmp;
}
- COPY_BUF(ql,buf,len,q);
+ if (IS_CHAR(q)) {
+ COPY_BUF(ql,buf,len,q);
+ } else {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in comment 0x%X\n", q);
+ }
+
q = r;
ql = rl;
r = cur;
rl = l;
- NEXTL(l);
- cur = CUR_CHAR(l);
- if (cur == 0) {
- SHRINK;
- GROW;
- cur = CUR_CHAR(l);
- }
+ cur = next;
+ l = nl;
}
buf[len] = 0;
- if (IS_CHAR(cur)) {
+ if (cur == '>') {
NEXT;
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
(!ctxt->disableSAX))
@@ -3400,13 +3580,16 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
((NXT(2) == 'x') || NXT(2) == 'X')) {
SKIP(3);
while (CUR != ';') {
- if ((CUR >= '0') && (CUR <= '9'))
- val = val * 16 + (CUR - '0');
- else if ((CUR >= 'a') && (CUR <= 'f'))
- val = val * 16 + (CUR - 'a') + 10;
- else if ((CUR >= 'A') && (CUR <= 'F'))
- val = val * 16 + (CUR - 'A') + 10;
- else {
+ if ((CUR >= '0') && (CUR <= '9')) {
+ if (val < 0x110000)
+ val = val * 16 + (CUR - '0');
+ } else if ((CUR >= 'a') && (CUR <= 'f')) {
+ if (val < 0x110000)
+ val = val * 16 + (CUR - 'a') + 10;
+ } else if ((CUR >= 'A') && (CUR <= 'F')) {
+ if (val < 0x110000)
+ val = val * 16 + (CUR - 'A') + 10;
+ } else {
htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
"htmlParseCharRef: missing semicolon\n",
NULL, NULL);
@@ -3419,9 +3602,10 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
} else if ((CUR == '&') && (NXT(1) == '#')) {
SKIP(2);
while (CUR != ';') {
- if ((CUR >= '0') && (CUR <= '9'))
- val = val * 10 + (CUR - '0');
- else {
+ if ((CUR >= '0') && (CUR <= '9')) {
+ if (val < 0x110000)
+ val = val * 10 + (CUR - '0');
+ } else {
htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
"htmlParseCharRef: missing semicolon\n",
NULL, NULL);
@@ -3440,6 +3624,9 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
*/
if (IS_CHAR(val)) {
return(val);
+ } else if (val >= 0x110000) {
+ htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
+ "htmlParseCharRef: value too large\n", NULL, NULL);
} else {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"htmlParseCharRef: invalid xmlChar value %d\n",
@@ -3499,9 +3686,12 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
if (CUR != '>') {
htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
"DOCTYPE improperly terminated\n", NULL, NULL);
- /* We shouldn't try to resynchronize ... */
+ /* Ignore bogus content */
+ while ((CUR != 0) && (CUR != '>'))
+ NEXT;
}
- NEXT;
+ if (CUR == '>')
+ NEXT;
/*
* Create or update the document accordingly to the DOCTYPE
@@ -3770,16 +3960,28 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
+ /*
+ * The recovery code is disabled for now as it can result in
+ * quadratic behavior with the push parser. htmlParseStartTag
+ * must consume all content up to the final '>' in order to avoid
+ * rescanning for this terminator.
+ *
+ * For a proper fix in line with HTML5, htmlParseStartTag and
+ * htmlParseElement should only be called when there's an ASCII
+ * alpha character following the initial '<'. Otherwise, the '<'
+ * should be emitted as text (unless followed by '!', '/' or '?').
+ */
+#if 0
/* if recover preserve text on classic misconstructs */
if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
(CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
htmlParseCharDataInternal(ctxt, '<');
return(-1);
}
-
+#endif
/* Dump the bogus tag like browsers do */
- while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
+ while ((CUR != 0) && (CUR != '>') &&
(ctxt->instate != XML_PARSER_EOF))
NEXT;
return -1;
@@ -3835,11 +4037,9 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
* (S Attribute)* S?
*/
SKIP_BLANKS;
- while ((IS_CHAR_CH(CUR)) &&
+ while ((CUR != 0) &&
(CUR != '>') &&
((CUR != '/') || (NXT(1) != '>'))) {
- long cons = ctxt->nbChars;
-
GROW;
attname = htmlParseAttribute(ctxt, &attvalue);
if (attname != NULL) {
@@ -3898,7 +4098,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
xmlFree(attvalue);
/* Dump the bogus attribute string up to the next blank or
* the end of the tag. */
- while ((IS_CHAR_CH(CUR)) &&
+ while ((CUR != 0) &&
!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
((CUR != '/') || (NXT(1) != '>')))
NEXT;
@@ -3906,12 +4106,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
failed:
SKIP_BLANKS;
- if (cons == ctxt->nbChars) {
- htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
- "htmlParseStartTag: problem parsing attributes\n",
- NULL, NULL);
- break;
- }
}
/*
@@ -3979,19 +4173,14 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
* We should definitely be at the ending "S? '>'" part
*/
SKIP_BLANKS;
- if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
+ if (CUR != '>') {
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
"End tag : expected '>'\n", NULL, NULL);
- if (ctxt->recovery) {
- /*
- * We're not at the ending > !!
- * Error, unless in recover mode where we search forwards
- * until we find a >
- */
- while (CUR != '\0' && CUR != '>') NEXT;
- NEXT;
- }
- } else
+ /* Skip to next '>' */
+ while ((CUR != 0) && (CUR != '>'))
+ NEXT;
+ }
+ if (CUR == '>')
NEXT;
/*
@@ -4032,12 +4221,10 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
* With the exception that the autoclose may have popped stuff out
* of the stack.
*/
- if (!xmlStrEqual(name, ctxt->name)) {
- if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
- htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
- "Opening and ending tag mismatch: %s and %s\n",
- name, ctxt->name);
- }
+ if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
+ htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
+ "Opening and ending tag mismatch: %s and %s\n",
+ name, ctxt->name);
}
/*
@@ -4152,8 +4339,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
while (1) {
- long cons = ctxt->nbChars;
-
GROW;
if (ctxt->instate == XML_PARSER_EOF)
@@ -4181,7 +4366,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
/* Dump the bogus tag like browsers do */
- while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+ while ((CUR != 0) && (CUR != '>'))
NEXT;
if (currentNode != NULL)
@@ -4273,15 +4458,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
else {
htmlParseCharData(ctxt);
}
-
- if (cons == ctxt->nbChars) {
- if (ctxt->node != NULL) {
- htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
- "detected an error in element content\n",
- NULL, NULL);
- }
- break;
- }
}
GROW;
}
@@ -4396,7 +4572,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
*/
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
- while (IS_CHAR_CH(CUR)) {
+ while (CUR != 0) {
oldptr = ctxt->input->cur;
htmlParseContent(ctxt);
if (oldptr==ctxt->input->cur) break;
@@ -4413,7 +4589,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
node_info.node = ctxt->node;
xmlParserAddNodeInfo(ctxt, &node_info);
}
- if (!IS_CHAR_CH(CUR)) {
+ if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
}
@@ -4434,7 +4610,7 @@ htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
htmlNodeInfoPop(ctxt);
}
- if (!IS_CHAR_CH(CUR)) {
+ if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
}
}
@@ -4552,8 +4728,6 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
while (1) {
- long cons = ctxt->nbChars;
-
GROW;
if (ctxt->instate == XML_PARSER_EOF)
@@ -4583,7 +4757,7 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
/* Dump the bogus tag like browsers do */
- while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+ while ((CUR != 0) && (CUR != '>'))
NEXT;
htmlParserFinishElementParsing(ctxt);
@@ -4687,15 +4861,6 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
else {
htmlParseCharData(ctxt);
}
-
- if (cons == ctxt->nbChars) {
- if (ctxt->node != NULL) {
- htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
- "detected an error in element content\n",
- NULL, NULL);
- }
- break;
- }
}
GROW;
}
@@ -4959,7 +5124,6 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
ctxt->vctxt.warning = xmlParserValidityWarning;
ctxt->record_info = 0;
ctxt->validate = 0;
- ctxt->nbChars = 0;
ctxt->checkIndex = 0;
ctxt->catalogs = NULL;
xmlInitNodeInfoSeq(&ctxt->node_seq);
@@ -5035,6 +5199,7 @@ htmlCreateMemoryParserCtxt(const char *buffer, int size) {
input = xmlNewInputStream(ctxt);
if (input == NULL) {
+ xmlFreeParserInputBuffer(buf);
xmlFreeParserCtxt(ctxt);
return(NULL);
}
@@ -5119,7 +5284,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
* @first: the first char to lookup
* @next: the next char to lookup or zero
* @third: the next char to lookup or zero
- * @comment: flag to force checking inside comments
+ * @ignoreattrval: skip over attribute values
*
* Try to find if a sequence (first, next, third) or just (first next) or
* (first) is available in the input stream.
@@ -5133,13 +5298,11 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
*/
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
- xmlChar next, xmlChar third, int iscomment,
- int ignoreattrval)
+ xmlChar next, xmlChar third, int ignoreattrval)
{
int base, len;
htmlParserInputPtr in;
const xmlChar *buf;
- int incomment = 0;
int invalue = 0;
char valdellim = 0x0;
@@ -5151,8 +5314,11 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
if (base < 0)
return (-1);
- if (ctxt->checkIndex > base)
+ if (ctxt->checkIndex > base) {
base = ctxt->checkIndex;
+ /* Abuse hasPErefs member to restore current state. */
+ invalue = ctxt->hasPErefs & 1 ? 1 : 0;
+ }
if (in->buf == NULL) {
buf = in->base;
@@ -5168,14 +5334,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
else if (next)
len--;
for (; base < len; base++) {
- if ((!incomment) && (base + 4 < len) && (!iscomment)) {
- if ((buf[base] == '<') && (buf[base + 1] == '!') &&
- (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
- incomment = 1;
- /* do not increment past <! - some people use <!--> */
- base += 2;
- }
- }
if (ignoreattrval) {
if (buf[base] == '"' || buf[base] == '\'') {
if (invalue) {
@@ -5192,16 +5350,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
continue;
}
}
- if (incomment) {
- if (base + 3 > len)
- return (-1);
- if ((buf[base] == '-') && (buf[base + 1] == '-') &&
- (buf[base + 2] == '>')) {
- incomment = 0;
- base += 2;
- }
- continue;
- }
if (buf[base] == first) {
if (third != 0) {
if ((buf[base + 1] != next) || (buf[base + 2] != third))
@@ -5228,8 +5376,12 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
return (base - (in->cur - in->base));
}
}
- if ((!incomment) && (!invalue))
- ctxt->checkIndex = base;
+ ctxt->checkIndex = base;
+ /* Abuse hasPErefs member to track current state. */
+ if (invalue)
+ ctxt->hasPErefs |= 1;
+ else
+ ctxt->hasPErefs &= ~1;
#ifdef DEBUG_PUSH
if (next == 0)
xmlGenericError(xmlGenericErrorContext,
@@ -5246,79 +5398,38 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
}
/**
- * htmlParseLookupChars:
+ * htmlParseLookupCommentEnd:
* @ctxt: an HTML parser context
- * @stop: Array of chars, which stop the lookup.
- * @stopLen: Length of stop-Array
*
- * Try to find if any char of the stop-Array is available in the input
- * stream.
+ * Try to find a comment end tag in the input stream
+ * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
+ * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
* This function has a side effect of (possibly) incrementing ctxt->checkIndex
* to avoid rescanning sequences of bytes, it DOES change the state of the
* parser, do not use liberally.
+ * This wraps to htmlParseLookupSequence()
*
- * Returns the index to the current parsing point if a stopChar
- * is available, -1 otherwise.
+ * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
*/
static int
-htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
- int stopLen)
+htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
{
- int base, len;
- htmlParserInputPtr in;
- const xmlChar *buf;
- int incomment = 0;
- int i;
-
- in = ctxt->input;
- if (in == NULL)
- return (-1);
-
- base = in->cur - in->base;
- if (base < 0)
- return (-1);
-
- if (ctxt->checkIndex > base)
- base = ctxt->checkIndex;
-
- if (in->buf == NULL) {
- buf = in->base;
- len = in->length;
- } else {
- buf = xmlBufContent(in->buf->buffer);
- len = xmlBufUse(in->buf->buffer);
- }
-
- for (; base < len; base++) {
- if (!incomment && (base + 4 < len)) {
- if ((buf[base] == '<') && (buf[base + 1] == '!') &&
- (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
- incomment = 1;
- /* do not increment past <! - some people use <!--> */
- base += 2;
- }
- }
- if (incomment) {
- if (base + 3 > len)
- return (-1);
- if ((buf[base] == '-') && (buf[base + 1] == '-') &&
- (buf[base + 2] == '>')) {
- incomment = 0;
- base += 2;
- }
- continue;
- }
- for (i = 0; i < stopLen; ++i) {
- if (buf[base] == stop[i]) {
- ctxt->checkIndex = 0;
- return (base - (in->cur - in->base));
- }
- }
+ int mark = 0;
+ int cur = CUR_PTR - BASE_PTR;
+
+ while (mark >= 0) {
+ mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
+ if ((mark < 0) ||
+ (NXT(mark+2) == '>') ||
+ ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
+ return mark;
+ }
+ ctxt->checkIndex = cur + mark + 1;
}
- ctxt->checkIndex = base;
- return (-1);
+ return mark;
}
+
/**
* htmlParseTryOrFinish:
* @ctxt: an HTML parser context
@@ -5332,7 +5443,7 @@ static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int ret = 0;
htmlParserInputPtr in;
- int avail = 0;
+ ptrdiff_t avail = 0;
xmlChar cur, next;
htmlParserNodeInfo node_info;
@@ -5397,7 +5508,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
if ((avail == 0) && (terminate)) {
htmlAutoCloseOnEnd(ctxt);
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
@@ -5411,6 +5523,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
}
if (avail < 1)
goto done;
+ /*
+ * This is done to make progress and avoid an infinite loop
+ * if a parsing attempt was aborted by hitting a NUL byte. After
+ * changing htmlCurrentChar, this probably isn't necessary anymore.
+ * We should consider removing this check.
+ */
cur = in->cur[0];
if (cur == 0) {
SKIP(1);
@@ -5433,7 +5551,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
}
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData,
@@ -5450,7 +5569,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5475,14 +5594,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
/*
* no chars in buffer
*/
if (avail < 1)
goto done;
/*
- * not enouth chars in buffer
+ * not enough chars in buffer
*/
if (avail < 2) {
if (!terminate)
@@ -5495,8 +5615,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
cur = in->cur[0];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
- if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5506,7 +5625,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5520,7 +5639,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5536,7 +5655,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(avail < 9)) {
goto done;
} else {
- ctxt->instate = XML_PARSER_START_TAG;
+ ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: entering START_TAG\n");
@@ -5548,15 +5667,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
if (avail < 2)
goto done;
cur = in->cur[0];
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
- if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5566,7 +5685,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5578,7 +5697,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(avail < 4)) {
goto done;
} else {
- ctxt->instate = XML_PARSER_START_TAG;
+ ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: entering START_TAG\n");
@@ -5589,7 +5708,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
if (avail < 1)
goto done;
cur = in->cur[0];
@@ -5602,8 +5722,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
- if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5613,7 +5732,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5648,7 +5767,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if (avail < 1)
goto done;
/*
- * not enouth chars in buffer
+ * not enough chars in buffer
*/
if (avail < 2) {
if (!terminate)
@@ -5677,7 +5796,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
break;
}
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done;
/* Capture start position */
@@ -5769,7 +5888,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
}
case XML_PARSER_CONTENT: {
xmlChar chr[2] = { 0, 0 };
- long cons;
/*
* Handle preparsed entities and charRef
@@ -5814,7 +5932,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
goto done;
cur = in->cur[0];
next = in->cur[1];
- cons = ctxt->nbChars;
if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
/*
@@ -5824,7 +5941,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int idx;
xmlChar val;
- idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
+ idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
if (idx < 0)
goto done;
val = in->cur[idx + 2];
@@ -5851,7 +5968,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done;
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"Misplaced DOCTYPE declaration\n",
@@ -5859,9 +5976,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
htmlParseDocTypeDecl(ctxt);
} else if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
- if ((!terminate) &&
- (htmlParseLookupSequence(
- ctxt, '-', '-', '>', 1, 1) < 0))
+ if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5871,7 +5986,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_CONTENT;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5890,24 +6005,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
#endif
break;
} else if (cur == '<') {
- ctxt->instate = XML_PARSER_START_TAG;
- ctxt->checkIndex = 0;
+ if ((!terminate) && (next == 0))
+ goto done;
+ ctxt->instate = XML_PARSER_START_TAG;
+ ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
- xmlGenericError(xmlGenericErrorContext,
- "HPP: entering START_TAG\n");
+ xmlGenericError(xmlGenericErrorContext,
+ "HPP: entering START_TAG\n");
#endif
break;
- } else if (cur == '&') {
- if ((!terminate) &&
- (htmlParseLookupChars(ctxt,
- BAD_CAST "; >/", 4) < 0))
- goto done;
-#ifdef DEBUG_PUSH
- xmlGenericError(xmlGenericErrorContext,
- "HPP: Parsing Reference\n");
-#endif
- /* TODO: check generation of subtrees if noent !!! */
- htmlParseReference(ctxt);
} else {
/*
* check that the text sequence is complete
@@ -5916,24 +6022,23 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
* data detection.
*/
if ((!terminate) &&
- (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
+ (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
goto done;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: Parsing char data\n");
#endif
- htmlParseCharData(ctxt);
- }
- }
- if (cons == ctxt->nbChars) {
- if (ctxt->node != NULL) {
- htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
- "detected an error in element content\n",
- NULL, NULL);
+ while ((ctxt->instate != XML_PARSER_EOF) &&
+ (cur != '<') && (in->cur < in->end)) {
+ if (cur == '&') {
+ htmlParseReference(ctxt);
+ } else {
+ htmlParseCharData(ctxt);
+ }
+ cur = in->cur[0];
+ }
}
- NEXT;
- break;
}
break;
@@ -5942,7 +6047,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if (avail < 2)
goto done;
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
htmlParseEndTag(ctxt);
if (ctxt->nameNr == 0) {
@@ -6124,12 +6229,12 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
int res;
res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
+ xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
if (res < 0) {
ctxt->errNo = XML_PARSER_EOF;
ctxt->disableSAX = 1;
return (XML_PARSER_EOF);
}
- xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
@@ -6148,12 +6253,12 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
size_t current = ctxt->input->cur - ctxt->input->base;
nbchars = xmlCharEncInput(in, terminate);
+ xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
if (nbchars < 0) {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"encoder error\n", NULL, NULL);
return(XML_ERR_INVALID_ENCODING);
}
- xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
}
}
}
@@ -6671,7 +6776,6 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
ctxt->vctxt.error = xmlParserValidityError;
ctxt->vctxt.warning = xmlParserValidityWarning;
ctxt->record_info = 0;
- ctxt->nbChars = 0;
ctxt->checkIndex = 0;
ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK;
@@ -6890,7 +6994,9 @@ htmlReadMemory(const char *buffer, int size, const char *URL, const char *encodi
* @encoding: the document encoding, or NULL
* @options: a combination of htmlParserOption(s)
*
- * parse an XML from a file descriptor and build a tree.
+ * parse an HTML from a file descriptor and build a tree.
+ * NOTE that the file descriptor will not be closed when the
+ * reader is closed or reset.
*
* Returns the resulting document tree
*/
@@ -6899,17 +7005,17 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options)
{
htmlParserCtxtPtr ctxt;
xmlParserInputBufferPtr input;
- xmlParserInputPtr stream;
+ htmlParserInputPtr stream;
if (fd < 0)
return (NULL);
- xmlInitParser();
xmlInitParser();
input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
if (input == NULL)
return (NULL);
- ctxt = xmlNewParserCtxt();
+ input->closecallback = NULL;
+ ctxt = htmlNewParserCtxt();
if (ctxt == NULL) {
xmlFreeParserInputBuffer(input);
return (NULL);
@@ -6917,7 +7023,7 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options)
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
if (stream == NULL) {
xmlFreeParserInputBuffer(input);
- xmlFreeParserCtxt(ctxt);
+ htmlFreeParserCtxt(ctxt);
return (NULL);
}
inputPush(ctxt, stream);