diff options
author | setser <setser@yandex-team.ru> | 2022-05-09 00:13:37 +0300 |
---|---|---|
committer | setser <setser@yandex-team.ru> | 2022-05-09 00:13:37 +0300 |
commit | e87e3fc8d0e04eb7ba3eee221bb91613b527ad85 (patch) | |
tree | 5279c128bdbdf902b9a08d9fae8e55b91910a553 /contrib/libs/libxml/HTMLparser.c | |
parent | f4f3e4024a1f32bd0bc3fa20239025a1b179e42d (diff) | |
download | ydb-e87e3fc8d0e04eb7ba3eee221bb91613b527ad85.tar.gz |
Update libxml to 2.9.13
ref:f572491d236694e847142c36f0f5546c649e05d7
Diffstat (limited to 'contrib/libs/libxml/HTMLparser.c')
-rw-r--r-- | contrib/libs/libxml/HTMLparser.c | 1208 |
1 files changed, 657 insertions, 551 deletions
diff --git a/contrib/libs/libxml/HTMLparser.c b/contrib/libs/libxml/HTMLparser.c index 7b6d68961c..3e8a165740 100644 --- a/contrib/libs/libxml/HTMLparser.c +++ b/contrib/libs/libxml/HTMLparser.c @@ -69,7 +69,7 @@ static void htmlParseComment(htmlParserCtxtPtr ctxt); /** * htmlErrMemory: * @ctxt: an HTML parser context - * @extra: extra informations + * @extra: extra information * * Handle a redefinition of attribute error */ @@ -296,7 +296,7 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) #define UPPER (toupper(*ctxt->input->cur)) -#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) +#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val) #define NXT(val) ctxt->input->cur[(val)] @@ -330,7 +330,7 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) if (*(ctxt->input->cur) == '\n') { \ ctxt->input->line++; ctxt->input->col = 1; \ } else ctxt->input->col++; \ - ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ + ctxt->token = 0; ctxt->input->cur += l; \ } while (0) /************ @@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) { static int htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { + const unsigned char *cur; + unsigned char c; + unsigned int val; + if (ctxt->instate == XML_PARSER_EOF) return(0); @@ -421,99 +425,29 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { *len = 0; return(ctxt->token); } - if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { - /* - * We are supposed to handle UTF8, check it's valid - * From rfc2044: encoding of the Unicode values on UTF-8: - * - * UCS-4 range (hex.) UTF-8 octet sequence (binary) - * 0000 0000-0000 007F 0xxxxxxx - * 0000 0080-0000 07FF 110xxxxx 10xxxxxx - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx - * - * Check for the 0x110000 limit too - */ - const unsigned char *cur = ctxt->input->cur; - unsigned char c; - unsigned int val; - - c = *cur; - if (c & 0x80) { - if (cur[1] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if ((cur[1] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xe0) == 0xe0) { + if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { + xmlChar * guess; + xmlCharEncodingHandlerPtr handler; - if (cur[2] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if ((cur[2] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xf0) == 0xf0) { - if (cur[3] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if (((c & 0xf8) != 0xf0) || - ((cur[3] & 0xc0) != 0x80)) - goto encoding_error; - /* 4-byte code */ - *len = 4; - val = (cur[0] & 0x7) << 18; - val |= (cur[1] & 0x3f) << 12; - val |= (cur[2] & 0x3f) << 6; - val |= cur[3] & 0x3f; - } else { - /* 3-byte code */ - *len = 3; - val = (cur[0] & 0xf) << 12; - val |= (cur[1] & 0x3f) << 6; - val |= cur[2] & 0x3f; - } - } else { - /* 2-byte code */ - *len = 2; - val = (cur[0] & 0x1f) << 6; - val |= cur[1] & 0x3f; - } - if (!IS_CHAR(val)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", val); - } - return(val); - } else { + /* + * Assume it's a fixed length encoding (1) with + * a compatible encoding for the ASCII set, since + * HTML constructs only use < 128 chars + */ + if ((int) *ctxt->input->cur < 0x80) { + *len = 1; if ((*ctxt->input->cur == 0) && (ctxt->input->cur < ctxt->input->end)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", 0); - *len = 1; + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Char 0x%X out of allowed range\n", 0); return(' '); } - /* 1-byte code */ - *len = 1; - return((int) *ctxt->input->cur); - } - } - /* - * Assume it's a fixed length encoding (1) with - * a compatible encoding for the ASCII set, since - * XML constructs only use < 128 chars - */ - *len = 1; - if ((int) *ctxt->input->cur < 0x80) - return((int) *ctxt->input->cur); - - /* - * Humm this is bad, do an automatic flow conversion - */ - { - xmlChar * guess; - xmlCharEncodingHandlerPtr handler; + return((int) *ctxt->input->cur); + } + /* + * Humm this is bad, do an automatic flow conversion + */ guess = htmlFindEncoding(ctxt); if (guess == NULL) { xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); @@ -523,7 +457,12 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { ctxt->input->encoding = guess; handler = xmlFindCharEncodingHandler((const char *) guess); if (handler != NULL) { - xmlSwitchToEncoding(ctxt, handler); + /* + * Don't use UTF-8 encoder which isn't required and + * can produce invalid UTF-8. + */ + if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8")) + xmlSwitchToEncoding(ctxt, handler); } else { htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, "Unsupported encoding %s", guess, NULL); @@ -532,7 +471,86 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { ctxt->charset = XML_CHAR_ENCODING_UTF8; } - return(xmlCurrentChar(ctxt, len)); + /* + * We are supposed to handle UTF8, check it's valid + * From rfc2044: encoding of the Unicode values on UTF-8: + * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) + * 0000 0000-0000 007F 0xxxxxxx + * 0000 0080-0000 07FF 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + * + * Check for the 0x110000 limit too + */ + cur = ctxt->input->cur; + c = *cur; + if (c & 0x80) { + if ((c & 0x40) == 0) + goto encoding_error; + if (cur[1] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if ((cur[1] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xe0) == 0xe0) { + + if (cur[2] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if ((cur[2] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xf0) == 0xf0) { + if (cur[3] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if (((c & 0xf8) != 0xf0) || + ((cur[3] & 0xc0) != 0x80)) + goto encoding_error; + /* 4-byte code */ + *len = 4; + val = (cur[0] & 0x7) << 18; + val |= (cur[1] & 0x3f) << 12; + val |= (cur[2] & 0x3f) << 6; + val |= cur[3] & 0x3f; + if (val < 0x10000) + goto encoding_error; + } else { + /* 3-byte code */ + *len = 3; + val = (cur[0] & 0xf) << 12; + val |= (cur[1] & 0x3f) << 6; + val |= cur[2] & 0x3f; + if (val < 0x800) + goto encoding_error; + } + } else { + /* 2-byte code */ + *len = 2; + val = (cur[0] & 0x1f) << 6; + val |= cur[1] & 0x3f; + if (val < 0x80) + goto encoding_error; + } + if (!IS_CHAR(val)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Char 0x%X out of allowed range\n", val); + } + return(val); + } else { + if ((*ctxt->input->cur == 0) && + (ctxt->input->cur < ctxt->input->end)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Char 0x%X out of allowed range\n", 0); + *len = 1; + return(' '); + } + /* 1-byte code */ + *len = 1; + return((int) *ctxt->input->cur); + } encoding_error: /* @@ -557,7 +575,16 @@ encoding_error: BAD_CAST buffer, NULL); } - ctxt->charset = XML_CHAR_ENCODING_8859_1; + /* + * Don't switch encodings twice. Note that if there's an encoder, we + * shouldn't receive invalid UTF-8 anyway. + * + * Note that if ctxt->input->buf == NULL, switching encodings is + * impossible, see Gitlab issue #34. + */ + if ((ctxt->input->buf != NULL) && + (ctxt->input->buf->encoder == NULL)) + xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); *len = 1; return((int) *ctxt->input->cur); } @@ -584,7 +611,6 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { ctxt->input->line++; ctxt->input->col = 1; } else ctxt->input->col++; ctxt->input->cur++; - ctxt->nbChars++; if (*ctxt->input->cur == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } @@ -1046,102 +1072,266 @@ html40ElementTable[] = { } }; +typedef struct { + const char *oldTag; + const char *newTag; +} htmlStartCloseEntry; + /* * start tags that imply the end of current element */ -static const char * const htmlStartClose[] = { -"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", - "dl", "ul", "ol", "menu", "dir", "address", "pre", - "listing", "xmp", "head", NULL, -"head", "p", NULL, -"title", "p", NULL, -"body", "head", "style", "link", "title", "p", NULL, -"frameset", "head", "style", "link", "title", "p", NULL, -"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", - "pre", "listing", "xmp", "head", "li", NULL, -"hr", "p", "head", NULL, -"h1", "p", "head", NULL, -"h2", "p", "head", NULL, -"h3", "p", "head", NULL, -"h4", "p", "head", NULL, -"h5", "p", "head", NULL, -"h6", "p", "head", NULL, -"dir", "p", "head", NULL, -"address", "p", "head", "ul", NULL, -"pre", "p", "head", "ul", NULL, -"listing", "p", "head", NULL, -"xmp", "p", "head", NULL, -"blockquote", "p", "head", NULL, -"dl", "p", "dt", "menu", "dir", "address", "pre", "listing", - "xmp", "head", NULL, -"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", - "head", "dd", NULL, -"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", - "head", "dt", NULL, -"ul", "p", "head", "ol", "menu", "dir", "address", "pre", - "listing", "xmp", NULL, -"ol", "p", "head", "ul", NULL, -"menu", "p", "head", "ul", NULL, -"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, -"div", "p", "head", NULL, -"noscript", "script", NULL, -"center", "font", "b", "i", "p", "head", NULL, -"a", "a", "head", NULL, -"caption", "p", NULL, -"colgroup", "caption", "colgroup", "col", "p", NULL, -"col", "caption", "col", "p", NULL, -"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", - "listing", "xmp", "a", NULL, -"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, -"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, -"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, -"thead", "caption", "col", "colgroup", NULL, -"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", - "tbody", "p", NULL, -"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", - "tfoot", "tbody", "p", NULL, -"optgroup", "option", NULL, -"option", "option", NULL, -"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", - "pre", "listing", "xmp", "a", NULL, -/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */ -"tt", "head", NULL, -"i", "head", NULL, -"b", "head", NULL, -"u", "head", NULL, -"s", "head", NULL, -"strike", "head", NULL, -"big", "head", NULL, -"small", "head", NULL, - -"em", "head", NULL, -"strong", "head", NULL, -"dfn", "head", NULL, -"code", "head", NULL, -"samp", "head", NULL, -"kbd", "head", NULL, -"var", "head", NULL, -"cite", "head", NULL, -"abbr", "head", NULL, -"acronym", "head", NULL, - -/* "a" */ -"img", "head", NULL, -/* "applet" */ -/* "embed" */ -/* "object" */ -"font", "head", NULL, -/* "basefont" */ -"br", "head", NULL, -/* "script" */ -"map", "head", NULL, -"q", "head", NULL, -"sub", "head", NULL, -"sup", "head", NULL, -"span", "head", NULL, -"bdo", "head", NULL, -"iframe", "head", NULL, -NULL +static const htmlStartCloseEntry htmlStartClose[] = { + { "a", "a" }, + { "a", "fieldset" }, + { "a", "table" }, + { "a", "td" }, + { "a", "th" }, + { "address", "dd" }, + { "address", "dl" }, + { "address", "dt" }, + { "address", "form" }, + { "address", "li" }, + { "address", "ul" }, + { "b", "center" }, + { "b", "p" }, + { "b", "td" }, + { "b", "th" }, + { "big", "p" }, + { "caption", "col" }, + { "caption", "colgroup" }, + { "caption", "tbody" }, + { "caption", "tfoot" }, + { "caption", "thead" }, + { "caption", "tr" }, + { "col", "col" }, + { "col", "colgroup" }, + { "col", "tbody" }, + { "col", "tfoot" }, + { "col", "thead" }, + { "col", "tr" }, + { "colgroup", "colgroup" }, + { "colgroup", "tbody" }, + { "colgroup", "tfoot" }, + { "colgroup", "thead" }, + { "colgroup", "tr" }, + { "dd", "dt" }, + { "dir", "dd" }, + { "dir", "dl" }, + { "dir", "dt" }, + { "dir", "form" }, + { "dir", "ul" }, + { "dl", "form" }, + { "dl", "li" }, + { "dt", "dd" }, + { "dt", "dl" }, + { "font", "center" }, + { "font", "td" }, + { "font", "th" }, + { "form", "form" }, + { "h1", "fieldset" }, + { "h1", "form" }, + { "h1", "li" }, + { "h1", "p" }, + { "h1", "table" }, + { "h2", "fieldset" }, + { "h2", "form" }, + { "h2", "li" }, + { "h2", "p" }, + { "h2", "table" }, + { "h3", "fieldset" }, + { "h3", "form" }, + { "h3", "li" }, + { "h3", "p" }, + { "h3", "table" }, + { "h4", "fieldset" }, + { "h4", "form" }, + { "h4", "li" }, + { "h4", "p" }, + { "h4", "table" }, + { "h5", "fieldset" }, + { "h5", "form" }, + { "h5", "li" }, + { "h5", "p" }, + { "h5", "table" }, + { "h6", "fieldset" }, + { "h6", "form" }, + { "h6", "li" }, + { "h6", "p" }, + { "h6", "table" }, + { "head", "a" }, + { "head", "abbr" }, + { "head", "acronym" }, + { "head", "address" }, + { "head", "b" }, + { "head", "bdo" }, + { "head", "big" }, + { "head", "blockquote" }, + { "head", "body" }, + { "head", "br" }, + { "head", "center" }, + { "head", "cite" }, + { "head", "code" }, + { "head", "dd" }, + { "head", "dfn" }, + { "head", "dir" }, + { "head", "div" }, + { "head", "dl" }, + { "head", "dt" }, + { "head", "em" }, + { "head", "fieldset" }, + { "head", "font" }, + { "head", "form" }, + { "head", "frameset" }, + { "head", "h1" }, + { "head", "h2" }, + { "head", "h3" }, + { "head", "h4" }, + { "head", "h5" }, + { "head", "h6" }, + { "head", "hr" }, + { "head", "i" }, + { "head", "iframe" }, + { "head", "img" }, + { "head", "kbd" }, + { "head", "li" }, + { "head", "listing" }, + { "head", "map" }, + { "head", "menu" }, + { "head", "ol" }, + { "head", "p" }, + { "head", "pre" }, + { "head", "q" }, + { "head", "s" }, + { "head", "samp" }, + { "head", "small" }, + { "head", "span" }, + { "head", "strike" }, + { "head", "strong" }, + { "head", "sub" }, + { "head", "sup" }, + { "head", "table" }, + { "head", "tt" }, + { "head", "u" }, + { "head", "ul" }, + { "head", "var" }, + { "head", "xmp" }, + { "hr", "form" }, + { "i", "center" }, + { "i", "p" }, + { "i", "td" }, + { "i", "th" }, + { "legend", "fieldset" }, + { "li", "li" }, + { "link", "body" }, + { "link", "frameset" }, + { "listing", "dd" }, + { "listing", "dl" }, + { "listing", "dt" }, + { "listing", "fieldset" }, + { "listing", "form" }, + { "listing", "li" }, + { "listing", "table" }, + { "listing", "ul" }, + { "menu", "dd" }, + { "menu", "dl" }, + { "menu", "dt" }, + { "menu", "form" }, + { "menu", "ul" }, + { "ol", "form" }, + { "ol", "ul" }, + { "option", "optgroup" }, + { "option", "option" }, + { "p", "address" }, + { "p", "blockquote" }, + { "p", "body" }, + { "p", "caption" }, + { "p", "center" }, + { "p", "col" }, + { "p", "colgroup" }, + { "p", "dd" }, + { "p", "dir" }, + { "p", "div" }, + { "p", "dl" }, + { "p", "dt" }, + { "p", "fieldset" }, + { "p", "form" }, + { "p", "frameset" }, + { "p", "h1" }, + { "p", "h2" }, + { "p", "h3" }, + { "p", "h4" }, + { "p", "h5" }, + { "p", "h6" }, + { "p", "head" }, + { "p", "hr" }, + { "p", "li" }, + { "p", "listing" }, + { "p", "menu" }, + { "p", "ol" }, + { "p", "p" }, + { "p", "pre" }, + { "p", "table" }, + { "p", "tbody" }, + { "p", "td" }, + { "p", "tfoot" }, + { "p", "th" }, + { "p", "title" }, + { "p", "tr" }, + { "p", "ul" }, + { "p", "xmp" }, + { "pre", "dd" }, + { "pre", "dl" }, + { "pre", "dt" }, + { "pre", "fieldset" }, + { "pre", "form" }, + { "pre", "li" }, + { "pre", "table" }, + { "pre", "ul" }, + { "s", "p" }, + { "script", "noscript" }, + { "small", "p" }, + { "span", "td" }, + { "span", "th" }, + { "strike", "p" }, + { "style", "body" }, + { "style", "frameset" }, + { "tbody", "tbody" }, + { "tbody", "tfoot" }, + { "td", "tbody" }, + { "td", "td" }, + { "td", "tfoot" }, + { "td", "th" }, + { "td", "tr" }, + { "tfoot", "tbody" }, + { "th", "tbody" }, + { "th", "td" }, + { "th", "tfoot" }, + { "th", "th" }, + { "th", "tr" }, + { "thead", "tbody" }, + { "thead", "tfoot" }, + { "title", "body" }, + { "title", "frameset" }, + { "tr", "tbody" }, + { "tr", "tfoot" }, + { "tr", "tr" }, + { "tt", "p" }, + { "u", "p" }, + { "u", "td" }, + { "u", "th" }, + { "ul", "address" }, + { "ul", "form" }, + { "ul", "menu" }, + { "ul", "ol" }, + { "ul", "pre" }, + { "xmp", "dd" }, + { "xmp", "dl" }, + { "xmp", "dt" }, + { "xmp", "fieldset" }, + { "xmp", "form" }, + { "xmp", "li" }, + { "xmp", "table" }, + { "xmp", "ul" } }; /* @@ -1211,9 +1401,6 @@ static const elementPriority htmlEndPriority[] = { {NULL, 100} /* Default priority */ }; -static const char** htmlStartCloseIndex[100]; -static int htmlStartCloseIndexinitialized = 0; - /************************************************************************ * * * functions to handle HTML specific data * @@ -1223,24 +1410,18 @@ static int htmlStartCloseIndexinitialized = 0; /** * htmlInitAutoClose: * - * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. - * This is not reentrant. Call xmlInitParser() once before processing in - * case of use in multithreaded programs. + * This is a no-op now. */ void htmlInitAutoClose(void) { - int indx, i = 0; +} - if (htmlStartCloseIndexinitialized) return; +static int +htmlCompareTags(const void *key, const void *member) { + const xmlChar *tag = (const xmlChar *) key; + const htmlElemDesc *desc = (const htmlElemDesc *) member; - for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; - indx = 0; - while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { - htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; - while (htmlStartClose[i] != NULL) i++; - i++; - } - htmlStartCloseIndexinitialized = 1; + return(xmlStrcasecmp(tag, BAD_CAST desc->name)); } /** @@ -1253,14 +1434,12 @@ htmlInitAutoClose(void) { */ const htmlElemDesc * htmlTagLookup(const xmlChar *tag) { - unsigned int i; + if (tag == NULL) + return(NULL); - for (i = 0; i < (sizeof(html40ElementTable) / - sizeof(html40ElementTable[0]));i++) { - if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) - return((htmlElemDescPtr) &html40ElementTable[i]); - } - return(NULL); + return((const htmlElemDesc *) bsearch(tag, html40ElementTable, + sizeof(html40ElementTable) / sizeof(htmlElemDesc), + sizeof(htmlElemDesc), htmlCompareTags)); } /** @@ -1281,6 +1460,19 @@ htmlGetEndPriority (const xmlChar *name) { } +static int +htmlCompareStartClose(const void *vkey, const void *member) { + const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey; + const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member; + int ret; + + ret = strcmp(key->oldTag, entry->oldTag); + if (ret == 0) + ret = strcmp(key->newTag, entry->newTag); + + return(ret); +} + /** * htmlCheckAutoClose: * @newtag: The new tag name @@ -1288,37 +1480,21 @@ htmlGetEndPriority (const xmlChar *name) { * * Checks whether the new tag is one of the registered valid tags for * closing old. - * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. * * Returns 0 if no, 1 if yes. */ static int htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) { - int i, indx; - const char **closed = NULL; - - if (htmlStartCloseIndexinitialized == 0) - htmlInitAutoClose(); - - /* inefficient, but not a big deal */ - for (indx = 0; indx < 100; indx++) { - closed = htmlStartCloseIndex[indx]; - if (closed == NULL) - return (0); - if (xmlStrEqual(BAD_CAST * closed, newtag)) - break; - } - - i = closed - htmlStartClose; - i++; - while (htmlStartClose[i] != NULL) { - if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { - return (1); - } - i++; - } - return (0); + htmlStartCloseEntry key; + void *res; + + key.oldTag = (const char *) oldtag; + key.newTag = (const char *) newtag; + res = bsearch(&key, htmlStartClose, + sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry), + sizeof(htmlStartCloseEntry), htmlCompareStartClose); + return(res != NULL); } /** @@ -2341,6 +2517,8 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { if ((ExternalID != NULL) || (URI != NULL)) xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); + if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue)) + xmlRegisterNodeDefaultValue((xmlNodePtr)cur); return(cur); } @@ -2482,7 +2660,6 @@ htmlParseName(htmlParserCtxtPtr ctxt) { count = in - ctxt->input->cur; ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); ctxt->input->cur = in; - ctxt->nbChars += count; ctxt->input->col += count; return(ret); } @@ -2789,47 +2966,39 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) { static xmlChar * htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { size_t len = 0, startPosition = 0; + int err = 0; + int quote; xmlChar *ret = NULL; - if (CUR == '"') { - NEXT; + if ((CUR != '"') && (CUR != '\'')) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, + "SystemLiteral \" or ' expected\n", NULL, NULL); + return(NULL); + } + quote = CUR; + NEXT; - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; + if (CUR_PTR < BASE_PTR) + return(ret); + startPosition = CUR_PTR - BASE_PTR; - while ((IS_CHAR_CH(CUR)) && (CUR != '"')) { - NEXT; - len++; - } - if (!IS_CHAR_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished SystemLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR+startPosition), len); - NEXT; + while ((CUR != 0) && (CUR != quote)) { + /* TODO: Handle UTF-8 */ + if (!IS_CHAR_CH(CUR)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in SystemLiteral 0x%X\n", CUR); + err = 1; } - } else if (CUR == '\'') { NEXT; - - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) { - NEXT; - len++; - } - if (!IS_CHAR_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished SystemLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR+startPosition), len); - NEXT; - } + len++; + } + if (CUR != quote) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, + "Unfinished SystemLiteral\n", NULL, NULL); } else { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - " or ' expected\n", NULL, NULL); + NEXT; + if (err == 0) + ret = xmlStrndup((BASE_PTR+startPosition), len); } return(ret); @@ -2849,51 +3018,42 @@ htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { static xmlChar * htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { size_t len = 0, startPosition = 0; + int err = 0; + int quote; xmlChar *ret = NULL; + + if ((CUR != '"') && (CUR != '\'')) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, + "PubidLiteral \" or ' expected\n", NULL, NULL); + return(NULL); + } + quote = CUR; + NEXT; + /* * Name ::= (Letter | '_') (NameChar)* */ - if (CUR == '"') { - NEXT; - - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while (IS_PUBIDCHAR_CH(CUR)) { - len++; - NEXT; + if (CUR_PTR < BASE_PTR) + return(ret); + startPosition = CUR_PTR - BASE_PTR; + + while ((CUR != 0) && (CUR != quote)) { + if (!IS_PUBIDCHAR_CH(CUR)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in PubidLiteral 0x%X\n", CUR); + err = 1; } - - if (CUR != '"') { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished PubidLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR + startPosition), len); - NEXT; - } - } else if (CUR == '\'') { + len++; NEXT; + } - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){ - len++; - NEXT; - } - - if (CUR != '\'') { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished PubidLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR + startPosition), len); - NEXT; - } + if (CUR != quote) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, + "Unfinished PubidLiteral\n", NULL, NULL); } else { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - "PubidLiteral \" or ' expected\n", NULL, NULL); + NEXT; + if (err == 0) + ret = xmlStrndup((BASE_PTR + startPosition), len); } return(ret); @@ -2928,7 +3088,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { SHRINK; cur = CUR_CHAR(l); - while (IS_CHAR_CH(cur)) { + while (cur != 0) { if ((cur == '<') && (NXT(1) == '/')) { /* * One should break here, the specification is clear: @@ -2959,7 +3119,12 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { } } } - COPY_BUF(l,buf,nbchar,cur); + if (IS_CHAR(cur)) { + COPY_BUF(l,buf,nbchar,cur); + } else { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in CDATA 0x%X\n", cur); + } if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { buf[nbchar] = 0; if (ctxt->sax->cdataBlock!= NULL) { @@ -2977,14 +3142,6 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { cur = CUR_CHAR(l); } - if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in CDATA 0x%X\n", cur); - if (ctxt->input->cur < ctxt->input->end) { - NEXT; - } - } - if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { buf[nbchar] = 0; if (ctxt->sax->cdataBlock!= NULL) { @@ -3232,7 +3389,7 @@ htmlParsePI(htmlParserCtxtPtr ctxt) { } SKIP_BLANKS; cur = CUR_CHAR(l); - while (IS_CHAR(cur) && (cur != '>')) { + while ((cur != 0) && (cur != '>')) { if (len + 5 >= size) { xmlChar *tmp; @@ -3251,7 +3408,13 @@ htmlParsePI(htmlParserCtxtPtr ctxt) { GROW; count = 0; } - COPY_BUF(l,buf,len,cur); + if (IS_CHAR(cur)) { + COPY_BUF(l,buf,len,cur); + } else { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in processing instruction " + "0x%X\n", cur); + } NEXTL(l); cur = CUR_CHAR(l); if (cur == 0) { @@ -3300,6 +3463,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { int q, ql; int r, rl; int cur, l; + int next, nl; xmlParserInputState state; /* @@ -3321,17 +3485,32 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { len = 0; buf[len] = 0; q = CUR_CHAR(ql); - if (!IS_CHAR(q)) + if (q == 0) goto unfinished; NEXTL(ql); r = CUR_CHAR(rl); - if (!IS_CHAR(r)) + if (r == 0) goto unfinished; NEXTL(rl); cur = CUR_CHAR(l); - while (IS_CHAR(cur) && + while ((cur != 0) && ((cur != '>') || (r != '-') || (q != '-'))) { + NEXTL(l); + next = CUR_CHAR(nl); + if (next == 0) { + SHRINK; + GROW; + next = CUR_CHAR(nl); + } + + if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) { + htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, + "Comment incorrectly closed by '--!>'", NULL, NULL); + cur = '>'; + break; + } + if (len + 5 >= size) { xmlChar *tmp; @@ -3345,21 +3524,22 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { } buf = tmp; } - COPY_BUF(ql,buf,len,q); + if (IS_CHAR(q)) { + COPY_BUF(ql,buf,len,q); + } else { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in comment 0x%X\n", q); + } + q = r; ql = rl; r = cur; rl = l; - NEXTL(l); - cur = CUR_CHAR(l); - if (cur == 0) { - SHRINK; - GROW; - cur = CUR_CHAR(l); - } + cur = next; + l = nl; } buf[len] = 0; - if (IS_CHAR(cur)) { + if (cur == '>') { NEXT; if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && (!ctxt->disableSAX)) @@ -3400,13 +3580,16 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { ((NXT(2) == 'x') || NXT(2) == 'X')) { SKIP(3); while (CUR != ';') { - if ((CUR >= '0') && (CUR <= '9')) - val = val * 16 + (CUR - '0'); - else if ((CUR >= 'a') && (CUR <= 'f')) - val = val * 16 + (CUR - 'a') + 10; - else if ((CUR >= 'A') && (CUR <= 'F')) - val = val * 16 + (CUR - 'A') + 10; - else { + if ((CUR >= '0') && (CUR <= '9')) { + if (val < 0x110000) + val = val * 16 + (CUR - '0'); + } else if ((CUR >= 'a') && (CUR <= 'f')) { + if (val < 0x110000) + val = val * 16 + (CUR - 'a') + 10; + } else if ((CUR >= 'A') && (CUR <= 'F')) { + if (val < 0x110000) + val = val * 16 + (CUR - 'A') + 10; + } else { htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, "htmlParseCharRef: missing semicolon\n", NULL, NULL); @@ -3419,9 +3602,10 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { } else if ((CUR == '&') && (NXT(1) == '#')) { SKIP(2); while (CUR != ';') { - if ((CUR >= '0') && (CUR <= '9')) - val = val * 10 + (CUR - '0'); - else { + if ((CUR >= '0') && (CUR <= '9')) { + if (val < 0x110000) + val = val * 10 + (CUR - '0'); + } else { htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, "htmlParseCharRef: missing semicolon\n", NULL, NULL); @@ -3440,6 +3624,9 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { */ if (IS_CHAR(val)) { return(val); + } else if (val >= 0x110000) { + htmlParseErr(ctxt, XML_ERR_INVALID_CHAR, + "htmlParseCharRef: value too large\n", NULL, NULL); } else { htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, "htmlParseCharRef: invalid xmlChar value %d\n", @@ -3499,9 +3686,12 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { if (CUR != '>') { htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, "DOCTYPE improperly terminated\n", NULL, NULL); - /* We shouldn't try to resynchronize ... */ + /* Ignore bogus content */ + while ((CUR != 0) && (CUR != '>')) + NEXT; } - NEXT; + if (CUR == '>') + NEXT; /* * Create or update the document accordingly to the DOCTYPE @@ -3770,16 +3960,28 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, "htmlParseStartTag: invalid element name\n", NULL, NULL); + /* + * The recovery code is disabled for now as it can result in + * quadratic behavior with the push parser. htmlParseStartTag + * must consume all content up to the final '>' in order to avoid + * rescanning for this terminator. + * + * For a proper fix in line with HTML5, htmlParseStartTag and + * htmlParseElement should only be called when there's an ASCII + * alpha character following the initial '<'. Otherwise, the '<' + * should be emitted as text (unless followed by '!', '/' or '?'). + */ +#if 0 /* if recover preserve text on classic misconstructs */ if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') || (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) { htmlParseCharDataInternal(ctxt, '<'); return(-1); } - +#endif /* Dump the bogus tag like browsers do */ - while ((IS_CHAR_CH(CUR)) && (CUR != '>') && + while ((CUR != 0) && (CUR != '>') && (ctxt->instate != XML_PARSER_EOF)) NEXT; return -1; @@ -3835,11 +4037,9 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { * (S Attribute)* S? */ SKIP_BLANKS; - while ((IS_CHAR_CH(CUR)) && + while ((CUR != 0) && (CUR != '>') && ((CUR != '/') || (NXT(1) != '>'))) { - long cons = ctxt->nbChars; - GROW; attname = htmlParseAttribute(ctxt, &attvalue); if (attname != NULL) { @@ -3898,7 +4098,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { xmlFree(attvalue); /* Dump the bogus attribute string up to the next blank or * the end of the tag. */ - while ((IS_CHAR_CH(CUR)) && + while ((CUR != 0) && !(IS_BLANK_CH(CUR)) && (CUR != '>') && ((CUR != '/') || (NXT(1) != '>'))) NEXT; @@ -3906,12 +4106,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { failed: SKIP_BLANKS; - if (cons == ctxt->nbChars) { - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "htmlParseStartTag: problem parsing attributes\n", - NULL, NULL); - break; - } } /* @@ -3979,19 +4173,14 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) * We should definitely be at the ending "S? '>'" part */ SKIP_BLANKS; - if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { + if (CUR != '>') { htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, "End tag : expected '>'\n", NULL, NULL); - if (ctxt->recovery) { - /* - * We're not at the ending > !! - * Error, unless in recover mode where we search forwards - * until we find a > - */ - while (CUR != '\0' && CUR != '>') NEXT; - NEXT; - } - } else + /* Skip to next '>' */ + while ((CUR != 0) && (CUR != '>')) + NEXT; + } + if (CUR == '>') NEXT; /* @@ -4032,12 +4221,10 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) * With the exception that the autoclose may have popped stuff out * of the stack. */ - if (!xmlStrEqual(name, ctxt->name)) { - if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { - htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, - "Opening and ending tag mismatch: %s and %s\n", - name, ctxt->name); - } + if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { + htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, + "Opening and ending tag mismatch: %s and %s\n", + name, ctxt->name); } /* @@ -4152,8 +4339,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; while (1) { - long cons = ctxt->nbChars; - GROW; if (ctxt->instate == XML_PARSER_EOF) @@ -4181,7 +4366,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { "htmlParseStartTag: invalid element name\n", NULL, NULL); /* Dump the bogus tag like browsers do */ - while ((IS_CHAR_CH(CUR)) && (CUR != '>')) + while ((CUR != 0) && (CUR != '>')) NEXT; if (currentNode != NULL) @@ -4273,15 +4458,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { else { htmlParseCharData(ctxt); } - - if (cons == ctxt->nbChars) { - if (ctxt->node != NULL) { - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "detected an error in element content\n", - NULL, NULL); - } - break; - } } GROW; } @@ -4396,7 +4572,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { */ currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; - while (IS_CHAR_CH(CUR)) { + while (CUR != 0) { oldptr = ctxt->input->cur; htmlParseContent(ctxt); if (oldptr==ctxt->input->cur) break; @@ -4413,7 +4589,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { node_info.node = ctxt->node; xmlParserAddNodeInfo(ctxt, &node_info); } - if (!IS_CHAR_CH(CUR)) { + if (CUR == 0) { htmlAutoCloseOnEnd(ctxt); } @@ -4434,7 +4610,7 @@ htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); htmlNodeInfoPop(ctxt); } - if (!IS_CHAR_CH(CUR)) { + if (CUR == 0) { htmlAutoCloseOnEnd(ctxt); } } @@ -4552,8 +4728,6 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) { currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; while (1) { - long cons = ctxt->nbChars; - GROW; if (ctxt->instate == XML_PARSER_EOF) @@ -4583,7 +4757,7 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) { "htmlParseStartTag: invalid element name\n", NULL, NULL); /* Dump the bogus tag like browsers do */ - while ((IS_CHAR_CH(CUR)) && (CUR != '>')) + while ((CUR == 0) && (CUR != '>')) NEXT; htmlParserFinishElementParsing(ctxt); @@ -4687,15 +4861,6 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) { else { htmlParseCharData(ctxt); } - - if (cons == ctxt->nbChars) { - if (ctxt->node != NULL) { - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "detected an error in element content\n", - NULL, NULL); - } - break; - } } GROW; } @@ -4959,7 +5124,6 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt) ctxt->vctxt.warning = xmlParserValidityWarning; ctxt->record_info = 0; ctxt->validate = 0; - ctxt->nbChars = 0; ctxt->checkIndex = 0; ctxt->catalogs = NULL; xmlInitNodeInfoSeq(&ctxt->node_seq); @@ -5035,6 +5199,7 @@ htmlCreateMemoryParserCtxt(const char *buffer, int size) { input = xmlNewInputStream(ctxt); if (input == NULL) { + xmlFreeParserInputBuffer(buf); xmlFreeParserCtxt(ctxt); return(NULL); } @@ -5119,7 +5284,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { * @first: the first char to lookup * @next: the next char to lookup or zero * @third: the next char to lookup or zero - * @comment: flag to force checking inside comments + * @ignoreattrval: skip over attribute values * * Try to find if a sequence (first, next, third) or just (first next) or * (first) is available in the input stream. @@ -5133,13 +5298,11 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { */ static int htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, - xmlChar next, xmlChar third, int iscomment, - int ignoreattrval) + xmlChar next, xmlChar third, int ignoreattrval) { int base, len; htmlParserInputPtr in; const xmlChar *buf; - int incomment = 0; int invalue = 0; char valdellim = 0x0; @@ -5151,8 +5314,11 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, if (base < 0) return (-1); - if (ctxt->checkIndex > base) + if (ctxt->checkIndex > base) { base = ctxt->checkIndex; + /* Abuse hasPErefs member to restore current state. */ + invalue = ctxt->hasPErefs & 1 ? 1 : 0; + } if (in->buf == NULL) { buf = in->base; @@ -5168,14 +5334,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, else if (next) len--; for (; base < len; base++) { - if ((!incomment) && (base + 4 < len) && (!iscomment)) { - if ((buf[base] == '<') && (buf[base + 1] == '!') && - (buf[base + 2] == '-') && (buf[base + 3] == '-')) { - incomment = 1; - /* do not increment past <! - some people use <!--> */ - base += 2; - } - } if (ignoreattrval) { if (buf[base] == '"' || buf[base] == '\'') { if (invalue) { @@ -5192,16 +5350,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, continue; } } - if (incomment) { - if (base + 3 > len) - return (-1); - if ((buf[base] == '-') && (buf[base + 1] == '-') && - (buf[base + 2] == '>')) { - incomment = 0; - base += 2; - } - continue; - } if (buf[base] == first) { if (third != 0) { if ((buf[base + 1] != next) || (buf[base + 2] != third)) @@ -5228,8 +5376,12 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, return (base - (in->cur - in->base)); } } - if ((!incomment) && (!invalue)) - ctxt->checkIndex = base; + ctxt->checkIndex = base; + /* Abuse hasPErefs member to track current state. */ + if (invalue) + ctxt->hasPErefs |= 1; + else + ctxt->hasPErefs &= ~1; #ifdef DEBUG_PUSH if (next == 0) xmlGenericError(xmlGenericErrorContext, @@ -5246,79 +5398,38 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, } /** - * htmlParseLookupChars: + * htmlParseLookupCommentEnd: * @ctxt: an HTML parser context - * @stop: Array of chars, which stop the lookup. - * @stopLen: Length of stop-Array * - * Try to find if any char of the stop-Array is available in the input - * stream. + * Try to find a comment end tag in the input stream + * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags. + * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment) * This function has a side effect of (possibly) incrementing ctxt->checkIndex * to avoid rescanning sequences of bytes, it DOES change the state of the * parser, do not use liberally. + * This wraps to htmlParseLookupSequence() * - * Returns the index to the current parsing point if a stopChar - * is available, -1 otherwise. + * Returns the index to the current parsing point if the full sequence is available, -1 otherwise. */ static int -htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, - int stopLen) +htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt) { - int base, len; - htmlParserInputPtr in; - const xmlChar *buf; - int incomment = 0; - int i; - - in = ctxt->input; - if (in == NULL) - return (-1); - - base = in->cur - in->base; - if (base < 0) - return (-1); - - if (ctxt->checkIndex > base) - base = ctxt->checkIndex; - - if (in->buf == NULL) { - buf = in->base; - len = in->length; - } else { - buf = xmlBufContent(in->buf->buffer); - len = xmlBufUse(in->buf->buffer); - } - - for (; base < len; base++) { - if (!incomment && (base + 4 < len)) { - if ((buf[base] == '<') && (buf[base + 1] == '!') && - (buf[base + 2] == '-') && (buf[base + 3] == '-')) { - incomment = 1; - /* do not increment past <! - some people use <!--> */ - base += 2; - } - } - if (incomment) { - if (base + 3 > len) - return (-1); - if ((buf[base] == '-') && (buf[base + 1] == '-') && - (buf[base + 2] == '>')) { - incomment = 0; - base += 2; - } - continue; - } - for (i = 0; i < stopLen; ++i) { - if (buf[base] == stop[i]) { - ctxt->checkIndex = 0; - return (base - (in->cur - in->base)); - } - } + int mark = 0; + int cur = CUR_PTR - BASE_PTR; + + while (mark >= 0) { + mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0); + if ((mark < 0) || + (NXT(mark+2) == '>') || + ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) { + return mark; + } + ctxt->checkIndex = cur + mark + 1; } - ctxt->checkIndex = base; - return (-1); + return mark; } + /** * htmlParseTryOrFinish: * @ctxt: an HTML parser context @@ -5332,7 +5443,7 @@ static int htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { int ret = 0; htmlParserInputPtr in; - int avail = 0; + ptrdiff_t avail = 0; xmlChar cur, next; htmlParserNodeInfo node_info; @@ -5397,7 +5508,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); if ((avail == 0) && (terminate)) { htmlAutoCloseOnEnd(ctxt); if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { @@ -5411,6 +5523,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { } if (avail < 1) goto done; + /* + * This is done to make progress and avoid an infinite loop + * if a parsing attempt was aborted by hitting a NUL byte. After + * changing htmlCurrentChar, this probably isn't necessary anymore. + * We should consider removing this check. + */ cur = in->cur[0]; if (cur == 0) { SKIP(1); @@ -5433,7 +5551,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); } if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, @@ -5450,7 +5569,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(8) == 'E')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5475,14 +5594,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); /* * no chars in buffer */ if (avail < 1) goto done; /* - * not enouth chars in buffer + * not enough chars in buffer */ if (avail < 2) { if (!terminate) @@ -5495,8 +5615,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { cur = in->cur[0]; if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) + if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5506,7 +5625,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->instate = XML_PARSER_MISC; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5520,7 +5639,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(8) == 'E')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5536,7 +5655,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (avail < 9)) { goto done; } else { - ctxt->instate = XML_PARSER_START_TAG; + ctxt->instate = XML_PARSER_CONTENT; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: entering START_TAG\n"); @@ -5548,15 +5667,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); if (avail < 2) goto done; cur = in->cur[0]; next = in->cur[1]; if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) + if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5566,7 +5685,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->instate = XML_PARSER_PROLOG; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5578,7 +5697,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (avail < 4)) { goto done; } else { - ctxt->instate = XML_PARSER_START_TAG; + ctxt->instate = XML_PARSER_CONTENT; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: entering START_TAG\n"); @@ -5589,7 +5708,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); if (avail < 1) goto done; cur = in->cur[0]; @@ -5602,8 +5722,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { next = in->cur[1]; if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) + if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5613,7 +5732,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->instate = XML_PARSER_EPILOG; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5648,7 +5767,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (avail < 1) goto done; /* - * not enouth chars in buffer + * not enough chars in buffer */ if (avail < 2) { if (!terminate) @@ -5677,7 +5796,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { break; } if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) goto done; /* Capture start position */ @@ -5769,7 +5888,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { } case XML_PARSER_CONTENT: { xmlChar chr[2] = { 0, 0 }; - long cons; /* * Handle preparsed entities and charRef @@ -5814,7 +5932,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { goto done; cur = in->cur[0]; next = in->cur[1]; - cons = ctxt->nbChars; if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { /* @@ -5824,7 +5941,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { int idx; xmlChar val; - idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); + idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0); if (idx < 0) goto done; val = in->cur[idx + 2]; @@ -5851,7 +5968,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(8) == 'E')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) goto done; htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, "Misplaced DOCTYPE declaration\n", @@ -5859,9 +5976,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { htmlParseDocTypeDecl(ctxt); } else if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && - (htmlParseLookupSequence( - ctxt, '-', '-', '>', 1, 1) < 0)) + if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5871,7 +5986,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->instate = XML_PARSER_CONTENT; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5890,24 +6005,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { #endif break; } else if (cur == '<') { - ctxt->instate = XML_PARSER_START_TAG; - ctxt->checkIndex = 0; + if ((!terminate) && (next == 0)) + goto done; + ctxt->instate = XML_PARSER_START_TAG; + ctxt->checkIndex = 0; #ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering START_TAG\n"); + xmlGenericError(xmlGenericErrorContext, + "HPP: entering START_TAG\n"); #endif break; - } else if (cur == '&') { - if ((!terminate) && - (htmlParseLookupChars(ctxt, - BAD_CAST "; >/", 4) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing Reference\n"); -#endif - /* TODO: check generation of subtrees if noent !!! */ - htmlParseReference(ctxt); } else { /* * check that the text sequence is complete @@ -5916,24 +6022,23 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { * data detection. */ if ((!terminate) && - (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) + (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0)) goto done; ctxt->checkIndex = 0; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: Parsing char data\n"); #endif - htmlParseCharData(ctxt); - } - } - if (cons == ctxt->nbChars) { - if (ctxt->node != NULL) { - htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "detected an error in element content\n", - NULL, NULL); + while ((ctxt->instate != XML_PARSER_EOF) && + (cur != '<') && (in->cur < in->end)) { + if (cur == '&') { + htmlParseReference(ctxt); + } else { + htmlParseCharData(ctxt); + } + cur = in->cur[0]; + } } - NEXT; - break; } break; @@ -5942,7 +6047,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (avail < 2) goto done; if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; htmlParseEndTag(ctxt); if (ctxt->nameNr == 0) { @@ -6124,12 +6229,12 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, int res; res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); + xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); if (res < 0) { ctxt->errNo = XML_PARSER_EOF; ctxt->disableSAX = 1; return (XML_PARSER_EOF); } - xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif @@ -6148,12 +6253,12 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, size_t current = ctxt->input->cur - ctxt->input->base; nbchars = xmlCharEncInput(in, terminate); + xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); if (nbchars < 0) { htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, "encoder error\n", NULL, NULL); return(XML_ERR_INVALID_ENCODING); } - xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); } } } @@ -6671,7 +6776,6 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) ctxt->vctxt.error = xmlParserValidityError; ctxt->vctxt.warning = xmlParserValidityWarning; ctxt->record_info = 0; - ctxt->nbChars = 0; ctxt->checkIndex = 0; ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; @@ -6890,7 +6994,9 @@ htmlReadMemory(const char *buffer, int size, const char *URL, const char *encodi * @encoding: the document encoding, or NULL * @options: a combination of htmlParserOption(s) * - * parse an XML from a file descriptor and build a tree. + * parse an HTML from a file descriptor and build a tree. + * NOTE that the file descriptor will not be closed when the + * reader is closed or reset. * * Returns the resulting document tree */ @@ -6899,17 +7005,17 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options) { htmlParserCtxtPtr ctxt; xmlParserInputBufferPtr input; - xmlParserInputPtr stream; + htmlParserInputPtr stream; if (fd < 0) return (NULL); - xmlInitParser(); xmlInitParser(); input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); if (input == NULL) return (NULL); - ctxt = xmlNewParserCtxt(); + input->closecallback = NULL; + ctxt = htmlNewParserCtxt(); if (ctxt == NULL) { xmlFreeParserInputBuffer(input); return (NULL); @@ -6917,7 +7023,7 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options) stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); if (stream == NULL) { xmlFreeParserInputBuffer(input); - xmlFreeParserCtxt(ctxt); + htmlFreeParserCtxt(ctxt); return (NULL); } inputPush(ctxt, stream); |