Skip to content

Commit 37c6618

Browse files
committed
parser: Rework parsing of attribute and entity values
Don't use a separate function to handle "complex" attributes.

Validate UTF-8 byte sequences without decoding. This should improve performance considerably when parsing multi-byte UTF-8 sequences.

Use a string buffer to avoid unnecessary allocations and copying when expanding entities.

Normalize attribute values in a single pass while expanding entities.

Be more lenient in recovery mode.

If no entity substitution was requested, validate entities without expanding.

Fixes #596. Also fixes #655.
1 parent 4dcc2d7 commit 37c6618

File tree

16 files changed

+1073
-1026
lines changed

16 files changed

+1073
-1026
lines changed

SAX2.c

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -970,10 +970,8 @@ xmlSAX2AttributeInternal(void *ctx, const xmlChar *fullname,
970970
(void) nsret;
971971

972972
if (!ctxt->replaceEntities) {
973-
ctxt->depth++;
974-
val = xmlStringDecodeEntities(ctxt, value, XML_SUBSTITUTE_REF,
975-
0,0,0);
976-
ctxt->depth--;
973+
/* TODO: normalize if needed */
974+
val = xmlExpandEntitiesInAttValue(ctxt, value, /* normalize */ 0);
977975
if (val == NULL) {
978976
xmlSAX2ErrMemory(ctxt);
979977
if (name != NULL)
@@ -1038,10 +1036,8 @@ xmlSAX2AttributeInternal(void *ctx, const xmlChar *fullname,
10381036
(void) nsret;
10391037

10401038
if (!ctxt->replaceEntities) {
1041-
ctxt->depth++;
1042-
val = xmlStringDecodeEntities(ctxt, value, XML_SUBSTITUTE_REF,
1043-
0,0,0);
1044-
ctxt->depth--;
1039+
/* TODO: normalize if needed */
1040+
val = xmlExpandEntitiesInAttValue(ctxt, value, /* normalize */ 0);
10451041
if (val == NULL) {
10461042
xmlSAX2ErrMemory(ctxt);
10471043
xmlFree(ns);
@@ -1179,10 +1175,8 @@ xmlSAX2AttributeInternal(void *ctx, const xmlChar *fullname,
11791175
if (!ctxt->replaceEntities) {
11801176
xmlChar *val;
11811177

1182-
ctxt->depth++;
1183-
val = xmlStringDecodeEntities(ctxt, value, XML_SUBSTITUTE_REF,
1184-
0,0,0);
1185-
ctxt->depth--;
1178+
/* TODO: normalize if needed */
1179+
val = xmlExpandEntitiesInAttValue(ctxt, value, /* normalize */ 0);
11861180

11871181
if (val == NULL)
11881182
ctxt->valid &= xmlValidateOneAttribute(&ctxt->vctxt,
@@ -1736,19 +1730,19 @@ static xmlChar *
17361730
xmlSAX2DecodeAttrEntities(xmlParserCtxtPtr ctxt, const xmlChar *str,
17371731
const xmlChar *end) {
17381732
const xmlChar *in;
1739-
xmlChar *ret;
17401733

17411734
in = str;
17421735
while (in < end)
17431736
if (*in++ == '&')
17441737
goto decode;
17451738
return(NULL);
17461739
decode:
1747-
ctxt->depth++;
1748-
ret = xmlStringLenDecodeEntities(ctxt, str, end - str,
1749-
XML_SUBSTITUTE_REF, 0,0,0);
1750-
ctxt->depth--;
1751-
return(ret);
1740+
/*
1741+
* If the value contains '&', we can be sure it was allocated and is
1742+
* zero-terminated.
1743+
*/
1744+
/* TODO: normalize if needed */
1745+
return(xmlExpandEntitiesInAttValue(ctxt, str, /* normalize */ 0));
17521746
}
17531747
#endif /* LIBXML_VALID_ENABLED */
17541748

include/private/entities.h

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,17 @@
99
*
1010
* XML_ENT_PARSED: The entity was parsed and `children` points to the
1111
* content.
12-
* XML_ENT_CHECKED: The entity was checked for loops.
12+
*
13+
* XML_ENT_CHECKED: The entity was checked for loops and amplification.
14+
* expandedSize was set.
15+
*
16+
* XML_ENT_VALIDATED: The entity contains a valid attribute value.
17+
* Only used when entities aren't substituted.
1318
*/
14-
#define XML_ENT_PARSED (1<<0)
15-
#define XML_ENT_CHECKED (1<<1)
16-
#define XML_ENT_EXPANDING (1<<2)
17-
#define XML_ENT_CHECKED_LT (1<<3)
18-
#define XML_ENT_CONTAINS_LT (1<<4)
19+
#define XML_ENT_PARSED (1u << 0)
20+
#define XML_ENT_CHECKED (1u << 1)
21+
#define XML_ENT_VALIDATED (1u << 2)
22+
#define XML_ENT_EXPANDING (1u << 3)
1923

2024
XML_HIDDEN xmlChar *
2125
xmlEncodeAttributeEntities(xmlDocPtr doc, const xmlChar *input);

include/private/parser.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,4 +87,8 @@ XML_HIDDEN xmlParserInputPtr
8787
xmlNewInputPush(xmlParserCtxtPtr ctxt, const char *url,
8888
const char *chunk, int size, const char *encoding);
8989

90+
XML_HIDDEN xmlChar *
91+
xmlExpandEntitiesInAttValue(xmlParserCtxtPtr ctxt, const xmlChar *str,
92+
int normalize);
93+
9094
#endif /* XML_PARSER_H_PRIVATE__ */

0 commit comments

Comments
 (0)