Skip to content

Commit 8dc2391

Browse files
committed
Fix bug #79701: getElementById does not correctly work with duplicate definitions
This is a long standing bug: IDs aren't properly tracked causing either outdated or plain incorrect results from getElementById. This PR implements a pragmatic solution in which we still try to use the ID lookup table to a degree, but only as a performance boost not as a "single source of truth". Full details are explained in the getElementById code. Closes phpGH-14349.
1 parent 5fe799a commit 8dc2391

18 files changed

+672
-41
lines changed

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ PHP NEWS
4545
. Implemented "Improve callbacks in ext/dom and ext/xsl" RFC. (nielsdos)
4646
. Added DOMXPath::quote() static method. (divinity76)
4747
. Implemented opt-in ext/dom spec compliance RFC. (nielsdos)
48+
. Fixed bug #79701 (getElementById does not correctly work with duplicate
49+
definitions). (nielsdos)
4850

4951
- Fileinfo:
5052
. Update to libmagic 5.45. (nielsdos)

ext/dom/attr.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "php_dom.h"
2727
#include "dom_properties.h"
28+
#include "internal_helpers.h"
2829

2930
/*
3031
* class DOMAttr extends DOMNode
@@ -106,6 +107,16 @@ zend_result dom_attr_specified_read(dom_object *obj, zval *retval)
106107

107108
/* }}} */
108109

110+
void dom_attr_value_will_change(dom_object *obj, xmlAttrPtr attrp)
111+
{
112+
if (attrp->atype == XML_ATTRIBUTE_ID) {
113+
xmlRemoveID(attrp->doc, attrp);
114+
attrp->atype = XML_ATTRIBUTE_ID;
115+
}
116+
117+
dom_mark_ids_modified(obj->document);
118+
}
119+
109120
/* {{{ value string
110121
readonly=no
111122
URL: http://www.w3.org/TR/2003/WD-DOM-Level-3-Core-20030226/DOM3-Core.html#ID-221662474
@@ -122,6 +133,8 @@ zend_result dom_attr_value_write(dom_object *obj, zval *newval)
122133
{
123134
DOM_PROP_NODE(xmlAttrPtr, attrp, obj);
124135

136+
dom_attr_value_will_change(obj, attrp);
137+
125138
/* Typed property, this is already a string */
126139
ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING);
127140
zend_string *str = Z_STR_P(newval);

ext/dom/document.c

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,34 +1028,47 @@ Since: DOM Level 2
10281028
*/
10291029
PHP_METHOD(DOMDocument, getElementById)
10301030
{
1031-
zval *id;
10321031
xmlDocPtr docp;
1033-
xmlAttrPtr attrp;
10341032
size_t idname_len;
10351033
dom_object *intern;
10361034
char *idname;
10371035

1038-
id = ZEND_THIS;
1039-
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &idname, &idname_len) == FAILURE) {
1040-
RETURN_THROWS();
1041-
}
1042-
1043-
DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
1036+
ZEND_PARSE_PARAMETERS_START(1, 1)
1037+
Z_PARAM_STRING(idname, idname_len)
1038+
ZEND_PARSE_PARAMETERS_END();
10441039

1045-
attrp = xmlGetID(docp, BAD_CAST idname);
1040+
DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
10461041

1047-
/* From the moment an ID is created, libxml2's behaviour is to cache that element, even
1048-
* if that element is not yet attached to the document. Similarly, only upon destruction of
1049-
* the element the ID is actually removed by libxml2. Since libxml2 has such behaviour deeply
1050-
* ingrained in the library, and uses the cache for various purposes, it seems like a bad
1051-
* idea and lost cause to fight it. Instead, we'll simply walk the tree upwards to check
1052-
* if the node is attached to the document. */
1053-
if (attrp && attrp->parent && php_dom_is_node_connected(attrp->parent)) {
1054-
DOM_RET_OBJ((xmlNodePtr) attrp->parent, intern);
1042+
/* If the document has not been manipulated yet, the ID cache will be in sync and we can trust its result.
1043+
* This check mainly exists because a lot of times people just query a document without modifying it,
1044+
* and we can allow quick access to IDs in that case. */
1045+
if (!dom_is_document_cache_modified_since_parsing(intern->document)) {
1046+
const xmlAttr *attrp = xmlGetID(docp, BAD_CAST idname);
1047+
if (attrp && attrp->parent) {
1048+
DOM_RET_OBJ(attrp->parent, intern);
1049+
}
10551050
} else {
1056-
RETVAL_NULL();
1057-
}
1051+
/* From the moment an ID is created, libxml2's behaviour is to cache that element, even
1052+
* if that element is not yet attached to the document. Similarly, only upon destruction of
1053+
* the element the ID is actually removed by libxml2. Since libxml2 has such behaviour deeply
1054+
* ingrained in the library, and uses the cache for various purposes, it seems like a bad
1055+
* idea and lost cause to fight it. */
1056+
1057+
const xmlNode *base = (const xmlNode *) docp;
1058+
const xmlNode *node = base->children;
1059+
while (node != NULL) {
1060+
if (node->type == XML_ELEMENT_NODE) {
1061+
for (const xmlAttr *attr = node->properties; attr != NULL; attr = attr->next) {
1062+
if (attr->atype == XML_ATTRIBUTE_ID && dom_compare_value(attr, BAD_CAST idname)) {
1063+
DOM_RET_OBJ((xmlNodePtr) node, intern);
1064+
return;
1065+
}
1066+
}
1067+
}
10581068

1069+
node = php_dom_next_in_tree_order(node, base);
1070+
}
1071+
}
10591072
}
10601073
/* }}} end dom_document_get_element_by_id */
10611074

ext/dom/element.c

Lines changed: 40 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -183,15 +183,15 @@ zend_result dom_element_id_read(dom_object *obj, zval *retval)
183183
return dom_element_reflected_attribute_read(obj, retval, "id");
184184
}
185185

186-
static void php_set_attribute_id(xmlAttrPtr attrp, bool is_id);
186+
static void php_set_attribute_id(xmlAttrPtr attrp, bool is_id, php_libxml_ref_obj *document);
187187

188188
zend_result dom_element_id_write(dom_object *obj, zval *newval)
189189
{
190190
xmlAttrPtr attr = dom_element_reflected_attribute_write(obj, newval, "id");
191191
if (!attr) {
192192
return FAILURE;
193193
}
194-
php_set_attribute_id(attr, true);
194+
php_set_attribute_id(attr, true, obj->document);
195195
return SUCCESS;
196196
}
197197
/* }}} */
@@ -352,6 +352,16 @@ static xmlNodePtr dom_create_attribute(xmlNodePtr nodep, const char *name, const
352352
}
353353
}
354354

355+
static void dom_check_register_attribute_id(xmlAttrPtr attr, php_libxml_ref_obj *document)
356+
{
357+
dom_mark_ids_modified(document);
358+
359+
if (attr->atype != XML_ATTRIBUTE_ID && attr->doc->type == XML_HTML_DOCUMENT_NODE && attr->ns == NULL && xmlStrEqual(attr->name, BAD_CAST "id")) {
360+
/* To respect XML's ID behaviour, we only do this registration for HTML documents. */
361+
attr->atype = XML_ATTRIBUTE_ID;
362+
}
363+
}
364+
355365
/* {{{ URL: http://www.w3.org/TR/2003/WD-DOM-Level-3-Core-20030226/DOM3-Core.html#core-ID-F68F082
356366
Modern spec URL: https://dom.spec.whatwg.org/#dom-element-setattribute
357367
Since:
@@ -360,7 +370,6 @@ PHP_METHOD(DOMElement, setAttribute)
360370
{
361371
zval *id;
362372
xmlNode *nodep;
363-
xmlNodePtr attr = NULL;
364373
int name_valid;
365374
size_t name_len, value_len;
366375
dom_object *intern;
@@ -394,23 +403,28 @@ PHP_METHOD(DOMElement, setAttribute)
394403
}
395404

396405
/* Can't use xmlSetNsProp unconditionally here because that doesn't take into account the qualified name matching... */
397-
attr = (xmlNodePtr) php_dom_get_attribute_node(nodep, BAD_CAST name, name_len);
406+
xmlAttrPtr attr = php_dom_get_attribute_node(nodep, BAD_CAST name, name_len);
398407
if (attr != NULL) {
399-
dom_remove_all_children(attr);
408+
dom_attr_value_will_change(intern, attr);
409+
dom_remove_all_children((xmlNodePtr) attr);
400410
xmlNodePtr node = xmlNewDocText(attr->doc, BAD_CAST value);
401-
xmlAddChild(attr, node);
411+
xmlAddChild((xmlNodePtr) attr, node);
402412
} else {
403-
attr = (xmlNodePtr) xmlSetNsProp(nodep, NULL, name_processed, BAD_CAST value);
413+
attr = xmlSetNsProp(nodep, NULL, name_processed, BAD_CAST value);
414+
if (EXPECTED(attr != NULL)) {
415+
dom_check_register_attribute_id(attr, intern->document);
416+
}
404417
}
405418

406419
if (name_processed != BAD_CAST name) {
407420
efree(name_processed);
408421
}
409422
} else {
410-
attr = dom_get_attribute_or_nsdecl(intern, nodep, BAD_CAST name, name_len);
423+
xmlNodePtr attr = dom_get_attribute_or_nsdecl(intern, nodep, BAD_CAST name, name_len);
411424
if (attr != NULL) {
412425
switch (attr->type) {
413426
case XML_ATTRIBUTE_NODE:
427+
dom_attr_value_will_change(intern, (xmlAttrPtr) attr);
414428
node_list_unlink(attr->children);
415429
break;
416430
case XML_NAMESPACE_DECL:
@@ -693,7 +707,10 @@ static void dom_element_set_attribute_node_common(INTERNAL_FUNCTION_PARAMETERS,
693707

694708
xmlAddChild(nodep, (xmlNodePtr) attrp);
695709
if (!modern) {
710+
dom_mark_ids_modified(intern->document);
696711
php_dom_reconcile_attribute_namespace_after_insertion(attrp);
712+
} else {
713+
dom_check_register_attribute_id(attrp, intern->document);
697714
}
698715

699716
/* Returns old property if removed otherwise NULL */
@@ -870,6 +887,8 @@ static void dom_set_attribute_ns_legacy(dom_object *intern, xmlNodePtr elemp, ch
870887
int errorcode = dom_check_qname(name, &localname, &prefix, uri_len, name_len);
871888

872889
if (errorcode == 0) {
890+
dom_mark_ids_modified(intern->document);
891+
873892
if (uri_len > 0) {
874893
nodep = (xmlNodePtr) xmlHasNsProp(elemp, BAD_CAST localname, BAD_CAST uri);
875894
if (nodep != NULL && nodep->type != XML_ATTRIBUTE_DECL) {
@@ -958,8 +977,11 @@ static void dom_set_attribute_ns_modern(dom_object *intern, xmlNodePtr elemp, ze
958977
if (errorcode == 0) {
959978
php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(intern);
960979
xmlNsPtr ns = php_dom_libxml_ns_mapper_get_ns_raw_prefix_string(ns_mapper, prefix, xmlStrlen(prefix), uri);
961-
if (UNEXPECTED(xmlSetNsProp(elemp, ns, localname, BAD_CAST value) == NULL)) {
980+
xmlAttrPtr attr = xmlSetNsProp(elemp, ns, localname, BAD_CAST value);
981+
if (UNEXPECTED(attr == NULL)) {
962982
php_dom_throw_error(INVALID_STATE_ERR, /* strict */ true);
983+
} else {
984+
dom_check_register_attribute_id(attr, intern->document);
963985
}
964986
} else {
965987
php_dom_throw_error(errorcode, /* strict */ true);
@@ -1270,20 +1292,16 @@ PHP_METHOD(DOMElement, hasAttributeNS)
12701292
}
12711293
/* }}} end dom_element_has_attribute_ns */
12721294

1273-
static void php_set_attribute_id(xmlAttrPtr attrp, bool is_id) /* {{{ */
1295+
static void php_set_attribute_id(xmlAttrPtr attrp, bool is_id, php_libxml_ref_obj *document) /* {{{ */
12741296
{
1275-
if (is_id == 1 && attrp->atype != XML_ATTRIBUTE_ID) {
1276-
xmlChar *id_val;
1277-
1278-
id_val = xmlNodeListGetString(attrp->doc, attrp->children, 1);
1279-
if (id_val != NULL) {
1280-
xmlAddID(NULL, attrp->doc, id_val, attrp);
1281-
xmlFree(id_val);
1282-
}
1283-
} else if (is_id == 0 && attrp->atype == XML_ATTRIBUTE_ID) {
1297+
if (is_id && attrp->atype != XML_ATTRIBUTE_ID) {
1298+
attrp->atype = XML_ATTRIBUTE_ID;
1299+
} else if (!is_id && attrp->atype == XML_ATTRIBUTE_ID) {
12841300
xmlRemoveID(attrp->doc, attrp);
12851301
attrp->atype = 0;
12861302
}
1303+
1304+
dom_mark_ids_modified(document);
12871305
}
12881306
/* }}} */
12891307

@@ -1311,7 +1329,7 @@ PHP_METHOD(DOMElement, setIdAttribute)
13111329
if (attrp == NULL || attrp->type == XML_ATTRIBUTE_DECL) {
13121330
php_dom_throw_error(NOT_FOUND_ERR, dom_get_strict_error(intern->document));
13131331
} else {
1314-
php_set_attribute_id(attrp, is_id);
1332+
php_set_attribute_id(attrp, is_id, intern->document);
13151333
}
13161334
}
13171335
/* }}} end dom_element_set_id_attribute */
@@ -1340,7 +1358,7 @@ PHP_METHOD(DOMElement, setIdAttributeNS)
13401358
if (attrp == NULL || attrp->type == XML_ATTRIBUTE_DECL) {
13411359
php_dom_throw_error(NOT_FOUND_ERR, dom_get_strict_error(intern->document));
13421360
} else {
1343-
php_set_attribute_id(attrp, is_id);
1361+
php_set_attribute_id(attrp, is_id, intern->document);
13441362
}
13451363
}
13461364
/* }}} end dom_element_set_id_attribute_ns */
@@ -1367,7 +1385,7 @@ static void dom_element_set_id_attribute_node(INTERNAL_FUNCTION_PARAMETERS, zend
13671385
if (attrp->parent != nodep) {
13681386
php_dom_throw_error(NOT_FOUND_ERR, dom_get_strict_error(intern->document));
13691387
} else {
1370-
php_set_attribute_id(attrp, is_id);
1388+
php_set_attribute_id(attrp, is_id, intern->document);
13711389
}
13721390
}
13731391

ext/dom/internal_helpers.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,31 @@ DOM_DEF_GET_CE_FUNC(domimplementation)
6363

6464
#endif
6565

66+
static zend_always_inline size_t dom_minimum_modification_nr_since_parsing(php_libxml_ref_obj *doc_ptr)
67+
{
68+
/* For old-style DOM, we need a "new DOMDocument" + a load, so the minimum modification nr is 2.
69+
* For new-style DOM, we need only to call a named constructor, so the minimum modification nr is 1. */
70+
return doc_ptr->class_type == PHP_LIBXML_CLASS_MODERN ? 1 : 2;
71+
}
72+
73+
static zend_always_inline void dom_mark_document_cache_as_modified_since_parsing(php_libxml_ref_obj *doc_ptr)
74+
{
75+
if (doc_ptr) {
76+
doc_ptr->cache_tag.modification_nr = MAX(dom_minimum_modification_nr_since_parsing(doc_ptr) + 1,
77+
doc_ptr->cache_tag.modification_nr);
78+
}
79+
}
80+
81+
/* Marks the ID cache as potentially stale */
82+
static zend_always_inline void dom_mark_ids_modified(php_libxml_ref_obj *doc_ptr)
83+
{
84+
/* Although this is currently a wrapper function, it's best to abstract the actual implementation away. */
85+
dom_mark_document_cache_as_modified_since_parsing(doc_ptr);
86+
}
87+
88+
static zend_always_inline bool dom_is_document_cache_modified_since_parsing(php_libxml_ref_obj *doc_ptr)
89+
{
90+
return !doc_ptr || doc_ptr->cache_tag.modification_nr > dom_minimum_modification_nr_since_parsing(doc_ptr);
91+
}
92+
6693
#endif

ext/dom/node.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ zend_result dom_node_node_value_write(dom_object *obj, zval *newval)
185185
/* Access to Element node is implemented as a convenience method */
186186
switch (nodep->type) {
187187
case XML_ATTRIBUTE_NODE:
188+
dom_attr_value_will_change(obj, (xmlAttrPtr) nodep);
188189
if (php_dom_follow_spec_intern(obj)) {
189190
dom_remove_all_children(nodep);
190191
xmlAddChild(nodep, xmlNewTextLen(BAD_CAST ZSTR_VAL(str), ZSTR_LEN(str)));

ext/dom/php_dom.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ xmlDocPtr php_dom_create_html_doc(void);
174174
xmlEntityPtr dom_entity_reference_fetch_and_sync_declaration(xmlNodePtr reference);
175175
void dom_set_xml_class(php_libxml_ref_obj *document);
176176
bool dom_compare_value(const xmlAttr *attr, const xmlChar *value);
177+
void dom_attr_value_will_change(dom_object *obj, xmlAttrPtr attrp);
177178

178179
typedef enum {
179180
DOM_LOAD_STRING = 0,
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
--TEST--
2+
Bug #79701 (getElementById does not correctly work with duplicate definitions) - id property variation
3+
--EXTENSIONS--
4+
dom
5+
--FILE--
6+
<?php
7+
$dom = Dom\XMLDocument::createFromString(<<<XML
8+
<root>
9+
<test1/>
10+
<test2/>
11+
</root>
12+
XML);
13+
14+
$test1 = $dom->documentElement->firstElementChild;
15+
$test2 = $test1->nextElementSibling;
16+
17+
echo "--- After parsing ---\n";
18+
var_dump($dom->getElementById("x")?->nodeName);
19+
20+
echo "--- After setting test2 ---\n";
21+
$test2->id = "x";
22+
var_dump($dom->getElementById("x")?->nodeName);
23+
echo "--- After setting test1 ---\n";
24+
$test1->id = "x";
25+
var_dump($dom->getElementById("x")?->nodeName);
26+
echo "--- After resetting test1 ---\n";
27+
$test1->id = "y";
28+
var_dump($dom->getElementById("x")?->nodeName);
29+
echo "--- After resetting test2 ---\n";
30+
$test2->id = "y";
31+
var_dump($dom->getElementById("x")?->nodeName);
32+
echo "--- After resetting test1 ---\n";
33+
$test1->id = "x";
34+
var_dump($dom->getElementById("x")?->nodeName);
35+
echo "--- After calling setIdAttribute with false on test1 ---\n";
36+
$test1->setIdAttribute("id", false);
37+
var_dump($dom->getElementById("x")?->nodeName);
38+
?>
39+
--EXPECT--
40+
--- After parsing ---
41+
NULL
42+
--- After setting test2 ---
43+
string(5) "test2"
44+
--- After setting test1 ---
45+
string(5) "test1"
46+
--- After resetting test1 ---
47+
string(5) "test2"
48+
--- After resetting test2 ---
49+
NULL
50+
--- After resetting test1 ---
51+
string(5) "test1"
52+
--- After calling setIdAttribute with false on test1 ---
53+
NULL

ext/dom/tests/bug79701/node.phpt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
--TEST--
2+
Bug #79701 (getElementById does not correctly work with duplicate definitions) - attribute node variation
3+
--EXTENSIONS--
4+
dom
5+
--FILE--
6+
<?php
7+
$dom = Dom\HTMLDocument::createEmpty();
8+
$element = $dom->createElement('foo');
9+
$dom->append($element);
10+
$attr = $dom->createAttribute('id');
11+
$attr->value = 'test';
12+
var_dump($dom->getElementById('test')?->nodeName);
13+
$element->setAttributeNode($attr);
14+
var_dump($dom->getElementById('test')?->nodeName);
15+
?>
16+
--EXPECT--
17+
NULL
18+
string(3) "FOO"

0 commit comments

Comments
 (0)