From 47e89a38d0649eb7f708757a0fdca6b55c0f032e Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 20 Dec 2024 15:17:09 -0500 Subject: [PATCH 1/2] Revert most of "reuse XPathContext objects" (969bea94) Revert the pieces of 969bea94 related to re-use of xpath context objects. Leave in these pieces of that commit: - ability to de-register variables and namespaces - XPathContext#node= and backfill explicit test coverage for these features. --- ext/nokogiri/xml_xpath_context.c | 17 ------- lib/nokogiri/xml/searchable.rb | 33 ++----------- lib/nokogiri/xml/xpath_context.rb | 22 --------- test/xml/test_xpath.rb | 80 ------------------------------- test/xml/test_xpath_context.rb | 79 ++++++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 147 deletions(-) create mode 100644 test/xml/test_xpath_context.rb diff --git a/ext/nokogiri/xml_xpath_context.c b/ext/nokogiri/xml_xpath_context.c index 7b09f5272cc..8f71c2e316b 100644 --- a/ext/nokogiri/xml_xpath_context.c +++ b/ext/nokogiri/xml_xpath_context.c @@ -144,13 +144,6 @@ noko_xml_xpath_context_register_ns(VALUE rb_context, VALUE prefix, VALUE uri) xmlXPathRegisterNs(c_context, (const xmlChar *)StringValueCStr(prefix), ns_uri); - VALUE registered_namespaces = rb_iv_get(rb_context, "@registered_namespaces"); - if (NIL_P(uri)) { - rb_hash_delete(registered_namespaces, prefix); - } else { - rb_hash_aset(registered_namespaces, prefix, Qtrue); - } - return rb_context; } @@ -179,13 +172,6 @@ noko_xml_xpath_context_register_variable(VALUE rb_context, VALUE name, VALUE val xmlXPathRegisterVariable(c_context, (const xmlChar *)StringValueCStr(name), xmlValue); - VALUE registered_variables = rb_iv_get(rb_context, "@registered_variables"); - if (NIL_P(value)) { - rb_hash_delete(registered_variables, name); - } else { - rb_hash_aset(registered_variables, name, Qtrue); - } - return rb_context; } @@ -461,9 +447,6 @@ noko_xml_xpath_context_new(VALUE klass, VALUE rb_node) rb_context = TypedData_Wrap_Struct(klass, &_noko_xml_xpath_context_type, c_context); - rb_iv_set(rb_context, "@registered_namespaces", rb_hash_new()); - rb_iv_set(rb_context, "@registered_variables", rb_hash_new()); - return rb_context; } diff --git a/lib/nokogiri/xml/searchable.rb b/lib/nokogiri/xml/searchable.rb index 11c0532639d..2985fa63d18 100644 --- a/lib/nokogiri/xml/searchable.rb +++ b/lib/nokogiri/xml/searchable.rb @@ -261,36 +261,13 @@ def xpath_internal(node, paths, handler, ns, binds) end def xpath_impl(node, path, handler, ns, binds) - get_xpath_context(node) do |context| - context.register_namespaces(ns) - context.register_variables(binds) + context = XPathContext.new(node) + context.register_namespaces(ns) + context.register_variables(binds) - path = path.gsub("xmlns:", " :") unless Nokogiri.uses_libxml? + path = path.gsub("xmlns:", " :") unless Nokogiri.uses_libxml? - context.evaluate(path, handler) - end - end - - if Nokogiri.uses_libxml? && ENV["NOKOGIRI_DEOPTIMIZE_XPATH"].nil? # env var is an escape hatch - # optimized path - def get_xpath_context(node) - context = Thread.current.thread_variable_get(:nokogiri_xpath_context) - if context - context.node = node - else - context = Thread.current.thread_variable_set(:nokogiri_xpath_context, XPathContext.new(node)) - end - - begin - yield context - ensure - context.reset - end - end - else - def get_xpath_context(node) - yield XPathContext.new(node) - end + context.evaluate(path, handler) end end end diff --git a/lib/nokogiri/xml/xpath_context.rb b/lib/nokogiri/xml/xpath_context.rb index 2078f40098f..033af04b024 100644 --- a/lib/nokogiri/xml/xpath_context.rb +++ b/lib/nokogiri/xml/xpath_context.rb @@ -22,28 +22,6 @@ def register_variables(binds) register_variable(key, value) end end - - if Nokogiri.uses_libxml? - def reset - return unless - - @registered_namespaces.each do |key, _| - register_ns(key, nil) - end - unless @registered_namespaces.empty? - warn "Nokogiri::XML::XPathContext#reset: unexpected registered namespaces: #{@registered_namespaces.keys}" - @registered_namespaces.clear - end - - @registered_variables.each do |key, _| - register_variable(key, nil) - end - unless @registered_variables.empty? - warn "Nokogiri::XML::XPathContext#reset: unexpected registered variables: #{@registered_variables.keys}" - @registered_variables.clear - end - end - end end end end diff --git a/test/xml/test_xpath.rb b/test/xml/test_xpath.rb index 790f0094d00..7948cac0514 100644 --- a/test/xml/test_xpath.rb +++ b/test/xml/test_xpath.rb @@ -698,86 +698,6 @@ def collision(nodes) assert_equal(3, doc.xpath("//self::*:child").length) end end - - describe "re-using XPathContext objects within a thread" do - # see https://github.com/sparklemotion/nokogiri/issues/3266 - # this optimization is CRuby-only, but we run the tests on JRuby for consistency - let(:doc) { - Nokogiri::XML::Document.parse(<<~XML) - - default - ns1 - - XML - } - - it "clears registered namespaces" do - # make sure the thread-local XPathContext is initialized - doc.xpath("//xmlns:child") - - # do namespaces behave the way we expect? - node = doc.at_xpath("//ns:child", { "ns" => "http://nokogiri.org/ns1" }) - assert_equal("ns1", node.text) - - assert_raises(XPath::SyntaxError) { - doc.at_xpath("//ns:child") - } - - node = doc.at_xpath("//child") - assert_nil(node) - - # make sure the nokogiri and nokogiri-builting namespaces are re-registered - doc.xpath("//xmlns:child[nokogiri:thing(.)]", @handler) - assert_equal(1, @handler.things.length) - - if Nokogiri.uses_libxml? - nodes = doc.xpath("//xmlns:child[nokogiri-builtin:local-name-is('child')]") - assert_equal(1, nodes.length) - end - end - - it "clears registered handlers and functions" do - # make sure the thread-local XPathContext is initialized - doc.xpath("//xmlns:child") - - doc.xpath("//xmlns:child[nokogiri:thing(.)]", @handler) - assert_equal(1, @handler.things.length) - - assert_raises(XPath::SyntaxError) { - doc.xpath("//xmlns:child[nokogiri:thing(.)]") - } - - doc.xpath("//xmlns:child[nokogiri:thing(.)]", @handler) - assert_equal(2, @handler.things.length) - - if Nokogiri.uses_libxml? - nodes = doc.xpath("//xmlns:child[nokogiri-builtin:local-name-is('child')]") - assert_equal(1, nodes.length) - end - end - - it "clears registered variables" do - # make sure the thread-local XPathContext is initialized - doc.xpath("//xmlns:child") - - nodes = @xml.xpath("//address[@domestic=$value]", nil, value: "Yes") - assert_equal(4, nodes.length) - - assert_raises(XPath::SyntaxError) { - @xml.xpath("//address[@domestic=$value]") - } - - nodes = @xml.xpath("//address[@domestic=$value]", nil, value: "Qwerty") - assert_empty(nodes) - - assert_raises(XPath::SyntaxError) { - @xml.xpath("//address[@domestic=$value]") - } - - nodes = @xml.xpath("//address[@domestic=$value]", nil, value: "Yes") - assert_equal(4, nodes.length) - end - end end end end diff --git a/test/xml/test_xpath_context.rb b/test/xml/test_xpath_context.rb new file mode 100644 index 00000000000..aa1d88a55c2 --- /dev/null +++ b/test/xml/test_xpath_context.rb @@ -0,0 +1,79 @@ +# frozen_string_literal: true + +require "helper" + +module Nokogiri + module XML + describe XPathContext do + it "can register and deregister namespaces" do + doc = Document.parse(<<~XML) + + default + ns1 + + XML + + xc = XPathContext.new(doc) + + assert_raises(XPath::SyntaxError) do + xc.evaluate("//xmlns:child") + end + + xc.register_namespaces({ "xmlns" => "http://nokogiri.org/default" }) + assert_pattern do + xc.evaluate("//xmlns:child") => [ + { name: "child", namespace: { href: "http://nokogiri.org/default" } } + ] + end + + xc.register_namespaces({ "xmlns" => nil }) + assert_raises(XPath::SyntaxError) do + xc.evaluate("//xmlns:child") + end + end + + it "can register and deregister variables" do + doc = Nokogiri::XML.parse(File.read(TestBase::XML_FILE), TestBase::XML_FILE) + + xc = XPathContext.new(doc) + + assert_raises(XPath::SyntaxError) do + xc.evaluate("//address[@domestic=$value]") + end + + xc.register_variables({ "value" => "Yes" }) + nodes = xc.evaluate("//address[@domestic=$value]") + assert_equal(4, nodes.length) + + xc.register_variables({ "value" => "Qwerty" }) + nodes = xc.evaluate("//address[@domestic=$value]") + assert_empty(nodes) + + xc.register_variables({ "value" => nil }) + assert_raises(XPath::SyntaxError) do + xc.evaluate("//address[@domestic=$value]") + end + end + + it "#node=" do + doc = Nokogiri::XML::Document.parse(<<~XML) + + one + two + three + + XML + + xc = XPathContext.new(doc) + results = xc.evaluate(".//foo") + assert_equal(3, results.length) + + xc.node = doc.root.elements[0] + assert_pattern { xc.evaluate(".//foo") => [{ name: "foo", inner_html: "one" }] } + + xc.node = doc.root.elements[1] + assert_pattern { xc.evaluate(".//foo") => [{ name: "foo", inner_html: "two" }] } + end + end + end +end From 3f20dd912fd9a0126ec7e60fe639114ffbf15a2f Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sat, 21 Dec 2024 14:53:39 -0500 Subject: [PATCH 2/2] ext: backport libxml2/gnome@bf5fcf6e for xmlXPathContext perf See extended discussion at #3378 Benchmark comparing this commit against v1.17.x ("main"): Comparison: large: main: 3910.6 i/s large: patched: 3759.6 i/s - same-ish: difference falls within error Comparison: small: patched: 242901.7 i/s small: main: 127486.0 i/s - 1.91x slower I think we could get greater performance gains by re-using XPathContext objects, but only at the cost of a significant amount of additional complexity, since in order to properly support recursive XPath evaluation, Nokogiri would have to push and pop "stack frames" containing: - internal state contextSize and proximityPosition - registered namespaces - registered variables - function lookup handler That feels like a lot of code for a small win. Comparatively, pulling in this upstream patch is still a 2x speedup for zero additional complexity. --- CHANGELOG.md | 2 +- ...te-static-hash-table-for-standard-fu.patch | 244 ++++++++++++++++++ 2 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch diff --git a/CHANGELOG.md b/CHANGELOG.md index 7effa5c5884..5378bb54d3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ This release drops precompiled native platform gems for `x86-linux` and `x86-min ### Improved -* [CRuby] CSS and XPath queries are faster now that `Node#xpath`, `Node#css`, and related functions are re-using the underlying xpath context object (which is expensive to initialize). We benchmarked a 2.8x improvement for a 6kb file, and a more modest 1.3x improvement for a 70kb file. (#3378) @flavorjones +* [CRuby] CSS and XPath queries are faster now that `Node#xpath`, `Node#css`, and related functions are using a faster XPathContext initialization process. We benchmarked a 1.9x improvement for a 6kb file. Big thanks to @nwellnhof for helping with this one. (#3378, superseded by #3389) @flavorjones ## v1.17.2 / 2024-12-12 diff --git a/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch b/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch new file mode 100644 index 00000000000..f84dc8e52ee --- /dev/null +++ b/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch @@ -0,0 +1,244 @@ +From d3e3526111097560cf7c002613e2cb1d469b59e0 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Sat, 21 Dec 2024 16:03:46 +0100 +Subject: [PATCH] xpath: Use separate static hash table for standard functions + +This avoids registering standard functions when creating an XPath +context. + +Lookup of extension functions is a bit slower now, but ultimately, all +function lookups should be moved to the compilation phase. + +(cherry picked from commit bf5fcf6e646bb51a0f6a3655a1d64bea97274867) +--- + xpath.c | 170 ++++++++++++++++++++++++++++++++------------------------ + 1 file changed, 98 insertions(+), 72 deletions(-) + +diff --git a/xpath.c b/xpath.c +index 485d7747..21711653 100644 +--- a/xpath.c ++++ b/xpath.c +@@ -136,11 +136,48 @@ + + #if defined(LIBXML_XPATH_ENABLED) || defined(LIBXML_SCHEMAS_ENABLED) + +-/************************************************************************ +- * * +- * Floating point stuff * +- * * +- ************************************************************************/ ++static void ++xmlXPathNameFunction(xmlXPathParserContextPtr ctxt, int nargs); ++ ++static const struct { ++ const char *name; ++ xmlXPathFunction func; ++} xmlXPathStandardFunctions[] = { ++ { "boolean", xmlXPathBooleanFunction }, ++ { "ceiling", xmlXPathCeilingFunction }, ++ { "count", xmlXPathCountFunction }, ++ { "concat", xmlXPathConcatFunction }, ++ { "contains", xmlXPathContainsFunction }, ++ { "id", xmlXPathIdFunction }, ++ { "false", xmlXPathFalseFunction }, ++ { "floor", xmlXPathFloorFunction }, ++ { "last", xmlXPathLastFunction }, ++ { "lang", xmlXPathLangFunction }, ++ { "local-name", xmlXPathLocalNameFunction }, ++ { "not", xmlXPathNotFunction }, ++ { "name", xmlXPathNameFunction }, ++ { "namespace-uri", xmlXPathNamespaceURIFunction }, ++ { "normalize-space", xmlXPathNormalizeFunction }, ++ { "number", xmlXPathNumberFunction }, ++ { "position", xmlXPathPositionFunction }, ++ { "round", xmlXPathRoundFunction }, ++ { "string", xmlXPathStringFunction }, ++ { "string-length", xmlXPathStringLengthFunction }, ++ { "starts-with", xmlXPathStartsWithFunction }, ++ { "substring", xmlXPathSubstringFunction }, ++ { "substring-before", xmlXPathSubstringBeforeFunction }, ++ { "substring-after", xmlXPathSubstringAfterFunction }, ++ { "sum", xmlXPathSumFunction }, ++ { "true", xmlXPathTrueFunction }, ++ { "translate", xmlXPathTranslateFunction } ++}; ++ ++#define NUM_STANDARD_FUNCTIONS \ ++ (sizeof(xmlXPathStandardFunctions) / sizeof(xmlXPathStandardFunctions[0])) ++ ++#define SF_HASH_SIZE 64 ++ ++static unsigned char xmlXPathSFHash[SF_HASH_SIZE]; + + double xmlXPathNAN = 0.0; + double xmlXPathPINF = 0.0; +@@ -156,6 +193,18 @@ xmlXPathInit(void) { + xmlInitParser(); + } + ++ATTRIBUTE_NO_SANITIZE_INTEGER ++static unsigned ++xmlXPathSFComputeHash(const xmlChar *name) { ++ unsigned hashValue = 5381; ++ const xmlChar *ptr; ++ ++ for (ptr = name; *ptr; ptr++) ++ hashValue = hashValue * 33 + *ptr; ++ ++ return(hashValue); ++} ++ + /** + * xmlInitXPathInternal: + * +@@ -164,6 +213,8 @@ xmlXPathInit(void) { + ATTRIBUTE_NO_SANITIZE("float-divide-by-zero") + void + xmlInitXPathInternal(void) { ++ size_t i; ++ + #if defined(NAN) && defined(INFINITY) + xmlXPathNAN = NAN; + xmlXPathPINF = INFINITY; +@@ -175,8 +226,34 @@ xmlInitXPathInternal(void) { + xmlXPathPINF = 1.0 / zero; + xmlXPathNINF = -xmlXPathPINF; + #endif ++ ++ /* ++ * Initialize hash table for standard functions ++ */ ++ ++ for (i = 0; i < SF_HASH_SIZE; i++) ++ xmlXPathSFHash[i] = UCHAR_MAX; ++ ++ for (i = 0; i < NUM_STANDARD_FUNCTIONS; i++) { ++ const char *name = xmlXPathStandardFunctions[i].name; ++ int bucketIndex = xmlXPathSFComputeHash(BAD_CAST name) % SF_HASH_SIZE; ++ ++ while (xmlXPathSFHash[bucketIndex] != UCHAR_MAX) { ++ bucketIndex += 1; ++ if (bucketIndex >= SF_HASH_SIZE) ++ bucketIndex = 0; ++ } ++ ++ xmlXPathSFHash[bucketIndex] = i; ++ } + } + ++/************************************************************************ ++ * * ++ * Floating point stuff * ++ * * ++ ************************************************************************/ ++ + /** + * xmlXPathIsNaN: + * @val: a double value +@@ -3979,18 +4056,6 @@ xmlXPathRegisterFuncLookup (xmlXPathContextPtr ctxt, + */ + xmlXPathFunction + xmlXPathFunctionLookup(xmlXPathContextPtr ctxt, const xmlChar *name) { +- if (ctxt == NULL) +- return (NULL); +- +- if (ctxt->funcLookupFunc != NULL) { +- xmlXPathFunction ret; +- xmlXPathFuncLookupFunc f; +- +- f = ctxt->funcLookupFunc; +- ret = f(ctxt->funcLookupData, name, NULL); +- if (ret != NULL) +- return(ret); +- } + return(xmlXPathFunctionLookupNS(ctxt, name, NULL)); + } + +@@ -4015,6 +4080,22 @@ xmlXPathFunctionLookupNS(xmlXPathContextPtr ctxt, const xmlChar *name, + if (name == NULL) + return(NULL); + ++ if (ns_uri == NULL) { ++ int bucketIndex = xmlXPathSFComputeHash(name) % SF_HASH_SIZE; ++ ++ while (xmlXPathSFHash[bucketIndex] != UCHAR_MAX) { ++ int funcIndex = xmlXPathSFHash[bucketIndex]; ++ ++ if (strcmp(xmlXPathStandardFunctions[funcIndex].name, ++ (char *) name) == 0) ++ return(xmlXPathStandardFunctions[funcIndex].func); ++ ++ bucketIndex += 1; ++ if (bucketIndex >= SF_HASH_SIZE) ++ bucketIndex = 0; ++ } ++ } ++ + if (ctxt->funcLookupFunc != NULL) { + xmlXPathFuncLookupFunc f; + +@@ -13494,61 +13575,6 @@ xmlXPathEscapeUriFunction(xmlXPathParserContextPtr ctxt, int nargs) { + void + xmlXPathRegisterAllFunctions(xmlXPathContextPtr ctxt) + { +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"boolean", +- xmlXPathBooleanFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"ceiling", +- xmlXPathCeilingFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"count", +- xmlXPathCountFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"concat", +- xmlXPathConcatFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"contains", +- xmlXPathContainsFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"id", +- xmlXPathIdFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"false", +- xmlXPathFalseFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"floor", +- xmlXPathFloorFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"last", +- xmlXPathLastFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"lang", +- xmlXPathLangFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"local-name", +- xmlXPathLocalNameFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"not", +- xmlXPathNotFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"name", +- xmlXPathNameFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"namespace-uri", +- xmlXPathNamespaceURIFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"normalize-space", +- xmlXPathNormalizeFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"number", +- xmlXPathNumberFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"position", +- xmlXPathPositionFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"round", +- xmlXPathRoundFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"string", +- xmlXPathStringFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"string-length", +- xmlXPathStringLengthFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"starts-with", +- xmlXPathStartsWithFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"substring", +- xmlXPathSubstringFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"substring-before", +- xmlXPathSubstringBeforeFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"substring-after", +- xmlXPathSubstringAfterFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"sum", +- xmlXPathSumFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"true", +- xmlXPathTrueFunction); +- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"translate", +- xmlXPathTranslateFunction); +- + xmlXPathRegisterFuncNS(ctxt, (const xmlChar *)"escape-uri", + (const xmlChar *)"http://www.w3.org/2002/08/xquery-functions", + xmlXPathEscapeUriFunction); +-- +2.47.1 +