Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: use libxml2 patch for faster xpath context init #3389

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ This release drops precompiled native platform gems for `x86-linux` and `x86-min

### Improved

* [CRuby] CSS and XPath queries are faster now that `Node#xpath`, `Node#css`, and related functions are re-using the underlying xpath context object (which is expensive to initialize). We benchmarked a 2.8x improvement for a 6kb file, and a more modest 1.3x improvement for a 70kb file. (#3378) @flavorjones
* [CRuby] CSS and XPath queries are faster now that `Node#xpath`, `Node#css`, and related functions are using a faster XPathContext initialization process. We benchmarked a 1.9x improvement for a 6kb file. Big thanks to @nwellnhof for helping with this one. (#3378, superseded by #3389) @flavorjones


## v1.17.2 / 2024-12-12
Expand Down
17 changes: 0 additions & 17 deletions ext/nokogiri/xml_xpath_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,6 @@ noko_xml_xpath_context_register_ns(VALUE rb_context, VALUE prefix, VALUE uri)

xmlXPathRegisterNs(c_context, (const xmlChar *)StringValueCStr(prefix), ns_uri);

VALUE registered_namespaces = rb_iv_get(rb_context, "@registered_namespaces");
if (NIL_P(uri)) {
rb_hash_delete(registered_namespaces, prefix);
} else {
rb_hash_aset(registered_namespaces, prefix, Qtrue);
}

return rb_context;
}

Expand Down Expand Up @@ -179,13 +172,6 @@ noko_xml_xpath_context_register_variable(VALUE rb_context, VALUE name, VALUE val

xmlXPathRegisterVariable(c_context, (const xmlChar *)StringValueCStr(name), xmlValue);

VALUE registered_variables = rb_iv_get(rb_context, "@registered_variables");
if (NIL_P(value)) {
rb_hash_delete(registered_variables, name);
} else {
rb_hash_aset(registered_variables, name, Qtrue);
}

return rb_context;
}

Expand Down Expand Up @@ -461,9 +447,6 @@ noko_xml_xpath_context_new(VALUE klass, VALUE rb_node)

rb_context = TypedData_Wrap_Struct(klass, &_noko_xml_xpath_context_type, c_context);

rb_iv_set(rb_context, "@registered_namespaces", rb_hash_new());
rb_iv_set(rb_context, "@registered_variables", rb_hash_new());

return rb_context;
}

Expand Down
33 changes: 5 additions & 28 deletions lib/nokogiri/xml/searchable.rb
Original file line number Diff line number Diff line change
Expand Up @@ -261,36 +261,13 @@ def xpath_internal(node, paths, handler, ns, binds)
end

def xpath_impl(node, path, handler, ns, binds)
get_xpath_context(node) do |context|
context.register_namespaces(ns)
context.register_variables(binds)
context = XPathContext.new(node)
context.register_namespaces(ns)
context.register_variables(binds)

path = path.gsub("xmlns:", " :") unless Nokogiri.uses_libxml?
path = path.gsub("xmlns:", " :") unless Nokogiri.uses_libxml?

context.evaluate(path, handler)
end
end

if Nokogiri.uses_libxml? && ENV["NOKOGIRI_DEOPTIMIZE_XPATH"].nil? # env var is an escape hatch
# optimized path
def get_xpath_context(node)
context = Thread.current.thread_variable_get(:nokogiri_xpath_context)
if context
context.node = node
else
context = Thread.current.thread_variable_set(:nokogiri_xpath_context, XPathContext.new(node))
end

begin
yield context
ensure
context.reset
end
end
else
def get_xpath_context(node)
yield XPathContext.new(node)
end
context.evaluate(path, handler)
end
end
end
Expand Down
22 changes: 0 additions & 22 deletions lib/nokogiri/xml/xpath_context.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,6 @@ def register_variables(binds)
register_variable(key, value)
end
end

if Nokogiri.uses_libxml?
def reset
return unless

@registered_namespaces.each do |key, _|
register_ns(key, nil)
end
unless @registered_namespaces.empty?
warn "Nokogiri::XML::XPathContext#reset: unexpected registered namespaces: #{@registered_namespaces.keys}"
@registered_namespaces.clear
end

@registered_variables.each do |key, _|
register_variable(key, nil)
end
unless @registered_variables.empty?
warn "Nokogiri::XML::XPathContext#reset: unexpected registered variables: #{@registered_variables.keys}"
@registered_variables.clear
end
end
end
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
From d3e3526111097560cf7c002613e2cb1d469b59e0 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <[email protected]>
Date: Sat, 21 Dec 2024 16:03:46 +0100
Subject: [PATCH] xpath: Use separate static hash table for standard functions

This avoids registering standard functions when creating an XPath
context.

Lookup of extension functions is a bit slower now, but ultimately, all
function lookups should be moved to the compilation phase.

(cherry picked from commit bf5fcf6e646bb51a0f6a3655a1d64bea97274867)
---
xpath.c | 170 ++++++++++++++++++++++++++++++++------------------------
1 file changed, 98 insertions(+), 72 deletions(-)

diff --git a/xpath.c b/xpath.c
index 485d7747..21711653 100644
--- a/xpath.c
+++ b/xpath.c
@@ -136,11 +136,48 @@

#if defined(LIBXML_XPATH_ENABLED) || defined(LIBXML_SCHEMAS_ENABLED)

-/************************************************************************
- * *
- * Floating point stuff *
- * *
- ************************************************************************/
+static void
+xmlXPathNameFunction(xmlXPathParserContextPtr ctxt, int nargs);
+
+static const struct {
+ const char *name;
+ xmlXPathFunction func;
+} xmlXPathStandardFunctions[] = {
+ { "boolean", xmlXPathBooleanFunction },
+ { "ceiling", xmlXPathCeilingFunction },
+ { "count", xmlXPathCountFunction },
+ { "concat", xmlXPathConcatFunction },
+ { "contains", xmlXPathContainsFunction },
+ { "id", xmlXPathIdFunction },
+ { "false", xmlXPathFalseFunction },
+ { "floor", xmlXPathFloorFunction },
+ { "last", xmlXPathLastFunction },
+ { "lang", xmlXPathLangFunction },
+ { "local-name", xmlXPathLocalNameFunction },
+ { "not", xmlXPathNotFunction },
+ { "name", xmlXPathNameFunction },
+ { "namespace-uri", xmlXPathNamespaceURIFunction },
+ { "normalize-space", xmlXPathNormalizeFunction },
+ { "number", xmlXPathNumberFunction },
+ { "position", xmlXPathPositionFunction },
+ { "round", xmlXPathRoundFunction },
+ { "string", xmlXPathStringFunction },
+ { "string-length", xmlXPathStringLengthFunction },
+ { "starts-with", xmlXPathStartsWithFunction },
+ { "substring", xmlXPathSubstringFunction },
+ { "substring-before", xmlXPathSubstringBeforeFunction },
+ { "substring-after", xmlXPathSubstringAfterFunction },
+ { "sum", xmlXPathSumFunction },
+ { "true", xmlXPathTrueFunction },
+ { "translate", xmlXPathTranslateFunction }
+};
+
+#define NUM_STANDARD_FUNCTIONS \
+ (sizeof(xmlXPathStandardFunctions) / sizeof(xmlXPathStandardFunctions[0]))
+
+#define SF_HASH_SIZE 64
+
+static unsigned char xmlXPathSFHash[SF_HASH_SIZE];

double xmlXPathNAN = 0.0;
double xmlXPathPINF = 0.0;
@@ -156,6 +193,18 @@ xmlXPathInit(void) {
xmlInitParser();
}

+ATTRIBUTE_NO_SANITIZE_INTEGER
+static unsigned
+xmlXPathSFComputeHash(const xmlChar *name) {
+ unsigned hashValue = 5381;
+ const xmlChar *ptr;
+
+ for (ptr = name; *ptr; ptr++)
+ hashValue = hashValue * 33 + *ptr;
+
+ return(hashValue);
+}
+
/**
* xmlInitXPathInternal:
*
@@ -164,6 +213,8 @@ xmlXPathInit(void) {
ATTRIBUTE_NO_SANITIZE("float-divide-by-zero")
void
xmlInitXPathInternal(void) {
+ size_t i;
+
#if defined(NAN) && defined(INFINITY)
xmlXPathNAN = NAN;
xmlXPathPINF = INFINITY;
@@ -175,8 +226,34 @@ xmlInitXPathInternal(void) {
xmlXPathPINF = 1.0 / zero;
xmlXPathNINF = -xmlXPathPINF;
#endif
+
+ /*
+ * Initialize hash table for standard functions
+ */
+
+ for (i = 0; i < SF_HASH_SIZE; i++)
+ xmlXPathSFHash[i] = UCHAR_MAX;
+
+ for (i = 0; i < NUM_STANDARD_FUNCTIONS; i++) {
+ const char *name = xmlXPathStandardFunctions[i].name;
+ int bucketIndex = xmlXPathSFComputeHash(BAD_CAST name) % SF_HASH_SIZE;
+
+ while (xmlXPathSFHash[bucketIndex] != UCHAR_MAX) {
+ bucketIndex += 1;
+ if (bucketIndex >= SF_HASH_SIZE)
+ bucketIndex = 0;
+ }
+
+ xmlXPathSFHash[bucketIndex] = i;
+ }
}

+/************************************************************************
+ * *
+ * Floating point stuff *
+ * *
+ ************************************************************************/
+
/**
* xmlXPathIsNaN:
* @val: a double value
@@ -3979,18 +4056,6 @@ xmlXPathRegisterFuncLookup (xmlXPathContextPtr ctxt,
*/
xmlXPathFunction
xmlXPathFunctionLookup(xmlXPathContextPtr ctxt, const xmlChar *name) {
- if (ctxt == NULL)
- return (NULL);
-
- if (ctxt->funcLookupFunc != NULL) {
- xmlXPathFunction ret;
- xmlXPathFuncLookupFunc f;
-
- f = ctxt->funcLookupFunc;
- ret = f(ctxt->funcLookupData, name, NULL);
- if (ret != NULL)
- return(ret);
- }
return(xmlXPathFunctionLookupNS(ctxt, name, NULL));
}

@@ -4015,6 +4080,22 @@ xmlXPathFunctionLookupNS(xmlXPathContextPtr ctxt, const xmlChar *name,
if (name == NULL)
return(NULL);

+ if (ns_uri == NULL) {
+ int bucketIndex = xmlXPathSFComputeHash(name) % SF_HASH_SIZE;
+
+ while (xmlXPathSFHash[bucketIndex] != UCHAR_MAX) {
+ int funcIndex = xmlXPathSFHash[bucketIndex];
+
+ if (strcmp(xmlXPathStandardFunctions[funcIndex].name,
+ (char *) name) == 0)
+ return(xmlXPathStandardFunctions[funcIndex].func);
+
+ bucketIndex += 1;
+ if (bucketIndex >= SF_HASH_SIZE)
+ bucketIndex = 0;
+ }
+ }
+
if (ctxt->funcLookupFunc != NULL) {
xmlXPathFuncLookupFunc f;

@@ -13494,61 +13575,6 @@ xmlXPathEscapeUriFunction(xmlXPathParserContextPtr ctxt, int nargs) {
void
xmlXPathRegisterAllFunctions(xmlXPathContextPtr ctxt)
{
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"boolean",
- xmlXPathBooleanFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"ceiling",
- xmlXPathCeilingFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"count",
- xmlXPathCountFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"concat",
- xmlXPathConcatFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"contains",
- xmlXPathContainsFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"id",
- xmlXPathIdFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"false",
- xmlXPathFalseFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"floor",
- xmlXPathFloorFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"last",
- xmlXPathLastFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"lang",
- xmlXPathLangFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"local-name",
- xmlXPathLocalNameFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"not",
- xmlXPathNotFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"name",
- xmlXPathNameFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"namespace-uri",
- xmlXPathNamespaceURIFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"normalize-space",
- xmlXPathNormalizeFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"number",
- xmlXPathNumberFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"position",
- xmlXPathPositionFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"round",
- xmlXPathRoundFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"string",
- xmlXPathStringFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"string-length",
- xmlXPathStringLengthFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"starts-with",
- xmlXPathStartsWithFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"substring",
- xmlXPathSubstringFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"substring-before",
- xmlXPathSubstringBeforeFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"substring-after",
- xmlXPathSubstringAfterFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"sum",
- xmlXPathSumFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"true",
- xmlXPathTrueFunction);
- xmlXPathRegisterFunc(ctxt, (const xmlChar *)"translate",
- xmlXPathTranslateFunction);
-
xmlXPathRegisterFuncNS(ctxt, (const xmlChar *)"escape-uri",
(const xmlChar *)"http://www.w3.org/2002/08/xquery-functions",
xmlXPathEscapeUriFunction);
--
2.47.1

Loading
Loading