From 920ffd54fea0d1c25f1f6bf2694b900535ccbb02 Mon Sep 17 00:00:00 2001
From: Attila Szakacs
Date: Mon, 4 Mar 2024 19:12:42 +0100
Subject: [PATCH] xml: add filterx parser

Signed-off-by: Attila Szakacs
---
 lib/filterx/expr-function.h     |   6 +-
 modules/xml/CMakeLists.txt      |   2 +
 modules/xml/Makefile.am         |   4 +-
 modules/xml/filterx-parse-xml.c | 622 ++++++++++++++++++++++++++++++++
 modules/xml/filterx-parse-xml.h |  33 ++
 modules/xml/xml-plugin.c        |   4 +
 tests/copyright/policy          |   1 +
 7 files changed, 668 insertions(+), 4 deletions(-)
 create mode 100644 modules/xml/filterx-parse-xml.c
 create mode 100644 modules/xml/filterx-parse-xml.h

diff --git a/lib/filterx/expr-function.h b/lib/filterx/expr-function.h
index a1a5c40ef6..71b97323ed 100644
--- a/lib/filterx/expr-function.h
+++ b/lib/filterx/expr-function.h
@@ -146,9 +146,9 @@ FilterXExpr *filterx_generator_function_lookup(GlobalConfig *cfg, const gchar *f
     .construct = filterx_function_ ## func_name ## _construct,  \
   }
 
-#define FILTERX_GENERATOR_FUNCTION_PROTOTYPE(func_name) \
-  gpointer                                              \
-  filterx_function_ ## func_name ## _construct(Plugin *self)
+#define FILTERX_GENERATOR_FUNCTION_PROTOTYPE(func_name)            \
+  gpointer                                                         \
+  filterx_generator_function_ ## func_name ## _construct(Plugin *self)
 
 #define FILTERX_GENERATOR_FUNCTION_DECLARE(func_name) \
   FILTERX_GENERATOR_FUNCTION_PROTOTYPE(func_name);
diff --git a/modules/xml/CMakeLists.txt b/modules/xml/CMakeLists.txt
index a9f048feba..ea4f29b68e 100644
--- a/modules/xml/CMakeLists.txt
+++ b/modules/xml/CMakeLists.txt
@@ -9,11 +9,13 @@ set(xml_SOURCES
     "xml.h"
     "xml-private.h"
     "windows-eventlog-xml-parser.h"
+    "filterx-parse-xml.h"
 
     "xml-plugin.c"
     "xml-parser.c"
     "xml.c"
     "windows-eventlog-xml-parser.c"
+    "filterx-parse-xml.c"
 )
 
 
diff --git a/modules/xml/Makefile.am b/modules/xml/Makefile.am
index 565b6fbd1c..c4af2ef716 100644
--- a/modules/xml/Makefile.am
+++ b/modules/xml/Makefile.am
@@ -8,7 +8,9 @@ modules_xml_libxml_la_SOURCES = \
   modules/xml/xml.c \
   modules/xml/xml-private.h \
   modules/xml/windows-eventlog-xml-parser.h \
-  modules/xml/windows-eventlog-xml-parser.c
+  modules/xml/windows-eventlog-xml-parser.c \
+  modules/xml/filterx-parse-xml.h \
+  modules/xml/filterx-parse-xml.c
 
 
 BUILT_SOURCES += \
diff --git a/modules/xml/filterx-parse-xml.c b/modules/xml/filterx-parse-xml.c
new file mode 100644
index 0000000000..12aa959141
--- /dev/null
+++ b/modules/xml/filterx-parse-xml.c
@@ -0,0 +1,622 @@
+/*
+ * Copyright (c) 2024 Axoflow
+ * Copyright (c) 2024 Attila Szakacs
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * As an additional exemption you are allowed to compile & link against the
+ * OpenSSL libraries as published by the OpenSSL project. See the file
+ * COPYING for details.
+ *
+ */
+
+#include "filterx-parse-xml.h"
+#include "filterx/object-extractor.h"
+#include "filterx/object-string.h"
+#include "filterx/object-null.h"
+#include "filterx/object-primitive.h"
+#include "filterx/object-list-interface.h"
+#include "filterx/object-dict-interface.h"
+#include "filterx/filterx-eval.h"
+#include "scratch-buffers.h"
+
+#define FILTERX_FUNC_PARSE_XML_USAGE "Usage: parse_xml(raw_xml)"
+
+/* Error domain of the GErrors raised by the GMarkup callbacks in this file. */
+static GQuark
+_error_quark(void)
+{
+  return g_quark_from_static_string("filterx-parse-xml");
+}
+
+enum FilterXParseXmlErrorCode
+{
+  FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+};
+
+typedef struct FilterXGeneratorFunctionParseXml_
+{
+  FilterXGeneratorFunction super;
+  FilterXExpr *xml_expr;
+} FilterXGeneratorFunctionParseXml;
+
+/*
+ * Per-element parser state, kept on a stack while the XML is being walked.
+ *
+ * current_obj: the object of the element itself (a string for leaves, a dict otherwise)
+ * parent_obj: the dict or list current_obj is stored in, NULL for the root context
+ *
+ * Both fields hold their own reference.
+ */
+typedef struct XmlElemContext_
+{
+  FilterXObject *current_obj;
+  FilterXObject *parent_obj;
+} XmlElemContext;
+
+/* Replaces current_obj with a new reference to @current_obj (which may be NULL). */
+static void
+_elem_context_set_current_obj(XmlElemContext *self, FilterXObject *current_obj)
+{
+  /* Take the new reference first, so that re-setting the same object is safe. */
+  FilterXObject *old_obj = self->current_obj;
+  self->current_obj = filterx_object_ref(current_obj);
+  filterx_object_unref(old_obj);
+}
+
+/* Replaces parent_obj with a new reference to @parent_obj (which may be NULL). */
+static void
+_elem_context_set_parent_obj(XmlElemContext *self, FilterXObject *parent_obj)
+{
+  FilterXObject *old_obj = self->parent_obj;
+  self->parent_obj = filterx_object_ref(parent_obj);
+  filterx_object_unref(old_obj);
+}
+
+/* Drops both object references and frees the context. NULL is accepted. */
+static void
+_elem_context_free(XmlElemContext *self)
+{
+  if (!self)
+    return;
+
+  _elem_context_set_current_obj(self, NULL);
+  _elem_context_set_parent_obj(self, NULL);
+  g_free(self);
+}
+
+static XmlElemContext *
+_elem_context_new(void)
+{
+  return g_new0(XmlElemContext, 1);
+}
+
+/*
+ * Creates a new list with filterx_object_create_list(parent_obj) and appends *obj to it.
+ * The caller owns the returned reference.
+ */
+static FilterXObject *
+_create_list_and_move(FilterXObject *parent_obj, FilterXObject **obj)
+{
+  FilterXObject *list = filterx_object_create_list(parent_obj);
+  g_assert(list);
+
+  /* The call must not be inside g_assert(), that is compiled out with G_DISABLE_ASSERT. */
+  if (!filterx_list_append(list, obj))
+    g_assert_not_reached();
+
+  return list;
+}
+
+/*
+ * Creates the object of a newly opened element and stores it under elem_name
+ * in the dict of the enclosing element (last_elem_context->current_obj).
+ *
+ * Repeated sibling elements with the same name are collected into a list.
+ *
+ * Returns the context of the new element, or NULL with error set on failure.
+ */
+static XmlElemContext *
+_create_element_object(const gchar *elem_name, XmlElemContext *last_elem_context, gboolean has_attrs, GError **error)
+{
+  g_assert(filterx_object_is_type(last_elem_context->current_obj, &FILTERX_TYPE_NAME(dict)));
+
+  FilterXObject *existing_element_obj = NULL;
+  FilterXObject *list_obj = NULL;
+  gboolean success = FALSE;
+
+  XmlElemContext *new_elem_context = _elem_context_new();
+  _elem_context_set_parent_obj(new_elem_context, last_elem_context->current_obj);
+
+  FilterXObject *new_elem_obj_key = filterx_string_new(elem_name, -1);
+  FilterXObject *new_elem_obj;
+  const gchar *new_elem_obj_key_repr;
+
+  /*
+   * If the new element has attributes, we create a dict for them, and we will either set "#text" if this is a leaf
+   * or we will create inner dicts otherwise.
+   *
+   * If the new element does not have attributes, we store an empty string, because that is what we want to see
+   * for leaves. But this might not be a leaf, so there is a logic that converts this empty string to a dict in
+   * _ensure_elem_is_dict().
+   */
+  if (has_attrs)
+    {
+      new_elem_obj = filterx_object_create_dict(last_elem_context->current_obj);
+      new_elem_obj_key_repr = "{}";
+    }
+  else
+    {
+      new_elem_obj = filterx_string_new("", 0);
+      new_elem_obj_key_repr = "\"\"";
+    }
+
+  /* The context takes its own reference, so the one returned by the constructor is dropped. */
+  _elem_context_set_current_obj(new_elem_context, new_elem_obj);
+  filterx_object_unref(new_elem_obj);
+
+  if (!filterx_object_is_key_set(last_elem_context->current_obj, new_elem_obj_key))
+    {
+      /*
+       * This is the first element, we store it as a string, and later convert to a list if more elements come,
+       * or to a dict if it is not a leaf, but a node.
+       */
+      if (!filterx_object_set_subscript(new_elem_context->parent_obj, new_elem_obj_key,
+                                        &new_elem_context->current_obj))
+        {
+          g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                      "Failed to insert empty element: %s=%s", elem_name, new_elem_obj_key_repr);
+          goto exit;
+        }
+
+      success = TRUE;
+      goto exit;
+    }
+
+  existing_element_obj = filterx_object_get_subscript(last_elem_context->current_obj, new_elem_obj_key);
+
+  if (filterx_object_is_type(existing_element_obj, &FILTERX_TYPE_NAME(string)) ||
+      filterx_object_is_type(existing_element_obj, &FILTERX_TYPE_NAME(dict)))
+    {
+      /*
+       * There is already a single element here.
+       * Let's replace the existing element with a list and add the existing element and the new one.
+       */
+      list_obj = _create_list_and_move(new_elem_context->parent_obj, &existing_element_obj);
+      if (!filterx_list_append(list_obj, &new_elem_context->current_obj))
+        {
+          g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                      "Failed to append to list: %s", new_elem_obj_key_repr);
+          goto exit;
+        }
+
+      if (!filterx_object_set_subscript(new_elem_context->parent_obj, new_elem_obj_key, &list_obj))
+        {
+          g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                      "Failed to insert list: %s=[..., %s]", elem_name, new_elem_obj_key_repr);
+          goto exit;
+        }
+
+      _elem_context_set_parent_obj(new_elem_context, list_obj);
+      success = TRUE;
+      goto exit;
+    }
+
+  if (filterx_object_is_type(existing_element_obj, &FILTERX_TYPE_NAME(list)))
+    {
+      /*
+       * The inner object is already a list with values.
+       * Let's append the new element.
+       */
+      _elem_context_set_parent_obj(new_elem_context, existing_element_obj);
+      if (!filterx_list_append(existing_element_obj, &new_elem_context->current_obj))
+        {
+          g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                      "Failed to append to list: %s", new_elem_obj_key_repr);
+          goto exit;
+        }
+
+      success = TRUE;
+      goto exit;
+    }
+
+  msg_debug("FilterX: parse_xml(): Unexpected node type, removing",
+            evt_tag_str("type", existing_element_obj->type->name));
+
+  if (!filterx_object_unset_key(last_elem_context->current_obj, new_elem_obj_key))
+    {
+      g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                  "Failed to remove unexpected node: %s", elem_name);
+      goto exit;
+    }
+
+  /* The offending value is gone, retry from scratch with a fresh context. */
+  _elem_context_free(new_elem_context);
+  new_elem_context = _create_element_object(elem_name, last_elem_context, has_attrs, error);
+  success = (new_elem_context != NULL);
+
+exit:
+  filterx_object_unref(new_elem_obj_key);
+  filterx_object_unref(existing_element_obj);
+  filterx_object_unref(list_obj);
+  if (!success)
+    {
+      _elem_context_free(new_elem_context);
+      new_elem_context = NULL;
+    }
+  return new_elem_context;
+}
+
+/* Stores every attribute of the element as an "@name" key in the element's dict. */
+static void
+_collect_element_attributes(const gchar *element_name, XmlElemContext *elem_context,
+                            const gchar **attribute_names, const gchar **attribute_values,
+                            GError **error)
+{
+  g_assert(filterx_object_is_type(elem_context->current_obj, &FILTERX_TYPE_NAME(dict)));
+
+  /* One scratch buffer is reused for all the keys instead of allocating one per attribute. */
+  GString *attr_key = scratch_buffers_alloc();
+
+  for (gint i = 0; attribute_names[i]; i++)
+    {
+      const gchar *attr_value = attribute_values[i];
+
+      g_string_assign(attr_key, "@");
+      g_string_append(attr_key, attribute_names[i]);
+
+      FilterXObject *key = filterx_string_new(attr_key->str, attr_key->len);
+      FilterXObject *value = filterx_string_new(attr_value, -1);
+
+      gboolean success = filterx_object_set_subscript(elem_context->current_obj, key, &value);
+
+      filterx_object_unref(key);
+      filterx_object_unref(value);
+
+      if (!success)
+        {
+          g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                      "Failed to insert element attribute to dict: %s=%s", attr_key->str, attr_value);
+          return;
+        }
+    }
+}
+
+/*
+ * Makes sure that the object of the enclosing element is a dict, so that child elements can be stored in it.
+ *
+ * Elements without attributes start out as empty strings (see _create_element_object()), these are replaced
+ * with an empty dict in their parent container.
+ *
+ * Returns FALSE with error set on failure.
+ */
+static gboolean
+_ensure_elem_is_dict(GMarkupParseContext *context, XmlElemContext *elem_context, GError **error)
+{
+  if (filterx_object_is_type(elem_context->current_obj, &FILTERX_TYPE_NAME(dict)))
+    return TRUE;
+
+  /* The head of the element stack is the element being opened, the next one is the enclosing element. */
+  const gchar *parent_element_name = (const gchar *) g_markup_parse_context_get_element_stack(context)->next->data;
+  FilterXObject *key = filterx_string_new(parent_element_name, -1);
+  gboolean success = FALSE;
+
+  if (!filterx_object_is_type(elem_context->current_obj, &FILTERX_TYPE_NAME(string)))
+    {
+      msg_debug("FilterX: parse_xml(): unexpected node type, overwriting",
+                evt_tag_str("type", elem_context->current_obj->type->name));
+    }
+  else
+    {
+      gsize len = 0;
+      const gchar *existing_text = filterx_string_get_value(elem_context->current_obj, &len); // todo extract
+      if (len != 0)
+        {
+          /*
+           * There is a non-empty string here already.
+           * This must have been here before starting to parse the XML.
+           * We cannot do better than to just overwrite it.
+           */
+          msg_debug("FilterX: parse_xml(): unexpected text, overwriting", evt_tag_str("text", existing_text));
+        }
+    }
+
+  /*
+   * We turned out to be a non-leaf node instead of a leaf node.
+   * Let's transform the empty string to an empty dict.
+   */
+
+  FilterXObject *dict_obj = filterx_object_create_dict(elem_context->parent_obj);
+  g_assert(dict_obj);
+
+  /* The context takes its own reference, so the one returned by the constructor is dropped. */
+  _elem_context_set_current_obj(elem_context, dict_obj);
+  filterx_object_unref(dict_obj);
+
+  if (filterx_object_is_type(elem_context->parent_obj, &FILTERX_TYPE_NAME(dict)))
+    {
+      if (!filterx_object_set_subscript(elem_context->parent_obj, key, &elem_context->current_obj))
+        {
+          g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                      "Failed to insert empty dict: %s={}", parent_element_name);
+          _elem_context_set_current_obj(elem_context, NULL);
+          goto exit;
+        }
+
+      success = TRUE;
+      goto exit;
+    }
+
+  if (filterx_object_is_type(elem_context->parent_obj, &FILTERX_TYPE_NAME(list)))
+    {
+      if (!filterx_list_set_subscript(elem_context->parent_obj, -1, &elem_context->current_obj))
+        {
+          g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                      "Failed to insert empty dict: {}");
+          _elem_context_set_current_obj(elem_context, NULL);
+          goto exit;
+        }
+
+      success = TRUE;
+      goto exit;
+    }
+
+  g_assert_not_reached();
+
+exit:
+  filterx_object_unref(key);
+  return success;
+}
+
+/* GMarkup start_element callback: creates the object of the element and pushes its context on the stack. */
+static void
+_start_element(GMarkupParseContext *context, const gchar *element_name,
+               const gchar **attribute_names, const gchar **attribute_values,
+               gpointer user_data, GError **error)
+{
+  GQueue *obj_stack = (GQueue *) user_data;
+  XmlElemContext *last_elem_context = g_queue_peek_head(obj_stack);
+
+  if (!_ensure_elem_is_dict(context, last_elem_context, error))
+    return;
+
+  gboolean has_attrs = !!attribute_names[0];
+  XmlElemContext *new_elem_context = _create_element_object(element_name, last_elem_context, has_attrs, error);
+  if (!new_elem_context)
+    return;
+
+  g_queue_push_head(obj_stack, new_elem_context);
+
+  if (has_attrs)
+    _collect_element_attributes(element_name, new_elem_context, attribute_names, attribute_values, error);
+}
+
+/* GMarkup end_element callback: the element is complete, its context is dropped from the stack. */
+static void
+_end_element(GMarkupParseContext *context, const gchar *element_name, gpointer user_data, GError **error)
+{
+  GQueue *obj_stack = (GQueue *) user_data;
+  XmlElemContext *elem_context = g_queue_pop_head(obj_stack);
+  _elem_context_free(elem_context);
+}
+
+/* Returns a new string object with the whitespace-stripped text, or NULL if nothing remains. */
+static FilterXObject *
+_create_text_object(const gchar *text, gsize text_len)
+{
+  gchar *stripped_text = g_strndup(text, text_len);
+  g_strstrip(stripped_text);
+
+  gsize stripped_text_len = strlen(stripped_text);
+  if (!stripped_text_len)
+    {
+      g_free(stripped_text);
+      return NULL;
+    }
+
+  FilterXObject *result = filterx_string_new(stripped_text, stripped_text_len);
+  g_free(stripped_text);
+  return result;
+}
+
+/*
+ * GMarkup text callback: stores the stripped character data of the current element.
+ *
+ * If the element is a plain string, it is overwritten in its parent container,
+ * if it is a dict (it has attributes or children), the text goes under "#text".
+ */
+static void
+_text(GMarkupParseContext *context, const gchar *text, gsize text_len, gpointer user_data, GError **error)
+{
+  GQueue *obj_stack = (GQueue *) user_data;
+  XmlElemContext *elem_context = g_queue_peek_head(obj_stack);
+  const gchar *element_name = g_markup_parse_context_get_element(context);
+
+  FilterXObject *text_obj = _create_text_object(text, text_len);
+  if (!text_obj)
+    return; // Nothing to do
+
+  /* text is not NUL terminated, hence the explicit precision in the error messages. */
+  if (filterx_object_is_type(elem_context->current_obj, &FILTERX_TYPE_NAME(string)))
+    {
+      if (filterx_object_is_type(elem_context->parent_obj, &FILTERX_TYPE_NAME(dict)))
+        {
+          FilterXObject *key = filterx_string_new(element_name, -1);
+          gboolean result = filterx_object_set_subscript(elem_context->parent_obj, key, &text_obj);
+          filterx_object_unref(key);
+
+          if (!result)
+            g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                        "Failed to add text to dict: %s=%.*s", element_name, (gint) text_len, text);
+        }
+      else if (filterx_object_is_type(elem_context->parent_obj, &FILTERX_TYPE_NAME(list)))
+        {
+          if (!filterx_list_set_subscript(elem_context->parent_obj, -1, &text_obj))
+            g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                        "Failed to add text to list: %.*s", (gint) text_len, text);
+        }
+      else
+        {
+          g_assert_not_reached();
+        }
+    }
+  else if (filterx_object_is_type(elem_context->current_obj, &FILTERX_TYPE_NAME(dict)))
+    {
+      FilterXObject *key = filterx_string_new("#text", 5);
+      gboolean result = filterx_object_set_subscript(elem_context->current_obj, key, &text_obj);
+      filterx_object_unref(key);
+
+      if (!result)
+        g_set_error(error, _error_quark(), FILTERX_PARSE_XML_ERROR_CODE_PARSE_ERROR,
+                    "Failed to add text to dict: #text=%.*s", (gint) text_len, text);
+    }
+
+  /* Single release point, also covering the case when current_obj is neither a string nor a dict. */
+  filterx_object_unref(text_obj);
+}
+
+/*
+ * Evaluates the XML expression and parses the resulting string into fillable, which must be a dict.
+ *
+ * Returns FALSE after pushing error info with filterx_eval_push_error_info() if anything fails.
+ */
+static gboolean
+_generate(FilterXExprGenerator *s, FilterXObject *fillable)
+{
+  FilterXGeneratorFunctionParseXml *self = (FilterXGeneratorFunctionParseXml *) s;
+
+  if (!filterx_object_is_type(fillable, &FILTERX_TYPE_NAME(dict)))
+    {
+      filterx_eval_push_error_info("fillable must be dict", &s->super,
+                                   g_strdup_printf("got %s instead", fillable->type->name), TRUE);
+      return FALSE;
+    }
+
+  FilterXObject *xml_obj = filterx_expr_eval(self->xml_expr);
+  if (!xml_obj)
+    return FALSE;
+
+  gsize xml_str_len;
+  const gchar *xml_str;
+  if (!filterx_object_extract_string(xml_obj, &xml_str, &xml_str_len))
+    {
+      filterx_eval_push_error_info("input must be string", &s->super,
+                                   g_strdup_printf("got %s instead", xml_obj->type->name), TRUE);
+      filterx_object_unref(xml_obj);
+      return FALSE;
+    }
+
+  gboolean success = FALSE;
+
+  GMarkupParser scanner_callbacks =
+  {
+    .start_element = _start_element,
+    .end_element = _end_element,
+    .text = _text
+  };
+
+  GQueue *obj_stack = g_queue_new();
+  XmlElemContext *root_elem_context = _elem_context_new();
+  _elem_context_set_current_obj(root_elem_context, fillable);
+  g_queue_push_head(obj_stack, root_elem_context);
+
+  GMarkupParseContext *context = g_markup_parse_context_new(&scanner_callbacks, 0, obj_stack, NULL);
+
+  GError *error = NULL;
+  if (!g_markup_parse_context_parse(context, xml_str, xml_str_len, &error) ||
+      !g_markup_parse_context_end_parse(context, &error))
+    {
+      /* error is freed at the end of this function, so the pushed info must not point into it. */
+      filterx_eval_push_error_info("failed to parse xml", &s->super, g_strdup(error->message), TRUE);
+      goto exit;
+    }
+
+  success = TRUE;
+
+exit:
+  filterx_object_unref(xml_obj);
+  /* Frees the root context and the contexts of the elements that were left open by a failed parse. */
+  g_queue_free_full(obj_stack, (GDestroyNotify) _elem_context_free);
+  g_markup_parse_context_free(context);
+  if (error)
+    g_error_free(error);
+  return success;
+}
+
+/* Creates the dict to be filled, passing the evaluated fillable_parent to filterx_object_create_dict(). */
+static FilterXObject *
+_create_container(FilterXExprGenerator *s, FilterXExpr *fillable_parent)
+{
+  FilterXObject *fillable_parent_obj = filterx_expr_eval_typed(fillable_parent);
+  if (!fillable_parent_obj)
+    return NULL;
+
+  FilterXObject *result = filterx_object_create_dict(fillable_parent_obj);
+  filterx_object_unref(fillable_parent_obj);
+  return result;
+}
+
+/* Validates the argument count and stores the expression of the single raw_xml argument. */
+static gboolean
+_extract_args(FilterXGeneratorFunctionParseXml *self, FilterXFunctionArgs *args, GError **error)
+{
+  if (filterx_function_args_len(args) != 1)
+    {
+      g_set_error(error, FILTERX_FUNCTION_ERROR, FILTERX_FUNCTION_ERROR_CTOR_FAIL,
+                  "invalid number of arguments. " FILTERX_FUNC_PARSE_XML_USAGE);
+      return FALSE;
+    }
+
+  self->xml_expr = filterx_function_args_get_expr(args, 0);
+  return TRUE;
+}
+
+static void
+_free(FilterXExpr *s)
+{
+  FilterXGeneratorFunctionParseXml *self = (FilterXGeneratorFunctionParseXml *) s;
+
+  filterx_expr_unref(self->xml_expr);
+  filterx_generator_function_free_method(&self->super);
+}
+
+/*
+ * Constructor of the parse_xml(raw_xml) generator function.
+ *
+ * args is freed on every path. Returns NULL if the arguments are invalid.
+ */
+FilterXExpr *
+filterx_generator_function_parse_xml_new(const gchar *func_name, FilterXFunctionArgs *args, GError **error)
+{
+  FilterXGeneratorFunctionParseXml *self = g_new0(FilterXGeneratorFunctionParseXml, 1);
+
+  filterx_generator_function_init_instance(&self->super, func_name);
+  self->super.super.generate = _generate;
+  self->super.super.create_container = _create_container;
+  self->super.super.super.free_fn = _free;
+
+  if (!_extract_args(self, args, error) ||
+      !filterx_function_args_check(args, error))
+    goto error;
+
+  filterx_function_args_free(args);
+  return &self->super.super.super;
+
+error:
+  filterx_function_args_free(args);
+  filterx_expr_unref(&self->super.super.super);
+  return NULL;
+}
+
+FILTERX_GENERATOR_FUNCTION(parse_xml, filterx_generator_function_parse_xml_new);
diff --git a/modules/xml/filterx-parse-xml.h b/modules/xml/filterx-parse-xml.h
new file mode 100644
index 0000000000..8a9574b0fa
--- /dev/null
+++ b/modules/xml/filterx-parse-xml.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024 Attila Szakacs
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * As an additional exemption you are allowed to compile & link against the
+ * OpenSSL libraries as published by the OpenSSL project. See the file
+ * COPYING for details.
+ *
+ */
+
+#ifndef FILTERX_PARSE_XML_H_INCLUDED
+#define FILTERX_PARSE_XML_H_INCLUDED
+
+#include "filterx/expr-function.h"
+
+FILTERX_GENERATOR_FUNCTION_DECLARE(parse_xml);
+
+FilterXExpr *filterx_generator_function_parse_xml_new(const gchar *func_name, FilterXFunctionArgs *args,
+                                                      GError **error);
+
+#endif
diff --git a/modules/xml/xml-plugin.c b/modules/xml/xml-plugin.c
index 6d38bcf1ac..f60ab1cdfd 100644
--- a/modules/xml/xml-plugin.c
+++ b/modules/xml/xml-plugin.c
@@ -20,6 +20,9 @@
  *
  */
 
+#include "filterx-parse-xml.h"
+#include "filterx/expr-function.h"
+
 #include "cfg-parser.h"
 #include "plugin.h"
 #include "plugin-types.h"
@@ -38,6 +41,7 @@ static Plugin xml_plugins[] =
     .name = "windows-eventlog-xml-parser",
     .parser = &xml_parser,
   },
+  FILTERX_GENERATOR_FUNCTION_PLUGIN(parse_xml),
 };
 
 gboolean
diff --git a/tests/copyright/policy b/tests/copyright/policy
index 13babd6f75..1829275a4c 100644
--- a/tests/copyright/policy
+++ b/tests/copyright/policy
@@ -273,6 +273,7 @@ tests/light/src/syslog_ng_config/statements/__init__\.py
 modules/correlation/id-counter\.[ch]$
 modules/correlation/group-lines.h
 modules/xml/windows-eventlog-xml-parser\.h
+modules/xml/filterx-parse-xml\.[ch]$
 modules/xml/tests/test_windows_eventlog_xml_parser\.c
 modules/examples/filterx/example-filterx-func/example-filterx-func-plugin\.[ch]
 modules/grpc/otel/filterx