From f649c0b355639c04a8d282e2de8eedfde2c89271 Mon Sep 17 00:00:00 2001 From: Sechkem Date: Wed, 5 Mar 2025 15:15:45 +0000 Subject: [PATCH] Mon 158969 cma improve checks syntax errors handling (#2174) * enh(agent): improve checks syntax errors handling /add check_dummy class for predefined error message handling * fix(tests): improve error messages for invalid check commands and arguments * fix(agent): update error handling to use e_status REFS: 158969 --- agent/inc/com/centreon/agent/check_exec.hh | 55 +++++++++++++ agent/src/check_exec.cc | 89 +++++++++++++++++++++- agent/src/scheduler.cc | 79 ++++++++++--------- tests/broker-engine/cma.robot | 55 +++++++++++++ tests/resources/Common.py | 53 +++++++++++++ 5 files changed, 293 insertions(+), 38 deletions(-) diff --git a/agent/inc/com/centreon/agent/check_exec.hh b/agent/inc/com/centreon/agent/check_exec.hh index 37b932c1d6f..ea1f0220e85 100644 --- a/agent/inc/com/centreon/agent/check_exec.hh +++ b/agent/inc/com/centreon/agent/check_exec.hh @@ -119,6 +119,61 @@ class check_exec : public check { void on_completion(unsigned running_index); }; +/** + * @class check_dummy + * @brief A class derived from check that is used to send an error message. + * + * This class represents a dummy check that sends a predefined error message. + * It inherits from the check class and overrides the start_check method. + * + * @param io_context Shared pointer to the ASIO io_context. + * @param logger Shared pointer to the spdlog logger. + * @param first_start_expected The expected start time of the first check. + * @param check_interval The interval between checks. + * @param serv The service name. + * @param cmd_name The command name. + * @param cmd_line The command line. + * @param output The predefined error message to be sent. + * @param cnf Configuration for the engine to agent request. + * @param handler Completion handler for the check. + * @param stat Pointer to the checks statistics. + */ +class check_dummy : public check { + std::string _output; + + protected: + using check::completion_handler; + + public: + check_dummy(const std::shared_ptr& io_context, + const std::shared_ptr& logger, + time_point first_start_expected, + duration check_interval, + const std::string& serv, + const std::string& cmd_name, + const std::string& cmd_line, + const std::string& output, + const engine_to_agent_request_ptr& cnf, + check::completion_handler&& handler, + const checks_statistics::pointer& stat); + + static std::shared_ptr load( + const std::shared_ptr& io_context, + const std::shared_ptr& logger, + time_point first_start_expected, + duration check_interval, + const std::string& serv, + const std::string& cmd_name, + const std::string& cmd_line, + const std::string& output, + const engine_to_agent_request_ptr& cnf, + check::completion_handler&& handler, + const checks_statistics::pointer& stat); + + void start_check(const duration& timeout) override; + + std::string get_output() { return _output; } +}; } // namespace com::centreon::agent #endif diff --git a/agent/src/check_exec.cc b/agent/src/check_exec.cc index 27a1250f9aa..982233ef0e4 100644 --- a/agent/src/check_exec.cc +++ b/agent/src/check_exec.cc @@ -199,7 +199,7 @@ void check_exec::start_check(const duration& timeout) { if (!_process) { _io_context->post([me = check::shared_from_this(), start_check_index = _get_running_check_index()]() { - me->on_completion(start_check_index, 3, + me->on_completion(start_check_index, e_status::unknown, std::list(), {"empty command"}); }); @@ -213,7 +213,8 @@ void check_exec::start_check(const duration& timeout) { _io_context->post([me = check::shared_from_this(), start_check_index = _get_running_check_index(), e]() { me->on_completion( - start_check_index, 3, std::list(), + start_check_index, e_status::unknown, + std::list(), {fmt::format("Fail to execute {} : {}", me->get_command_line(), e.code().message())}); }); @@ -222,7 +223,7 @@ void check_exec::start_check(const duration& timeout) { get_service(), get_command_line(), e.what()); _io_context->post([me = check::shared_from_this(), start_check_index = _get_running_check_index(), e]() { - me->on_completion(start_check_index, 3, + me->on_completion(start_check_index, e_status::unknown, std::list(), {fmt::format("Fail to execute {} : {}", me->get_command_line(), e.what())}); @@ -281,3 +282,85 @@ void check_exec::on_completion(unsigned running_index) { check::on_completion(running_index, _process->get_exit_status(), perfs, outputs); } + +/****************************************************************** + * check_dummy + ******************************************************************/ + +check_dummy::check_dummy(const std::shared_ptr& io_context, + const std::shared_ptr& logger, + time_point first_start_expected, + duration check_interval, + const std::string& serv, + const std::string& cmd_name, + const std::string& cmd_line, + const std::string& output, + const engine_to_agent_request_ptr& cnf, + check::completion_handler&& handler, + const checks_statistics::pointer& stat) + : check(io_context, + logger, + first_start_expected, + check_interval, + serv, + cmd_name, + cmd_line, + cnf, + std::move(handler), + stat), + _output(output) {} + +/** + * @brief create and initialize a check_dummy object (don't use constructor) + * + * @tparam handler_type + * @param io_context + * @param logger + * @param first_start_expected start expected + * @param check_interval check interval between two checks (not only this but + * also others) + * @param serv + * @param cmd_name + * @param cmd_line + * @param cnf agent configuration + * @param handler completion handler + * @return std::shared_ptr + */ +std::shared_ptr check_dummy::load( + const std::shared_ptr& io_context, + const std::shared_ptr& logger, + time_point first_start_expected, + duration check_interval, + const std::string& serv, + const std::string& cmd_name, + const std::string& cmd_line, + const std::string& output, + const engine_to_agent_request_ptr& cnf, + check::completion_handler&& handler, + const checks_statistics::pointer& stat) { + std::shared_ptr ret = std::make_shared( + io_context, logger, first_start_expected, check_interval, serv, cmd_name, + cmd_line, output, cnf, std::move(handler), stat); + return ret; +} + +/** + * @brief start a check, completion handler is always called asynchronously even + * in case of failure + * + * @param timeout + */ +void check_dummy::start_check(const duration& timeout) { + if (!check::_start_check(timeout)) { + return; + } + + _io_context->post([me = check::shared_from_this(), + start_check_index = _get_running_check_index(), this]() { + me->on_completion( + start_check_index, e_status::critical, + std::list(), + {fmt::format("unable to execute native check {} , output error : {}", + me->get_command_line(), get_output())}); + }); +} diff --git a/agent/src/scheduler.cc b/agent/src/scheduler.cc index 4ed5dcbcfe8..f8c91c64b44 100644 --- a/agent/src/scheduler.cc +++ b/agent/src/scheduler.cc @@ -577,43 +577,52 @@ std::shared_ptr scheduler::default_check_builder( rapidjson::Document native_check_info = common::rapidjson_helper::read_from_string(cmd_line); common::rapidjson_helper native_params(native_check_info); - std::string_view check_type = native_params.get_string("check"); - const rapidjson::Value* args; - if (native_params.has_member("args")) { - args = &native_params.get_member("args"); - } else { - static const rapidjson::Value no_arg; - args = &no_arg; - } - if (check_type == "cpu_percentage"sv) { - return std::make_shared( - io_context, logger, first_start_expected, check_interval, service, - cmd_name, cmd_line, *args, conf, std::move(handler), stat); - } else if (check_type == "health"sv) { - return std::make_shared( - io_context, logger, first_start_expected, check_interval, service, - cmd_name, cmd_line, *args, conf, std::move(handler), stat); + try { + std::string_view check_type = native_params.get_string("check"); + const rapidjson::Value* args; + if (native_params.has_member("args")) { + args = &native_params.get_member("args"); + } else { + static const rapidjson::Value no_arg; + args = &no_arg; + } + + if (check_type == "cpu_percentage"sv) { + return std::make_shared( + io_context, logger, first_start_expected, check_interval, service, + cmd_name, cmd_line, *args, conf, std::move(handler), stat); + } else if (check_type == "health"sv) { + return std::make_shared( + io_context, logger, first_start_expected, check_interval, service, + cmd_name, cmd_line, *args, conf, std::move(handler), stat); #ifdef _WIN32 - } else if (check_type == "uptime"sv) { - return std::make_shared( - io_context, logger, first_start_expected, check_interval, service, - cmd_name, cmd_line, *args, conf, std::move(handler), stat); - } else if (check_type == "storage"sv) { - return std::make_shared( - io_context, logger, first_start_expected, check_interval, service, - cmd_name, cmd_line, *args, conf, std::move(handler), stat); - } else if (check_type == "memory"sv) { - return std::make_shared( - io_context, logger, first_start_expected, check_interval, service, - cmd_name, cmd_line, *args, conf, std::move(handler), stat); - } else if (check_type == "service"sv) { - return std::make_shared( - io_context, logger, first_start_expected, check_interval, service, - cmd_name, cmd_line, *args, conf, std::move(handler), stat); + } else if (check_type == "uptime"sv) { + return std::make_shared( + io_context, logger, first_start_expected, check_interval, service, + cmd_name, cmd_line, *args, conf, std::move(handler), stat); + } else if (check_type == "storage"sv) { + return std::make_shared( + io_context, logger, first_start_expected, check_interval, service, + cmd_name, cmd_line, *args, conf, std::move(handler), stat); + } else if (check_type == "memory"sv) { + return std::make_shared( + io_context, logger, first_start_expected, check_interval, service, + cmd_name, cmd_line, *args, conf, std::move(handler), stat); + } else if (check_type == "service"sv) { + return std::make_shared( + io_context, logger, first_start_expected, check_interval, service, + cmd_name, cmd_line, *args, conf, std::move(handler), stat); #endif - } else { - throw exceptions::msg_fmt("command {}, unknown native check:{}", cmd_name, - cmd_line); + } else { + throw exceptions::msg_fmt("command {}, unknown native check:{}", + cmd_name, cmd_line); + } + } catch (const std::exception& e) { + SPDLOG_LOGGER_ERROR(logger, "unexpected error: {}", e.what()); + return check_dummy::load(io_context, logger, first_start_expected, + check_interval, service, cmd_name, cmd_line, + std::string(e.what()), conf, std::move(handler), + stat); } } catch (const std::exception&) { return check_exec::load(io_context, logger, first_start_expected, diff --git a/tests/broker-engine/cma.robot b/tests/broker-engine/cma.robot index 84d24a0f2a5..d014ec9224f 100644 --- a/tests/broker-engine/cma.robot +++ b/tests/broker-engine/cma.robot @@ -853,7 +853,62 @@ BEOTEL_CENTREON_AGENT_LINUX_NO_DEFUNCT_PROCESS Should Be True ${nb_agent_process} == 0 "There should be no centagent process" +BEOTEL_INVALID_CHECK_COMMANDS_AND_ARGUMENTS + [Documentation] Given the agent is configured with native checks for services + ... And the OpenTelemetry server module is added + ... And services are configured with incorrect check commands and arguments + ... When the broker, engine, and agent are started + ... Then the resources table should be updated with the correct status + ... And appropriate error messages should be generated for invalid checks + [Tags] broker engine agent opentelemetry MON-158969 + Ctn Config Engine ${1} ${2} ${2} + Ctn Add Otl ServerModule + ... 0 + ... {"otel_server":{"host": "0.0.0.0","port": 4317},"max_length_grpc_log":0,"centreon_agent":{"check_interval":10, "export_period":15}} + Ctn Config Add Otl Connector + ... 0 + ... OTEL connector + ... opentelemetry --processor=centreon_agent --extractor=attributes --host_path=resource_metrics.resource.attributes.host.name --service_path=resource_metrics.resource.attributes.service.name + Ctn Engine Config Replace Value In Services ${0} service_1 check_command cpu_check + Ctn Engine Config Replace Value In Services ${0} service_2 check_command health_check + Ctn Set Services Passive 0 service_[1-2] + + # wrong check command for service_1 + Ctn Engine Config Add Command ${0} cpu_check {"check": "error"} OTEL connector + # wrong args value for service_2 + Ctn Engine Config Add Command ${0} health_check {"check": "health","args":{"warning-interval": "A", "critical-interval": "6"} } OTEL connector + + Ctn Engine Config Set Value 0 log_level_checks trace + Ctn Clear Metrics + + Ctn Config Broker central + Ctn Config Broker module + Ctn Config Broker rrd + Ctn Config Centreon Agent + Ctn Broker Config Log central sql trace + + Ctn Config BBDO3 1 + Ctn Clear Retention + + ${start} Ctn Get Round Current Date + Ctn Start Broker + Ctn Start Engine + Ctn Start Agent + + # Let's wait for the otel server start + Ctn Wait For Otel Server To Be Ready ${start} + + ${result} ${content} Ctn Check Service Resource Status With Timeout Rt host_1 service_1 2 60 ANY + Should Be True ${result} resources table not updated for service_1 + Should Be Equal As Strings ${content} unable to execute native check {"check": "error"} , output error : command cpu_check, unknown native check:{"check": "error"} + ... "Error the output for invalid check command is not correct" + + ${result} ${content} Ctn Check Service Resource Status With Timeout RT host_1 service_2 2 60 ANY + Should Be True ${result} resources table not updated for service_2 + Should Be Equal As Strings ${content} unable to execute native check {"check": "health","args":{"warning-interval": "A", "critical-interval": "6"} } , output error : field warning-interval is not a unsigned int string + ... "Error the output for invalid check args is not correct" + *** Keywords *** Ctn Create Cert And Init [Documentation] create key and certificates used by agent and engine on linux side diff --git a/tests/resources/Common.py b/tests/resources/Common.py index 4c25b8acfd7..7b2e0dff969 100644 --- a/tests/resources/Common.py +++ b/tests/resources/Common.py @@ -632,6 +632,59 @@ def ctn_check_service_resource_status_with_timeout(hostname: str, service_desc: time.sleep(1) return False +def ctn_check_service_resource_status_with_timeout_rt(hostname: str, service_desc: str, status: int, timeout: int, state_type: str = "SOFT"): + """ + brief : same as ctn_check_service_resource_status_with_timeout but with additional return + + Check the status of a service resource within a specified timeout period. + + This function connects to a MySQL database and queries the status of a service resource + associated with a given hostname and service description. It repeatedly checks the status + until the specified timeout period is reached. The function can check for different state types + (SOFT, HARD, or ANY). + + Args: + hostname (str): The name of the host. + service_desc (str): The description of the service. + status (int): The desired status to check for. + timeout (int): The timeout period in seconds. + state_type (str, optional): The type of state to check for. Defaults to "SOFT". + Can be "SOFT", "HARD", or "ANY". + + Returns: + tuple: A tuple containing a boolean indicating if the desired status was found and the output message. + (True, output) if the desired status is found within the timeout period. + (False, "") if the desired status is not found within the timeout period. + """ + limit = time.time() + timeout + while time.time() < limit: + connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + autocommit=True, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + with connection: + with connection.cursor() as cursor: + cursor.execute( + f"SELECT r.status,r.status_confirmed,r.output FROM resources r LEFT JOIN services s ON r.id=s.service_id AND r.parent_id=s.host_id LEFT JOIN hosts h ON s.host_id=h.host_id WHERE h.name='{hostname}' AND s.description='{service_desc}'") + result = cursor.fetchall() + if len(result) > 0: + logger.console(f"result: {result}") + if len(result) > 0 and result[0]['status'] is not None and int(result[0]['status']) == int(status): + logger.console( + f"status={result[0]['status']} and status_confirmed={result[0]['status_confirmed']}") + if state_type == 'ANY': + return True,result[0]['output'] + elif state_type == 'HARD' and int(result[0]['status_confirmed']) == 1: + return True,result[0]['output'] + elif state_type == 'SOFT' and int(result[0]['status_confirmed']) == 0: + return True,result[0]['output'] + time.sleep(1) + return False,"" + def ctn_check_acknowledgement_with_timeout(hostname: str, service_desc: str, entry_time: int, status: int, timeout: int, state_type: str = "SOFT"): limit = time.time() + timeout