Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mon 161867 backport cma improve checks syntax errors handling (#2174) #2197

Open
wants to merge 1 commit into
base: dev-24.10.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions agent/inc/com/centreon/agent/check_exec.hh
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,61 @@ class check_exec : public check {
void on_completion(unsigned running_index);
};

/**
* @class check_dummy
* @brief A class derived from check that is used to send an error message.
*
* This class represents a dummy check that sends a predefined error message.
* It inherits from the check class and overrides the start_check method.
*
* @param io_context Shared pointer to the ASIO io_context.
* @param logger Shared pointer to the spdlog logger.
* @param first_start_expected The expected start time of the first check.
* @param check_interval The interval between checks.
* @param serv The service name.
* @param cmd_name The command name.
* @param cmd_line The command line.
* @param output The predefined error message to be sent.
* @param cnf Configuration for the engine to agent request.
* @param handler Completion handler for the check.
* @param stat Pointer to the checks statistics.
*/
class check_dummy : public check {
std::string _output;

protected:
using check::completion_handler;

public:
check_dummy(const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& cmd_name,
const std::string& cmd_line,
const std::string& output,
const engine_to_agent_request_ptr& cnf,
check::completion_handler&& handler,
const checks_statistics::pointer& stat);

static std::shared_ptr<check_dummy> load(
const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& cmd_name,
const std::string& cmd_line,
const std::string& output,
const engine_to_agent_request_ptr& cnf,
check::completion_handler&& handler,
const checks_statistics::pointer& stat);

void start_check(const duration& timeout) override;

std::string get_output() { return _output; }
};
} // namespace com::centreon::agent

#endif
89 changes: 86 additions & 3 deletions agent/src/check_exec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ void check_exec::start_check(const duration& timeout) {
if (!_process) {
_io_context->post([me = check::shared_from_this(),
start_check_index = _get_running_check_index()]() {
me->on_completion(start_check_index, 3,
me->on_completion(start_check_index, e_status::unknown,
std::list<com::centreon::common::perfdata>(),
{"empty command"});
});
Expand All @@ -213,7 +213,8 @@ void check_exec::start_check(const duration& timeout) {
_io_context->post([me = check::shared_from_this(),
start_check_index = _get_running_check_index(), e]() {
me->on_completion(
start_check_index, 3, std::list<com::centreon::common::perfdata>(),
start_check_index, e_status::unknown,
std::list<com::centreon::common::perfdata>(),
{fmt::format("Fail to execute {} : {}", me->get_command_line(),
e.code().message())});
});
Expand All @@ -222,7 +223,7 @@ void check_exec::start_check(const duration& timeout) {
get_service(), get_command_line(), e.what());
_io_context->post([me = check::shared_from_this(),
start_check_index = _get_running_check_index(), e]() {
me->on_completion(start_check_index, 3,
me->on_completion(start_check_index, e_status::unknown,
std::list<com::centreon::common::perfdata>(),
{fmt::format("Fail to execute {} : {}",
me->get_command_line(), e.what())});
Expand Down Expand Up @@ -281,3 +282,85 @@ void check_exec::on_completion(unsigned running_index) {
check::on_completion(running_index, _process->get_exit_status(), perfs,
outputs);
}

/******************************************************************
* check_dummy
******************************************************************/

check_dummy::check_dummy(const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& cmd_name,
const std::string& cmd_line,
const std::string& output,
const engine_to_agent_request_ptr& cnf,
check::completion_handler&& handler,
const checks_statistics::pointer& stat)
: check(io_context,
logger,
first_start_expected,
check_interval,
serv,
cmd_name,
cmd_line,
cnf,
std::move(handler),
stat),
_output(output) {}

/**
* @brief create and initialize a check_dummy object (don't use constructor)
*
* @tparam handler_type
* @param io_context
* @param logger
* @param first_start_expected start expected
* @param check_interval check interval between two checks (not only this but
* also others)
* @param serv
* @param cmd_name
* @param cmd_line
* @param cnf agent configuration
* @param handler completion handler
* @return std::shared_ptr<check_dummy>
*/
std::shared_ptr<check_dummy> check_dummy::load(
const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& cmd_name,
const std::string& cmd_line,
const std::string& output,
const engine_to_agent_request_ptr& cnf,
check::completion_handler&& handler,
const checks_statistics::pointer& stat) {
std::shared_ptr<check_dummy> ret = std::make_shared<check_dummy>(
io_context, logger, first_start_expected, check_interval, serv, cmd_name,
cmd_line, output, cnf, std::move(handler), stat);
return ret;
}

/**
* @brief start a check, completion handler is always called asynchronously even
* in case of failure
*
* @param timeout
*/
void check_dummy::start_check(const duration& timeout) {
if (!check::_start_check(timeout)) {
return;
}

_io_context->post([me = check::shared_from_this(),
start_check_index = _get_running_check_index(), this]() {
me->on_completion(
start_check_index, e_status::critical,
std::list<com::centreon::common::perfdata>(),
{fmt::format("unable to execute native check {} , output error : {}",
me->get_command_line(), get_output())});
});
}
79 changes: 44 additions & 35 deletions agent/src/scheduler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -577,43 +577,52 @@ std::shared_ptr<check> scheduler::default_check_builder(
rapidjson::Document native_check_info =
common::rapidjson_helper::read_from_string(cmd_line);
common::rapidjson_helper native_params(native_check_info);
std::string_view check_type = native_params.get_string("check");
const rapidjson::Value* args;
if (native_params.has_member("args")) {
args = &native_params.get_member("args");
} else {
static const rapidjson::Value no_arg;
args = &no_arg;
}
if (check_type == "cpu_percentage"sv) {
return std::make_shared<check_cpu>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "health"sv) {
return std::make_shared<check_health>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
try {
std::string_view check_type = native_params.get_string("check");
const rapidjson::Value* args;
if (native_params.has_member("args")) {
args = &native_params.get_member("args");
} else {
static const rapidjson::Value no_arg;
args = &no_arg;
}

if (check_type == "cpu_percentage"sv) {
return std::make_shared<check_cpu>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "health"sv) {
return std::make_shared<check_health>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
#ifdef _WIN32
} else if (check_type == "uptime"sv) {
return std::make_shared<check_uptime>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "storage"sv) {
return std::make_shared<check_drive_size>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "memory"sv) {
return std::make_shared<check_memory>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "service"sv) {
return std::make_shared<check_service>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "uptime"sv) {
return std::make_shared<check_uptime>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "storage"sv) {
return std::make_shared<check_drive_size>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "memory"sv) {
return std::make_shared<check_memory>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
} else if (check_type == "service"sv) {
return std::make_shared<check_service>(
io_context, logger, first_start_expected, check_interval, service,
cmd_name, cmd_line, *args, conf, std::move(handler), stat);
#endif
} else {
throw exceptions::msg_fmt("command {}, unknown native check:{}", cmd_name,
cmd_line);
} else {
throw exceptions::msg_fmt("command {}, unknown native check:{}",
cmd_name, cmd_line);
}
} catch (const std::exception& e) {
SPDLOG_LOGGER_ERROR(logger, "unexpected error: {}", e.what());
return check_dummy::load(io_context, logger, first_start_expected,
check_interval, service, cmd_name, cmd_line,
std::string(e.what()), conf, std::move(handler),
stat);
}
} catch (const std::exception&) {
return check_exec::load(io_context, logger, first_start_expected,
Expand Down
55 changes: 55 additions & 0 deletions tests/broker-engine/cma.robot
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,62 @@ BEOTEL_CENTREON_AGENT_LINUX_NO_DEFUNCT_PROCESS

Should Be True ${nb_agent_process} == 0 "There should be no centagent process"

BEOTEL_INVALID_CHECK_COMMANDS_AND_ARGUMENTS
[Documentation] Given the agent is configured with native checks for services
... And the OpenTelemetry server module is added
... And services are configured with incorrect check commands and arguments
... When the broker, engine, and agent are started
... Then the resources table should be updated with the correct status
... And appropriate error messages should be generated for invalid checks
[Tags] broker engine agent opentelemetry MON-158969
Ctn Config Engine ${1} ${2} ${2}
Ctn Add Otl ServerModule
... 0
... {"otel_server":{"host": "0.0.0.0","port": 4317},"max_length_grpc_log":0,"centreon_agent":{"check_interval":10, "export_period":15}}
Ctn Config Add Otl Connector
... 0
... OTEL connector
... opentelemetry --processor=centreon_agent --extractor=attributes --host_path=resource_metrics.resource.attributes.host.name --service_path=resource_metrics.resource.attributes.service.name
Ctn Engine Config Replace Value In Services ${0} service_1 check_command cpu_check
Ctn Engine Config Replace Value In Services ${0} service_2 check_command health_check
Ctn Set Services Passive 0 service_[1-2]

# wrong check command for service_1
Ctn Engine Config Add Command ${0} cpu_check {"check": "error"} OTEL connector
# wrong args value for service_2
Ctn Engine Config Add Command ${0} health_check {"check": "health","args":{"warning-interval": "A", "critical-interval": "6"} } OTEL connector

Ctn Engine Config Set Value 0 log_level_checks trace

Ctn Clear Metrics

Ctn Config Broker central
Ctn Config Broker module
Ctn Config Broker rrd
Ctn Config Centreon Agent
Ctn Broker Config Log central sql trace

Ctn Config BBDO3 1
Ctn Clear Retention

${start} Ctn Get Round Current Date
Ctn Start Broker
Ctn Start Engine
Ctn Start Agent

# Let's wait for the otel server start
Ctn Wait For Otel Server To Be Ready ${start}

${result} ${content} Ctn Check Service Resource Status With Timeout Rt host_1 service_1 2 60 ANY
Should Be True ${result} resources table not updated for service_1
Should Be Equal As Strings ${content} unable to execute native check {"check": "error"} , output error : command cpu_check, unknown native check:{"check": "error"}
... "Error the output for invalid check command is not correct"

${result} ${content} Ctn Check Service Resource Status With Timeout RT host_1 service_2 2 60 ANY
Should Be True ${result} resources table not updated for service_2
Should Be Equal As Strings ${content} unable to execute native check {"check": "health","args":{"warning-interval": "A", "critical-interval": "6"} } , output error : field warning-interval is not a unsigned int string
... "Error the output for invalid check args is not correct"

*** Keywords ***
Ctn Create Cert And Init
[Documentation] create key and certificates used by agent and engine on linux side
Expand Down
53 changes: 53 additions & 0 deletions tests/resources/Common.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,59 @@ def ctn_check_service_resource_status_with_timeout(hostname: str, service_desc:
time.sleep(1)
return False

def ctn_check_service_resource_status_with_timeout_rt(hostname: str, service_desc: str, status: int, timeout: int, state_type: str = "SOFT"):
"""
brief : same as ctn_check_service_resource_status_with_timeout but with additional return

Check the status of a service resource within a specified timeout period.

This function connects to a MySQL database and queries the status of a service resource
associated with a given hostname and service description. It repeatedly checks the status
until the specified timeout period is reached. The function can check for different state types
(SOFT, HARD, or ANY).

Args:
hostname (str): The name of the host.
service_desc (str): The description of the service.
status (int): The desired status to check for.
timeout (int): The timeout period in seconds.
state_type (str, optional): The type of state to check for. Defaults to "SOFT".
Can be "SOFT", "HARD", or "ANY".

Returns:
tuple: A tuple containing a boolean indicating if the desired status was found and the output message.
(True, output) if the desired status is found within the timeout period.
(False, "") if the desired status is not found within the timeout period.
"""
limit = time.time() + timeout
while time.time() < limit:
connection = pymysql.connect(host=DB_HOST,
user=DB_USER,
password=DB_PASS,
autocommit=True,
database=DB_NAME_STORAGE,
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)

with connection:
with connection.cursor() as cursor:
cursor.execute(
f"SELECT r.status,r.status_confirmed,r.output FROM resources r LEFT JOIN services s ON r.id=s.service_id AND r.parent_id=s.host_id LEFT JOIN hosts h ON s.host_id=h.host_id WHERE h.name='{hostname}' AND s.description='{service_desc}'")
result = cursor.fetchall()
if len(result) > 0:
logger.console(f"result: {result}")
if len(result) > 0 and result[0]['status'] is not None and int(result[0]['status']) == int(status):
logger.console(
f"status={result[0]['status']} and status_confirmed={result[0]['status_confirmed']}")
if state_type == 'ANY':
return True,result[0]['output']
elif state_type == 'HARD' and int(result[0]['status_confirmed']) == 1:
return True,result[0]['output']
elif state_type == 'SOFT' and int(result[0]['status_confirmed']) == 0:
return True,result[0]['output']
time.sleep(1)
return False,""


def ctn_check_acknowledgement_with_timeout(hostname: str, service_desc: str, entry_time: int, status: int, timeout: int, state_type: str = "SOFT"):
limit = time.time() + timeout
Expand Down
Loading