Skip to content

Commit

Permalink
agent scheduler modified to spread checks over period and fix a crash…
Browse files Browse the repository at this point in the history
… in opentelemetry module (#1775)

process execution is killed on timeout

some reverse_connection were forgotten, reverse_connection => reversed_grpc_streaming
  • Loading branch information
jean-christophe81 authored Nov 5, 2024
1 parent 0c73860 commit a864406
Show file tree
Hide file tree
Showing 29 changed files with 344 additions and 152 deletions.
10 changes: 5 additions & 5 deletions .github/scripts/agent_robot_test.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ $agent_log_path = $current_dir + "\reports\centagent.log"
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name log_file -Value $agent_log_path

#Start agent
Start-Process -FilePath build_windows\agent\Release\centagent.exe -RedirectStandardOutput reports\centagent_stdout.log -RedirectStandardError reports\centagent_stderr.log
$agent_process = Start-Process -PassThru -FilePath build_windows\agent\Release\centagent.exe -ArgumentList "--standalone" -RedirectStandardOutput reports\centagent_stdout.log -RedirectStandardError reports\centagent_stderr.log

Write-Host ($agent_process | Format-Table | Out-String)

Expand All @@ -73,7 +73,7 @@ Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name en
$agent_log_path = $current_dir + "\reports\encrypted_centagent.log"
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name log_file -Value $agent_log_path

Start-Process -FilePath build_windows\agent\Release\centagent.exe -RedirectStandardOutput reports\encrypted_centagent_stdout.log -RedirectStandardError reports\encrypted_centagent_stderr.log
Start-Process -FilePath build_windows\agent\Release\centagent.exe -ArgumentList "--standalone" -RedirectStandardOutput reports\encrypted_centagent_stdout.log -RedirectStandardError reports\encrypted_centagent_stderr.log


Start-Sleep -Seconds 1
Expand All @@ -82,11 +82,11 @@ Start-Sleep -Seconds 1
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name ca_certificate -Value ""
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name encryption -Value 0
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name endpoint -Value 0.0.0.0:4320
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name reverse_connection -Value 1
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name reversed_grpc_streaming -Value 1
$agent_log_path = $current_dir + "\reports\reverse_centagent.log"
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name log_file -Value $agent_log_path

Start-Process -FilePath build_windows\agent\Release\centagent.exe -RedirectStandardOutput reports\reversed_centagent_stdout.log -RedirectStandardError reports\reversed_centagent_stderr.log
Start-Process -FilePath build_windows\agent\Release\centagent.exe -ArgumentList "--standalone" -RedirectStandardOutput reports\reversed_centagent_stdout.log -RedirectStandardError reports\reversed_centagent_stderr.log

Start-Sleep -Seconds 1

Expand All @@ -98,7 +98,7 @@ Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name en
$agent_log_path = $current_dir + "\reports\encrypted_reverse_centagent.log"
Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name log_file -Value $agent_log_path

Start-Process -FilePath build_windows\agent\Release\centagent.exe -RedirectStandardOutput reports\encrypted_reversed_centagent_stdout.log -RedirectStandardError reports\encrypted_reversed_centagent_stderr.log
Start-Process -FilePath build_windows\agent\Release\centagent.exe -ArgumentList "--standalone" -RedirectStandardOutput reports\encrypted_reversed_centagent_stdout.log -RedirectStandardError reports\encrypted_reversed_centagent_stderr.log

wsl cd $wsl_path `&`& .github/scripts/wsl-collect-test-robot.sh broker-engine/cma.robot $my_host_name $my_ip $pwsh_path ${current_dir}.replace('\','/')

Expand Down
23 changes: 16 additions & 7 deletions .github/scripts/windows-agent-compile.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ Write-Host $env:VCPKG_BINARY_SOURCES

$current_dir = $pwd.ToString()

#install recent version of 7zip needed by some packages
Write-Host "install 7zip"

#download 7zip
Invoke-WebRequest -Uri "https://www.7-zip.org/a/7z2408-x64.msi" -OutFile "7z2408-x64.msi"
#install 7zip
Start-Process 'msiexec.exe' -ArgumentList '/I "7z2408-x64.msi" /qn' -Wait

#get cache from s3
$files_to_hash = "vcpkg.json", "custom-triplets\x64-windows.cmake", "CMakeLists.txt", "CMakeListsWindows.txt"
$files_content = Get-Content -Path $files_to_hash -Raw
Expand All @@ -32,7 +40,7 @@ $writer = [System.IO.StreamWriter]::new($stringAsStream)
$writer.write($files_content -join " ")
$writer.Flush()
$stringAsStream.Position = 0
$vcpkg_release = "2024.09.30"
$vcpkg_release = "2024.10.21"
$vcpkg_hash = Get-FileHash -InputStream $stringAsStream -Algorithm SHA256 | Select-Object Hash
$file_name = "windows-agent-vcpkg-dependencies-cache-" + $vcpkg_hash.Hash + "-" + $vcpkg_release
$file_name_extension = "${file_name}.7z"
Expand All @@ -58,18 +66,19 @@ if ( $? -ne $true ) {
Write-Host "compile vcpkg dependencies"
vcpkg install --vcpkg-root $env:VCPKG_ROOT --x-install-root build_windows\vcpkg_installed --x-manifest-root . --overlay-triplets custom-triplets --triplet x64-windows

Write-Host "Compress binary archive"
7z a $file_name_extension build_windows\vcpkg_installed
Write-Host "Upload binary archive"
aws s3 cp $file_name_extension s3://centreon-collect-robot-report/$file_name_extension
Write-Host "create CMake files"
if ( $? -eq $true ) {
Write-Host "Compress binary archive"
7z a $file_name_extension build_windows\vcpkg_installed
Write-Host "Upload binary archive"
aws s3 cp $file_name_extension s3://centreon-collect-robot-report/$file_name_extension
}
}
else {
7z x $file_name_extension
Write-Host "Create cmake files from binary-cache downloaded without use vcpkg"
}


Write-Host "create CMake files"

cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=On -DWINDOWS=On -DBUILD_FROM_CACHE=On -S. -DVCPKG_CRT_LINKAGE=dynamic -DBUILD_SHARED_LIBS=OFF -Bbuild_windows

Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/windows-agent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,9 @@ jobs:
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: packages-centreon-monitoring-agent-windows
path: agent\installer\centreon-monitoring-agent.exe
path: |
agent\installer\centreon-monitoring-agent.exe
build_windows\agent\Release\centagent.exe
- name: Deliver
if: |
Expand Down
9 changes: 5 additions & 4 deletions agent/doc/agent-doc.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@ We don't care about the duration of tests, we work with time points.
In the previous example, the second check for the first service will be scheduled at 12:00:10 even if all other checks has not been yet started.

In case of check duration is too long, we might exceed maximum of concurrent checks. In that case checks will be executed as soon one will be ended.
This means that the second check may start later than the scheduled time point (12:00:10) if the other first checks are too long. The order of checks is always respected even in case of a bottleneck.
For example, a check lambda has a start_expected to 12:00, because of bottleneck, it starts at 12:15. Next start_expected of check lambda will then be 12:15 + check_period.

This means that the second check may start later than the scheduled time point (12:00:10) if the first checks take too long.

When a check completes, it is inserted into _waiting_check_queue, and its start will be scheduled as soon as a slot in the queue is available (the queue is a set indexed by expected_start) minus old_start plus check_period.


## native checks
Expand All @@ -39,8 +41,6 @@ That method need 4 arguments:
* perfdata: a list of com::centreon::common::perfdata objects
* outputs: equivalent of plugins output as "CPU 54% OK"

BEWARE, in some cases, we can have recursion, check::on_completion can call start_check

A little example:
```c++
class dummy_check : public check {
Expand Down Expand Up @@ -74,6 +74,7 @@ class dummy_check : public check {
: check(g_io_context,
spdlog::default_logger(),
std::chrono::system_clock::now(),
std::chrono::seconds(1),
serv,
command_name,
command_line,
Expand Down
54 changes: 41 additions & 13 deletions agent/inc/com/centreon/agent/check.hh
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,21 @@ enum e_status : unsigned { ok = 0, warning = 1, critical = 2, unknown = 3 };
*
*/
class time_step {
const time_point _start_point;
const duration _step;
time_point _start_point;
duration _step;
uint64_t _step_index = 0;

public:
time_step(duration step)
: _start_point(std::chrono::system_clock::now()), _step(step) {}
/**
* @brief Construct a new time step object
*
* @param start_point this time_point is the first time_point of the sequence
* @param step value() will return start_point + step * step_index
*/
time_step(time_point start_point, duration step)
: _start_point(start_point), _step(step) {}

time_step() : _start_point(), _step() {}

/**
* @brief increment time of one duration (one step)
Expand All @@ -61,15 +69,27 @@ class time_step {
}

/**
* @brief set _step_index to the first step after now
* @brief set _step_index to the first step after or equal to now
*
*/
void increment_to_after_now() {
time_point now = std::chrono::system_clock::now();
_step_index = (now - _start_point) / _step + 1;
_step_index =
(now - _start_point + _step - std::chrono::microseconds(1)) / _step;
}

time_point value() { return _start_point + _step_index * _step; }
/**
* @brief set _step_index to the first step after or equal to min_tp
*
*/
void increment_to_after_min(time_point min_tp) {
_step_index =
(min_tp - _start_point + _step - std::chrono::microseconds(1)) / _step;
}

time_point value() const { return _start_point + _step_index * _step; }

uint64_t get_step_index() const { return _step_index; }
};

/**
Expand All @@ -88,8 +108,9 @@ class check : public std::enable_shared_from_this<check> {

private:
//_start_expected is set on construction on config receive
// it's updated on check_start and added of check_period on check completion
time_point _start_expected;
// it's updated on check_start and added of multiple of check_interval
// (check_period / nb_check) on check completion
time_step _start_expected;
const std::string& _service;
const std::string& _command_name;
const std::string& _command_line;
Expand Down Expand Up @@ -123,12 +144,15 @@ class check : public std::enable_shared_from_this<check> {

bool _start_check(const duration& timeout);

virtual void _on_timeout(){};

public:
using pointer = std::shared_ptr<check>;

check(const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point exp,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& command_name,
const std::string& cmd_line,
Expand All @@ -140,13 +164,17 @@ class check : public std::enable_shared_from_this<check> {
struct pointer_start_compare {
bool operator()(const check::pointer& left,
const check::pointer& right) const {
return left->_start_expected < right->_start_expected;
return left->_start_expected.value() < right->_start_expected.value();
}
};

void add_duration_to_start_expected(const duration& to_add);
void increment_start_expected_to_after_min_timepoint(time_point min_tp) {
_start_expected.increment_to_after_min(min_tp);
}

void add_check_interval_to_start_expected() { ++_start_expected; }

time_point get_start_expected() const { return _start_expected; }
time_point get_start_expected() const { return _start_expected.value(); }

const std::string& get_service() const { return _service; }

Expand Down
11 changes: 7 additions & 4 deletions agent/inc/com/centreon/agent/check_exec.hh
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,15 @@ class check_exec : public check {
protected:
using check::completion_handler;

void _timeout_timer_handler(const boost::system::error_code& err,
unsigned start_check_index) override;
void _on_timeout() override;

void _init();

public:
check_exec(const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point exp,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& cmd_name,
const std::string& cmd_line,
Expand All @@ -102,7 +102,8 @@ class check_exec : public check {
static std::shared_ptr<check_exec> load(
const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point exp,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& cmd_name,
const std::string& cmd_line,
Expand All @@ -111,6 +112,8 @@ class check_exec : public check {

void start_check(const duration& timeout) override;

int get_pid() const;

void on_completion(unsigned running_index);
};

Expand Down
11 changes: 8 additions & 3 deletions agent/inc/com/centreon/agent/scheduler.hh
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,18 @@ class scheduler : public std::enable_shared_from_this<scheduler> {
const std::shared_ptr<asio::io_context>&,
const std::shared_ptr<spdlog::logger>& /*logger*/,
time_point /* start expected*/,
duration /* check interval */,
const std::string& /*service*/,
const std::string& /*cmd_name*/,
const std::string& /*cmd_line*/,
const engine_to_agent_request_ptr& /*engine to agent request*/,
check::completion_handler&&)>;

private:
using check_queue = std::set<check::pointer, check::pointer_start_compare>;
using check_queue =
absl::btree_set<check::pointer, check::pointer_start_compare>;

check_queue _check_queue;
check_queue _waiting_check_queue;
// running check counter that must not exceed max_concurrent_check
unsigned _active_check = 0;
bool _alive = true;
Expand All @@ -72,6 +74,8 @@ class scheduler : public std::enable_shared_from_this<scheduler> {
metric_sender _metric_sender;
asio::system_timer _send_timer;
asio::system_timer _check_timer;
time_step
_check_time_step; // time point used when too many checks are running
check_builder _check_builder;
// in order to send check_results at regular intervals, we work with absolute
// time points that we increment
Expand Down Expand Up @@ -154,7 +158,8 @@ class scheduler : public std::enable_shared_from_this<scheduler> {
static std::shared_ptr<check> default_check_builder(
const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point start_expected,
time_point first_start_expected,
duration check_interval,
const std::string& service,
const std::string& cmd_name,
const std::string& cmd_line,
Expand Down
3 changes: 2 additions & 1 deletion agent/native_linux/inc/com/centreon/agent/check_cpu.hh
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,8 @@ class check_cpu : public check {
public:
check_cpu(const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point exp,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& cmd_name,
const std::string& cmd_line,
Expand Down
Loading

0 comments on commit a864406

Please sign in to comment.