From 0041af4b27688a8be868f0e666e8277803566fe2 Mon Sep 17 00:00:00 2001
From: jvilarru
Date: Tue, 3 Sep 2024 17:18:06 +0200
Subject: [PATCH] Add sackd file override to restart on-failure (#204)

* Add sackd file override to restart on-failure

* Update ansible/roles/slurm/tasks/service.yml

Co-authored-by: Tom Downes

* Update ansible/roles/slurm/tasks/service.yml

Co-authored-by: Tom Downes

---------

Co-authored-by: Tom Downes
---
Notes (ignored by git am):
  - Adds a systemd drop-in for sackd that sets Restart=on-failure with
    RestartSec=15s. The drop-in directory is created by a separate task
    placed before the template task, and the template task notifies the
    same "Reload SystemD configuration" handler as the slurmd override.
  - NOTE(review): the line structure of this patch was restored from a
    whitespace-collapsed copy. Leading indentation of the YAML context and
    added lines, and the blank context line after "gid: 982", were
    reconstructed from the hunk line counts and the usual Ansible task
    layout. Confirm against the repository before applying.

 ansible/roles/slurm/defaults/main.yml                |  1 +
 ansible/roles/slurm/tasks/service.yml                | 12 ++++++++++++
 .../roles/slurm/templates/systemd/sackd_overrides.j2 |  3 +++
 3 files changed, 16 insertions(+)
 create mode 100644 ansible/roles/slurm/templates/systemd/sackd_overrides.j2

diff --git a/ansible/roles/slurm/defaults/main.yml b/ansible/roles/slurm/defaults/main.yml
index 92229255..f1e3474b 100644
--- a/ansible/roles/slurm/defaults/main.yml
+++ b/ansible/roles/slurm/defaults/main.yml
@@ -40,5 +40,6 @@ slurmrestd_user:
   gid: 982
 
 slurmd_override_path: /etc/systemd/system/slurmd.service.d/overrides.conf
+sackd_override_path: /etc/systemd/system/sackd.service.d/override.conf
 slurmcmd_timeout: 30
 handle_services: true
diff --git a/ansible/roles/slurm/tasks/service.yml b/ansible/roles/slurm/tasks/service.yml
index 1f143392..7d2e43b0 100644
--- a/ansible/roles/slurm/tasks/service.yml
+++ b/ansible/roles/slurm/tasks/service.yml
@@ -59,3 +59,15 @@
     dest: '{{ slurmd_override_path }}'
     mode: 0o644
   notify: Reload SystemD configuration
+
+- name: Create sackd override directory
+  file:
+    path: '{{ sackd_override_path | dirname }}'
+    state: directory
+
+- name: Sackd SystemD overrides
+  template:
+    src: systemd/sackd_overrides.j2
+    dest: '{{ sackd_override_path }}'
+    mode: 0o644
+  notify: Reload SystemD configuration
diff --git a/ansible/roles/slurm/templates/systemd/sackd_overrides.j2 b/ansible/roles/slurm/templates/systemd/sackd_overrides.j2
new file mode 100644
index 00000000..f9117dc9
--- /dev/null
+++ b/ansible/roles/slurm/templates/systemd/sackd_overrides.j2
@@ -0,0 +1,3 @@
+[Service]
+RestartSec=15s
+Restart=on-failure