Skip to content

Commit 3f12f78

Browse files
committed
+monit 新增monit
1 parent 6179f00 commit 3f12f78

33 files changed

+1661
-2
lines changed

ansible.cfg

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
[defaults]
22
force_valid_group_names = ignore
33
inventory = ./inventory
4+
library = ./modules
5+
action_plugins = ./modules/action_plugins
46
roles_path = ./roles
57
sudo_user = root
68
remote_user = root
+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
groups:
2+
- name: moosefs
3+
rules:
4+
- alert: "文件库降级"
5+
expr: moosefs_all_chunks_matrix_chunkclass_endangered{job="node"}
6+
/ (moosefs_all_chunks_matrix_chunkclass_endangered{job="node"} + moosefs_all_chunks_matrix_chunkclass_stable{job="node"}) > 0.1
7+
for: 1h
8+
labels:
9+
severity: warning
10+
channels: "@linux/feishu;"
11+
level: P1
12+
annotations:
13+
summary: "{{ $value | humanizePercentage }} 危急占比"
14+
15+
- alert: "文件库落盘"
16+
expr: time() - moosefs_master_info_last_successful_store{job="node"} > (3600 * 2)
17+
for: 2m
18+
labels:
19+
severity: warning
20+
channels: "@linux/feishu;"
21+
level: P1
22+
annotations:
23+
summary: "{{ $value | humanizeDuration }} 距离上次"
24+
25+
- alert: "文件库配额"
26+
expr: |
27+
(moosefs_active_quotas_current_size{job="node"} / moosefs_active_quotas_soft_size{job="node"}) <= 1 > 0.9
28+
OR
29+
(moosefs_active_quotas_current_size{job="node"} / moosefs_active_quotas_hard_size{job="node"}) <= 1 > 0.9
30+
for: 1m
31+
labels:
32+
severity: info
33+
channels: "@linux/feishu;"
34+
level: P2
35+
annotations:
36+
summary: "{{ $labels.path }}: {{ $value | humanizePercentage }}({{ $labels.type }})"
37+
38+
- alert: "文件库配额"
39+
expr: |
40+
(moosefs_active_quotas_soft_size{job="node"} - moosefs_active_quotas_current_size{job="node"}) >= 0 < (2*1024*1024*1024)
41+
OR
42+
(moosefs_active_quotas_hard_size{job="node"} - moosefs_active_quotas_current_size{job="node"}) >= 0 < (2*1024*1024*1024)
43+
for: 1m
44+
labels:
45+
severity: warning
46+
channels: "@linux/feishu;"
47+
level: P1
48+
annotations:
49+
summary: "{{ $labels.path }}: {{ $value | humanize1024 }}B({{ $labels.type }})"
+155
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#!/usr/bin/awk -f
2+
#
3+
# usage:
4+
#
5+
# % /usr/bin/mfscli -ns"#" -SIM -SMU -SIG -SCS -SIC -SSC -SQU | ./mfscli2prom.awk
6+
#
7+
# moosefs >= v3.0.105
8+
#
9+
10+
# Split every record on '#', ':' or '|' so the script accepts either
# separator passed to mfscli via -ns, and the "section name:" prefix ends
# up in its own field.
BEGIN { FS = "[#:|]" }
13+
14+
# Emit chunkserver metrics for one "chunk servers:" record.
#
# Fields are read by position from the current record:
#   $1 section name (becomes part of the metric name), $3 ip, $4 port,
#   $5 id, $6 labels, $7 version, $8 load, $9 maintenance state,
#   $10-$12 regular space (chunks, used, total),
#   $14-$16 space marked for removal (chunks, used, total).
# Field 13 is not exported.
#
# scs_labels / scs_maint are helper globals used only inside this function.
function SCS() {
    k = gensub(/\s/, "_", "g", $1);

    # All series of a chunkserver share the same label set; build it once.
    scs_labels = sprintf("ip=\"%s\",port=\"%s\",id=\"%s\",labels=\"%s\",version=\"%s\"", $3, $4, $5, $6, $7);

    printf "moosefs_%s_load{%s} %s\n", k, scs_labels, $8;

    # Maintenance gauge: 0 when field 9 reports maintenance_off, 1 for any
    # other state. (Previously both branches printed 0, so a server in
    # maintenance was never visible in the metric.)
    scs_maint = ($9 ~ /maintenance_off/) ? 0 : 1;
    printf "moosefs_%s_maintenance{%s} %d\n", k, scs_labels, scs_maint;

    printf "moosefs_%s_regular_hdd_space_chunks{%s} %s\n", k, scs_labels, $10;
    printf "moosefs_%s_regular_hdd_space_used{%s} %s\n", k, scs_labels, $11;
    printf "moosefs_%s_regular_hdd_space_total{%s} %s\n", k, scs_labels, $12;

    printf "moosefs_%s_removal_hdd_space_chunks{%s} %s\n", k, scs_labels, $14;
    printf "moosefs_%s_removal_hdd_space_used{%s} %s\n", k, scs_labels, $15;
    printf "moosefs_%s_removal_hdd_space_total{%s} %s\n", k, scs_labels, $16;
}
33+
34+
# True when x compares equal to its own numeric coercion (x + 0).
function is_number(x) {
    return (x + 0 == x)
}
35+
# Emit one metric for a "master info:" record whose last field is numeric.
# The metric name is derived from the record text without its value field.
function SIG() {
    # Records without a numeric trailing field carry no metric; skip them.
    if (!is_number($NF)) next

    v = $NF;

    # Dropping the last field makes awk rebuild $0 from the remaining
    # fields joined by OFS, i.e. the record text minus its value.
    NF--;

    # Whitespace runs -> "_", parentheses removed, lower-cased.
    k = tolower(gensub(/[()]/, "", "g", gensub(/\s+/, "_", "g", $0)));

    print "moosefs_" k, v
}
48+
49+
# Emit metadata-server metrics for one "metadata servers:" record.
#
# Fields are read by position: $3 ip, $4 version, $7 metadata version
# (spaces stripped), $9 RAM used, $11-$13 CPU all/sys/user (text from the
# '%' sign onwards is dropped), $14 last metadata save, $15 last save
# duration, $16 save status text, $NF is exported as the cksum label.
function SIM() {
    k = gensub(/\s/, "_", "g", $1);

    printf "moosefs_%s_metadata_version{ip=\"%s\", version=\"%s\"} %d\n", k, $3, $4, gensub(/ /, "", "g", $7)
    printf "moosefs_%s_ram_used{ip=\"%s\", version=\"%s\"} %d\n", k, $3, $4, $9

    printf "moosefs_%s_cpu_used_all{ip=\"%s\", version=\"%s\"} %f\n", k, $3, $4, gensub(/(.*)%.*/, "\\1", "g", $11)
    printf "moosefs_%s_cpu_used_sys{ip=\"%s\", version=\"%s\"} %f\n", k, $3, $4, gensub(/(.*)%.*/, "\\1", "g", $12)
    printf "moosefs_%s_cpu_used_user{ip=\"%s\", version=\"%s\"} %f\n", k, $3, $4, gensub(/(.*)%.*/, "\\1", "g", $13)

    printf "moosefs_%s_last_meta_save{ip=\"%s\", version=\"%s\", cksum=\"%s\"} %d\n", k, $3, $4, $NF, $14
    printf "moosefs_%s_last_save_duration{ip=\"%s\", version=\"%s\", cksum=\"%s\"} %f\n", k, $3, $4, $NF, $15

    # 1 when the status text says the save ran in the background, else 0.
    printf "moosefs_%s_last_saved_in_background{ip=\"%s\", version=\"%s\", cksum=\"%s\"} %d\n", k, $3, $4, $NF, (($16 ~ /[Ss]aved in background/) ? 1 : 0)
}
67+
68+
# Handler for "memory usage detailed info:" records. It currently emits
# nothing; the disabled draft below is kept for reference.
function SMU() {
    # k=gensub(/\s/, "_", "g", $1);
    #
    # printf "moosefs_%s_%s_used %s\n", k, gensub(/\s/, "_", "g", $3), $4
    # printf "moosefs_%s_%s_allocated %s\n", k, gensub(/\s/, "_", "g", $3), $5
}
74+
75+
# Emit one metric for a "chunkclass" record: the name is built from
# fields 1 and 3 (whitespace -> "_"), the value is field 5.
function SIC() {
    # Joining with "_" first and then replacing whitespace gives the same
    # name as normalising both fields separately.
    k = gensub(/\s/, "_", "g", $1 "_" $3);

    printf "moosefs_%s %s\n", k, $5
}
80+
81+
# Emit storage-class metrics for one "storage classes:" record.
#
# Fields are read by position: $3 id, $4 name, $6 mode, $7-$14 counters,
# then three (can, value, labels) triples for create ($15-$17),
# keep ($18-$20) and archive ($21-$23); whitespace is stripped from the
# labels text.
#
# ssc_* names are helper globals used only inside this function.
function SSC() {
    k = gensub(/\s/, "_", "g", $1);

    # id/name prefix shared by every series of this storage class.
    ssc_id = sprintf("id=\"%s\",name=\"%s\"", $3, $4);

    printf "moosefs_%s_create{%s,mode=\"%s\",can=\"%s\",labels=\"%s\"} %s\n", k, ssc_id, $6, $15, gensub(/\s/, "", "g", $17), $16
    printf "moosefs_%s_keep{%s,mode=\"%s\",can=\"%s\",labels=\"%s\"} %s\n", k, ssc_id, $6, $18, gensub(/\s/, "", "g", $20), $19
    printf "moosefs_%s_archive{%s,mode=\"%s\",can=\"%s\",labels=\"%s\"} %s\n", k, ssc_id, $6, $21, gensub(/\s/, "", "g", $23), $22

    # Counter names in field order: the i-th name is read from field 6 + i.
    ssc_n = split("files dirs standard_under standard_exact standard_over archived_under archived_exact archived_over", ssc_counter, " ");
    for (ssc_i = 1; ssc_i <= ssc_n; ssc_i++)
        printf "moosefs_%s_%s_total{%s} %d\n", k, ssc_counter[ssc_i], ssc_id, $(6 + ssc_i)
}
99+
100+
# Emit quota metrics for one "active quotas:" record.
#
# $3 is the quota path (whitespace stripped, used as the path label).
# Fields 7..18 hold, in order, the soft, hard and current values of
# inodes, length, size and real size; whitespace is stripped before each
# value is printed as an integer.
#
# squ_* names are helper globals used only inside this function.
function SQU() {
    k = gensub(/\s/, "_", "g", $1);   # keyword
    P = gensub(/\s/, "", "g", $3);    # path

    # Metric suffixes in field order: the i-th suffix is read from field 6 + i.
    squ_n = split("soft_inodes soft_length soft_size soft_real hard_inodes hard_length hard_size hard_real current_inodes current_length current_size current_real", squ_suffix, " ");

    for (squ_i = 1; squ_i <= squ_n; squ_i++)
        printf "moosefs_%s_%s{path=\"%s\"} %d\n", k, squ_suffix[squ_i], P, gensub(/\s/, "", "g", $(6 + squ_i))
}
131+
132+
# Dispatch each record to the handler for its section. Rules are evaluated
# in order against the current record, so a handler that rewrites $0 or
# calls `next` affects the rules that follow it.

# Blank records carry no data.
/^\s*$/                        { next }

/^active quotas:/              { SQU() }
/^metadata servers:/           { SIM() }
/^memory usage detailed info:/ { SMU() }
/^chunk servers:/              { SCS() }
/^master info:/                { SIG() }

# These two section markers are matched anywhere in the record.
/chunkclass /                  { SIC() }
/storage classes:/             { SSC() }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
#
# Report per-subdirectory usage details for selected trees under /mnt/mfs
# as Prometheus text-format metrics.
#
# For every first-level subdirectory of each tree, `mfsdirinfo` is run and
# its output line is converted by the embedded awk program (read from the
# here-document via `-f-`).
#

awk -f- <(
    cd /mnt/mfs/ || exit 1

    # Walk each tree independently: a missing or unreadable tree must not
    # suppress the metrics of the trees that follow it (the former
    # `find ... && find ... && find ...` chain stopped at the first failure).
    for tree in prometheus jenkins backuppc; do
        find "$tree" -mindepth 1 -maxdepth 1 -type d \
            -exec mfsdirinfo -i -d -f -c -l -s -r {} \;
    done
) <<'EOF'

# Convert one mfsdirinfo output line into metrics.
# The line is split on whitespace; columns 1-7 are taken as inodes, dirs,
# files, chunks, length, size and real size, column 8 as the path label.
function mfsdirinfo(line) {
    split(line, info)

    path = info[8]

    printf "moosefs_inodes_count{path=\"%s\"} %d\n", path, info[1]
    printf "moosefs_dirs_count{path=\"%s\"} %d\n", path, info[2]
    printf "moosefs_files_count{path=\"%s\"} %d\n", path, info[3]
    printf "moosefs_chunks_count{path=\"%s\"} %d\n", path, info[4]
    printf "moosefs_length_bytes{path=\"%s\"} %d\n", path, info[5]
    printf "moosefs_size_bytes{path=\"%s\"} %d\n", path, info[6]
    printf "moosefs_real_bytes{path=\"%s\"} %d\n", path, info[7]
}

{
    mfsdirinfo($0)
}

EOF
41+

group_vars/pro.mfs.master

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
gens_mfs_master_monit_services__to_merge:
2+
- name: mfsmaster
3+
type: process
4+
target: "/var/lib/mfs/.mfsmaster.lock"
5+
rules: |
6+
if changed pid then alert
7+
8+
- name: mfscgiserv
9+
type: process
10+
target: "/var/lib/mfs/.mfscgiserv.lock"
11+
rules: |
12+
if changed pid then alert
13+
14+
- name: metadata
15+
type: file
16+
target: "/var/lib/mfs/metadata.mfs.back"
17+
rules: |
18+
if not exist for 3 cycles then alert
19+
if timestamp is older than 2 hours then alert
20+
21+
- name: available
22+
type: program
23+
target: "/bin/sh -c 'mfsdirinfo /mnt/mfs/ | tee /mnt/mfs/.watchdog_by_monit'"
24+
extend: "with timeout 10 seconds"
25+
rules: |
26+
if status != 0 for 2 cycles then alert
27+
28+
- name: mfscli2prom
29+
type: program
30+
target: "/bin/sh -c '/usr/bin/mfscli -ns\"|\" -SIM -SMU -SIG -SCS -SIC -SSC -SQU | /opt/mfscli2prom.awk > /var/log/prometheus/mfscli.prom'"
31+
extend: "with timeout 10 seconds"
32+
rules: |
33+
if status != 0 for 2 cycles then alert
34+
35+
- name: mfsdirinfo2prom
36+
type: program
37+
target: "/bin/sh -c '/opt/mfsdirinfo2prom.sh | /usr/bin/stdbuf -oL tee /var/log/prometheus/mfsdirinfo.prom'"
38+
extend: "with timeout 10 seconds"
39+
rules: |
40+
if status != 0 for 2 cycles then alert

group_vars/pro.mfs.slaver

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
gens_mfs_slaver_monit_services__to_merge:
2+
- name: mfsmetalogger
3+
type: process
4+
target: "/var/lib/mfs/.mfsmetalogger.lock"
5+
rules: |
6+
if changed pid then alert
7+
8+
9+
- name: changelog
10+
type: file
11+
target: "/var/lib/mfs/changelog_ml.0.mfs"
12+
rules: |
13+
if not exist for 3 cycles then alert
14+
if timestamp is older than 10 minutes then alert
15+
alert [email protected] on { timestamp }
16+
17+
- name: metadata
18+
type: file
19+
target: "/var/lib/mfs/metadata_ml.mfs.back"
20+
rules: |
21+
if not exist for 3 cycles then alert
22+
if timestamp is older than 2 days then alert
23+
alert [email protected] on { timestamp }

inventory/pro

+20
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,32 @@ minio-cluster-4
2323
# }}}
2424

2525
[pro.moosefs] # {{{ moosefs 集群
26+
[pro.moosefs:children]
27+
pro.mfs.master
28+
pro.mfs.slaver
29+
pro.mfs.chunks
30+
pro.mfs.client
31+
2632
mfsmaster
2733
mfs-master
2834
mfs-metalogger
2935
mfs-chunkserver-1
3036
mfs-chunkserver-2
3137
mfs-chunkserver-3
38+
39+
[pro.mfs.master]
40+
mfs-master
41+
42+
[pro.mfs.slaver]
43+
mfs-metalogger
44+
45+
[pro.mfs.chunks]
46+
mfs-chunkserver-1
47+
mfs-chunkserver-2
48+
mfs-chunkserver-3
49+
50+
[pro.mfs.client]
51+
mfs-master
3252
# }}}
3353

3454
[pro.redis] # {{{ redis 集群

modules/README.md

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
### action_plugins/merge_vars.py
2+
https://github.com/leapfrogonline/ansible-merge-vars
3+
4+
monit 的 services,散布在不同的 group_vars 和 host_vars。
5+
某些资产,属于多个 group,本应该从各个 group 继承对应的 services,
6+
但是由于 ansible 默认的覆盖(override)行为,导致变量无法自动合并。
7+
8+
依赖 `pip3 install ansible-merge-vars`
9+
10+
### mount_leif160519.py
11+
原生的 mount 模块的 absent 动作,不仅从 fstab 移除挂载点,
12+
而且还会执行 umount、以及更过分的 rm 删除挂载点目录的动作。
13+
14+
因为不希望对系统执行 umount 和 rm 这样的危险动作,
15+
所以从原生模块继承并修改得到了自定义的 mount_leif160519 模块。

modules/action_plugins/merge_vars.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from ansible_merge_vars import ActionModule

0 commit comments

Comments
 (0)