Skip to content

Commit

Permalink
feat: 检查Agent、bkmonitorbeat异常状态并发送邮件告知运维 (closed TencentBlueKing#2512)
Browse files Browse the repository at this point in the history
  • Loading branch information
Huayeaaa committed Dec 26, 2024
1 parent 2d566ca commit 07a298b
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 0 deletions.
1 change: 1 addition & 0 deletions apps/node_man/periodic_tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .clean_subscription_record_info import ( # noqa
clean_subscription_record_info_periodic_task,
)
from .send_mail_to_maintainer import send_mail_to_maintainer_periodic_task # noqa
from .sync_agent_status_task import sync_agent_status_periodic_task # noqa
from .sync_all_isp_to_cmdb import sync_all_isp_to_cmdb_periodic_task # noqa
from .sync_cmdb_cloud_area import sync_cmdb_cloud_area_periodic_task # noqa
Expand Down
116 changes: 116 additions & 0 deletions apps/node_man/periodic_tasks/send_mail_to_maintainer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available.
Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
import hashlib
import time
from collections import defaultdict
from typing import Any, Dict, List, Set
from uuid import uuid4

import requests
from celery.schedules import crontab
from celery.task import periodic_task
from django.conf import settings
from django.db.models import QuerySet

from apps.node_man import constants, models
from common.api import CCApi
from common.log import logger


def _build_taihu_headers(passid: str, pass_token: str) -> Dict[str, str]:
    """Build freshly signed ``x-rio-*`` headers for a single Taihu API request."""
    # The server rejects requests whose timestamp differs from standard time by more than 180 seconds,
    # so the timestamp is taken right before each request.
    timestamp = str(int(time.time()))
    # Random string that must not repeat within ten minutes, hence one nonce per request.
    nonce = str(uuid4())
    # Signature algorithm: x-rio-signature = sha256(timestamp + token + nonce + timestamp).upper()
    signature = hashlib.sha256((timestamp + pass_token + nonce + timestamp).encode()).hexdigest().upper()
    return {
        "x-rio-paasid": passid,
        "x-rio-nonce": nonce,
        "x-rio-timestamp": timestamp,
        "x-rio-signature": signature,
    }


def send_mail_to_maintainer(task_id):
    """
    Mail each business maintainer the IPs of hosts whose Agent or bkmonitorbeat process is TERMINATED.

    Hosts are grouped by business; one mail per business is posted to the Taihu send-mail API.
    Businesses without maintainer information are skipped. A failure for one business is logged
    and does not stop the remaining mails.

    :param task_id: ID of the triggering periodic task, only used in log messages
    :return: None
    """
    sender = getattr(settings, "TAIHU_MAIL_SENDER", "")
    pass_token = getattr(settings, "TAIHU_TOKEN", "")
    url_path = getattr(settings, "TAIHU_SEND_MAIL_API", "")
    # The settings always exist (they default to an empty string), so check their values rather than
    # their presence: without sender, token and API there is nothing this task can do.
    if not (sender and pass_token and url_path):
        logger.info(f"skip send_mail_to_maintainer, taihu mail settings are not configured, task_id -> {task_id}")
        return
    logger.info(f"start send_mail_to_maintainer, task_id -> {task_id}")

    # Host IDs whose Agent process is terminated
    terminated_agent: QuerySet = models.ProcessStatus.objects.filter(
        status=constants.ProcStateType.TERMINATED, name=models.ProcessStatus.GSE_AGENT_PROCESS_NAME
    ).values_list("bk_host_id", flat=True)
    # Host IDs whose bkmonitorbeat process is terminated
    terminated_plugin: QuerySet = models.ProcessStatus.objects.filter(
        status=constants.ProcStateType.TERMINATED, name="bkmonitorbeat"
    ).values_list("bk_host_id", flat=True)

    query_kwargs = {"fields": ["bk_biz_id", "bk_biz_name", "bk_biz_maintainer"]}
    try:
        biz_infos: List[Dict[str, Any]] = CCApi.search_business(query_kwargs)["info"]
        # Map business ID -> business info, dropping businesses that have no maintainer
        biz_id_biz_info_map: Dict[int, Dict[str, Any]] = {
            biz_info["bk_biz_id"]: biz_info for biz_info in biz_infos if biz_info["bk_biz_maintainer"]
        }
    except Exception as e:
        logger.exception(f"get business info error: {str(e)}")
        return

    terminated_agent_qs: QuerySet = models.Host.objects.filter(bk_host_id__in=terminated_agent).values(
        "bk_biz_id", "inner_ip"
    )
    terminated_plugin_qs: QuerySet = models.Host.objects.filter(bk_host_id__in=terminated_plugin).values(
        "bk_biz_id", "inner_ip"
    )
    # Group the abnormal Agent / bkmonitorbeat IPs by business
    terminated_agent_ips_gby_biz_id: Dict[int, List[str]] = defaultdict(list)
    terminated_plugin_ips_gby_biz_id: Dict[int, List[str]] = defaultdict(list)
    for host_info in terminated_agent_qs:
        terminated_agent_ips_gby_biz_id[host_info["bk_biz_id"]].append(host_info["inner_ip"])
    for plugin_info in terminated_plugin_qs:
        terminated_plugin_ips_gby_biz_id[plugin_info["bk_biz_id"]].append(plugin_info["inner_ip"])
    # A business is handled when it has abnormal Agents OR abnormal plugins
    total_handle_biz_ids: Set[int] = set(terminated_agent_ips_gby_biz_id) | set(terminated_plugin_ips_gby_biz_id)

    passid = settings.APP_CODE
    # Templates stay untouched; a new payload is rendered for every business
    title_template = "业务-{}-ID:{}:Agent-bkmonitorbeat状态异常告警"
    content_template = "Agent异常IP : {}, bkmonitorbeat异常IP : {}"
    # Upper bound in seconds for a single send-mail request so one stuck call cannot block the task
    request_timeout = 30

    success_count = 0
    failed_count = 0
    with requests.Session() as req_obj:
        for bk_biz_id in total_handle_biz_ids:
            biz_info = biz_id_biz_info_map.get(bk_biz_id)
            # Do not send mail for businesses without maintainer information
            if not biz_info:
                continue
            # .get with a default avoids inserting empty entries into the defaultdicts
            agent_ips = ", ".join(ip for ip in terminated_agent_ips_gby_biz_id.get(bk_biz_id, []) if ip)
            plugin_ips = ", ".join(ip for ip in terminated_plugin_ips_gby_biz_id.get(bk_biz_id, []) if ip)
            # Required parameters of the send-mail API.
            # The sender must be on the sender whitelist of the applied Taihu API.
            data = {
                "From": sender,
                "To": biz_info["bk_biz_maintainer"],
                "Title": title_template.format(biz_info["bk_biz_name"], bk_biz_id),
                "Content": content_template.format(agent_ips, plugin_ips),
            }
            try:
                response = req_obj.post(
                    url=url_path,
                    headers=_build_taihu_headers(passid, pass_token),
                    json=data,
                    timeout=request_timeout,
                )
                # Treat HTTP 4xx/5xx as a failure instead of silently ignoring it
                response.raise_for_status()
            except requests.RequestException as e:
                failed_count += 1
                logger.exception(f"bk_biz_id -> {bk_biz_id} send mail to maintainer error: {str(e)}")
                continue
            success_count += 1
    logger.info(
        f"send mail to maintainer finished, task_id -> {task_id}, "
        f"success -> {success_count}, failed -> {failed_count}"
    )


@periodic_task(
    run_every=crontab(minute="0", hour="9", day_of_month="*", month_of_year="*", day_of_week="*"),
    queue="default",
    options={"queue": "default"},
)
def send_mail_to_maintainer_periodic_task():
    """Periodic entry point (crontab: 09:00 every day) that mails abnormal-process reports to maintainers."""
    # The current task's request ID is passed along purely so log lines can be correlated.
    send_mail_to_maintainer(send_mail_to_maintainer_periodic_task.request.id)
4 changes: 4 additions & 0 deletions config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -824,6 +824,10 @@ def get_standard_redis_mode(cls, config_redis_mode: str, default: Optional[str]

# 腾讯云endpoint
TXY_ENDPOINT = env.TXY_ENDPOINT
# Taihu: mail sender, token and send-mail API, each copied from the value of the same name in the env module
TAIHU_MAIL_SENDER = env.TAIHU_MAIL_SENDER
TAIHU_TOKEN = env.TAIHU_TOKEN
TAIHU_SEND_MAIL_API = env.TAIHU_SEND_MAIL_API

# ==============================================================================
# 可观测
Expand Down
7 changes: 7 additions & 0 deletions env/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@
"TXY_ENDPOINT",
# 未分配管控区域ID
"BKAPP_UNASSIGNED_CLOUD_ID",
"TAIHU_MAIL_SENDER",
"TAIHU_TOKEN",
"TAIHU_SEND_MAIL_API",
]

# ===============================================================================
Expand Down Expand Up @@ -101,6 +104,10 @@
BKAPP_AUTOMATIC_CHOICE_CLOUD_ID = get_type_env(key="BKAPP_AUTOMATIC_CHOICE_CLOUD_ID", default=-1, _type=int)
# 适配从k8s读取值变成科学计数法问题
BKAPP_UNASSIGNED_CLOUD_ID = int(get_type_env(key="BKAPP_UNASSIGNED_CLOUD_ID", default=90000001, _type=float))
# Taihu: mail sender, token and send-mail API; each is read as a string and defaults to "" when unset
TAIHU_MAIL_SENDER = get_type_env(key="TAIHU_MAIL_SENDER", default="", _type=str)
TAIHU_TOKEN = get_type_env(key="TAIHU_TOKEN", default="", _type=str)
TAIHU_SEND_MAIL_API = get_type_env(key="TAIHU_SEND_MAIL_API", default="", _type=str)

# ===============================================================================
# 日志
Expand Down

0 comments on commit 07a298b

Please sign in to comment.