Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Devops 1593 fix #64

Draft
wants to merge 12 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.idea
19 changes: 3 additions & 16 deletions deployment/bin/cron/container_reaper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
# SLACK_WEBHOOK_URL - The slack webhook url to send messages to
"""

import json
import os
import socket
import subprocess
Expand All @@ -17,21 +16,9 @@
from typing import Set

import docker
import requests
from docker.models.containers import Container


def send_slack_message(message: str):
    """
    Post a message to Slack through the configured incoming webhook.

    The webhook URL is read from the ``SLACK_WEBHOOK_URL`` environment
    variable on every call. If the variable is unset or empty, nothing is
    sent and the function returns immediately.

    :param message: Escaped Message to send to slack
    :raises requests.RequestException: if the HTTP request fails or times out
    """
    webhook_url = os.environ.get("SLACK_WEBHOOK_URL")
    if not webhook_url:
        # Posting to a missing URL raises inside requests; a node without a
        # configured webhook should simply skip the notification.
        return
    requests.post(
        webhook_url,
        data=json.dumps({"text": message}),
        headers={"Content-Type": "application/json"},
        # requests waits indefinitely unless a timeout is given, which would
        # hang the cron job if Slack is unreachable.
        timeout=10,
    )
from .send_slack_message import notify_kbase_slack


def filter_containers_by_time(potential_containers, days=0, minutes=0):
Expand Down Expand Up @@ -74,7 +61,7 @@ def reap_containers_running_more_than_7_days(potential_containers: Set[Container
if old_containers:
for old_container in old_containers:
message = get_running_time_message(old_container, title="reaper7daylimit")
send_slack_message(message)
notify_kbase_slack(message)
remove_with_backoff(old_container, message)


Expand All @@ -91,7 +78,7 @@ def reap_containers_when_there_is_no_starter(potential_containers: Set[Container
if runaway_containers:
for runaway_container in runaway_containers:
message = get_running_time_message(runaway_container, title="reaper_no_starter")
send_slack_message(message)
notify_kbase_slack(message)
# Remove the container the message was just built for: the loop variable in
# this branch is `runaway_container`, not `container`.
remove_with_backoff(runaway_container, message)


Expand Down
18 changes: 2 additions & 16 deletions deployment/bin/cron/delete_exited_containers.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,12 @@
#!/miniconda/bin/python
# This script is automatically run by the condor cronjob periodically
# in order to clean up exited docker containers.
import json
import os
import socket

import docker
import requests


def send_slack_message(message: str):
    """
    Post a message to Slack through the configured incoming webhook.

    The webhook URL is read from the ``SLACK_WEBHOOK_URL`` environment
    variable on every call. If the variable is unset or empty, nothing is
    sent and the function returns immediately.

    :param message: Escaped Message to send to slack
    :raises requests.RequestException: if the HTTP request fails or times out
    """
    webhook_url = os.environ.get("SLACK_WEBHOOK_URL")
    if not webhook_url:
        # Posting to a missing URL raises inside requests; without a
        # configured webhook the notification is skipped instead.
        return
    requests.post(
        webhook_url,
        data=json.dumps({"text": message}),
        headers={"Content-Type": "application/json"},
        # requests waits indefinitely unless a timeout is given, which would
        # hang the cron job if Slack is unreachable.
        timeout=10,
    )

from .send_slack_message import notify_kbase_slack

if __name__ == "__main__":
hostname = socket.gethostname()
Expand All @@ -33,5 +19,5 @@ def send_slack_message(message: str):
container.remove()
debug_mode = os.environ.get("DEBUG", "false").lower() == "true"
if debug_mode:
send_slack_message(
notify_kbase_slack(
f"Deleted {len(kbase_containers)} `exited` containers with 'kbase' in image name on {hostname}: {container_image_names}")
43 changes: 20 additions & 23 deletions deployment/bin/cron/health_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"""
import datetime
import inspect
import json
import logging
import os
import pwd
Expand All @@ -18,6 +17,8 @@
import psutil
import requests

from .send_slack_message import notify_kbase_slack

# Optional environment variables
var_lib_docker = os.environ.get("DOCKER_CACHE", "/var/lib/docker/")
scratch = os.environ.get("CONDOR_SUBMIT_WORKDIR", "/cdr")
Expand All @@ -38,17 +39,7 @@
pid = pwd.getpwnam(user).pw_uid
gid = pwd.getpwnam(user).pw_gid


def send_slack_message(message: str):
    """
    Post ``message`` to Slack using the module-level ``webhook_url``.

    The message is wrapped in a ``{"text": ...}`` JSON document and sent as
    the request body with a JSON content type.

    :param message: Escaped Message to send to slack
    """
    json_headers = {"Content-Type": "application/json"}
    payload = json.dumps({"text": message})
    requests.post(webhook_url, data=payload, headers=json_headers)
LOCKFILE_PATH = "/tmp/lockfile"


def exit_unsuccessfully(message: str, send_to_slack=True):
Expand All @@ -58,22 +49,29 @@ def exit_unsuccessfully(message: str, send_to_slack=True):
print("NODE_IS_HEALTHY = False")
print(f'HEALTH_STATUS_MESSAGE = "{message}"')
print("- update:true")
now = datetime.datetime.now()

if send_to_slack:
try:
function_name = lambda: inspect.stack()[1][3]
except Exception:
if not os.path.exists(LOCKFILE_PATH):
# Lock file doesn't exist, meaning it's a new failure
# Notify Slack and create the lockfile to prevent future notifications for NODE_IS_HEALTHY = False
with open(LOCKFILE_PATH, 'w') as lock_file:
lock_file.write(message) # Optionally write the failure message into the lockfile
function_name = ""

send_slack_message(
f"POSSIBLE BLACK HOLE: {function_name} Ran healthcheck at {now} on {socket.gethostname()} with failure: {message}"
)

try:
function_name = inspect.stack()[1][3]
except Exception:
pass
notify_kbase_slack(
f"POSSIBLE BLACK HOLE: {function_name} on {socket.gethostname()} with failure: {message}"
)
# If the lock file exists, don't message slack, but continue the sys.exit(1)
sys.exit(1)


def exit_successfully():
if os.path.exists(LOCKFILE_PATH):
os.remove(LOCKFILE_PATH) # Delete the lock file if it exists

print("NODE_IS_HEALTHY = True")
print(f'HEALTH_STATUS_MESSAGE = "Healthy {datetime.datetime.now()}"')
print("- update:true")
Expand Down Expand Up @@ -197,7 +195,7 @@ def test_enough_space(mount_point, nickname, percentage):

def check_kbase_endpoints():
"""
Check auth/njs/catalog/ws
Check Auth, Catalog, and Workspace status
"""

post_services = {
Expand Down Expand Up @@ -232,7 +230,6 @@ def check_kbase_endpoints():

def main():
try:
# send_slack_message(f"Job HEALTH_CHECK is beginning at {datetime.datetime.now()}")
test_docker_socket()
test_docker_socket2()
test_scratch_world_writeable()
Expand Down
16 changes: 16 additions & 0 deletions deployment/bin/cron/send_slack_message.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import requests
import os
import json

def notify_kbase_slack(message: str):
    """
    Post a message to the KBase Slack channel through an incoming webhook.

    The webhook URL is read from the ``SLACK_WEBHOOK_URL`` environment
    variable on every call. If the variable is unset or empty, nothing is
    sent and the function returns immediately, so callers can run on nodes
    without a configured webhook.

    :param message: Escaped Message to send to slack
    :raises requests.RequestException: if the HTTP request fails or times out
    """
    webhook_url = os.environ.get("SLACK_WEBHOOK_URL")
    if not webhook_url:
        # Posting to a missing URL raises inside requests; skip the
        # notification rather than crash the calling cron script.
        return
    requests.post(
        webhook_url,
        data=json.dumps({"text": message}),
        headers={"Content-Type": "application/json"},
        # requests waits indefinitely unless a timeout is given, which would
        # hang the cron job if Slack is unreachable.
        timeout=10,
    )

22 changes: 22 additions & 0 deletions deployment/bin/cron/testing_create_dummy_containers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import docker

# Create a dummy container labelled with `user_name` and `ee2_endpoint`
# (presumably the labels the reaper cron scripts look for; verify against
# container_reaper.py).
# NOTE(review): an earlier comment said the creation time is faked to be
# 7 days ago, but nothing below does that; the container is created normally.

# Connect to the Docker daemon using the environment's configuration.
client = docker.from_env()
# Start a detached ubuntu container named "test_container2" that sleeps forever;
# auto_remove=True asks the daemon to delete it once its process exits.
c = client.containers.run("ubuntu:latest", "/bin/sleep infinity", labels={"user_name": "test_user", "ee2_endpoint":
"https://ci.kbase.us/services/ee2"},
detach=True,
name="test_container2",
auto_remove=True)

def remove_with_backoff(container, message, backoff=30):
    """
    Stop ``container``, wait ``backoff`` seconds, then remove it.

    This is a best-effort teardown: any exception raised while stopping,
    waiting or removing is swallowed and the function returns ``None``.

    :param container: object exposing ``stop()`` and ``remove()``
    :param message: accepted for call compatibility; not used here
    :param backoff: seconds to wait between stopping and removing
    """
    import time

    try:
        container.stop()
        # Wait for the backoff period before attempting to remove.
        time.sleep(backoff)
        container.remove()
    except Exception:
        # Deliberately ignored: failures during teardown are not reported.
        pass

# Tear the dummy container down again: stop it, wait the default 30 second
# backoff, then remove it. The second argument fills the `message` parameter,
# which remove_with_backoff does not use.
remove_with_backoff(c,"test_container2")
Loading