Skip to content

Commit

Permalink
Deploy agent wait for first successful puppet run (#1208)
Browse files Browse the repository at this point in the history
* Deploy agent prerun conditions reinforcement

* added UT and bazel target

* Update deploy-agent/deployd/client/client.py

Co-authored-by: Tyler Ouyang <[email protected]>
  • Loading branch information
ntascii and tylerwowen committed Jun 21, 2023
1 parent 66f1179 commit d8d74f5
Show file tree
Hide file tree
Showing 12 changed files with 236 additions and 114 deletions.
10 changes: 8 additions & 2 deletions deploy-agent/deployd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
IS_PINTEREST = True if os.getenv("IS_PINTEREST", "false") == "true" else False
METRIC_PORT_HEALTH = int(os.getenv('METRIC_PORT_HEALTH')) if os.getenv('METRIC_PORT_HEALTH', False) else None
METRIC_CACHE_PATH = os.getenv('METRIC_CACHE_PATH', None)
TELEFIG_BINARY = os.getenv('TELEFIG_BINARY', "")
TELEFIG_BINARY = os.getenv('TELEFIG_BINARY', None)
MAIN_LOGGER = "deployd"
STATSBOARD_URL=os.getenv('STATSBOARD_URL', "https://statsboard.pinadmin.com/api/v1/")

__version__ = '1.2.42'
# 0: puppet applied successfully with no changes
# 2: puppet applied successfully with changes
PUPPET_SUCCESS_EXIT_CODES = [0, 2]

__version__ = '1.2.43'
21 changes: 9 additions & 12 deletions deploy-agent/deployd/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import daemon
import logging
import os
import sys
from random import randrange
import time
import traceback
Expand All @@ -32,11 +33,9 @@
from deployd.common.executor import Executor
from deployd.common.types import DeployReport, PingStatus, DeployStatus, OpCode, \
DeployError, DeployErrorSource, DeployStage, AgentStatus
from deployd.common.utils import check_telefig_unavailable_error
from deployd import IS_PINTEREST

log = logging.getLogger(__name__)
from deployd import IS_PINTEREST, MAIN_LOGGER

log = logging.getLogger(MAIN_LOGGER)

class PingServer(object):
def __init__(self, ag):
Expand Down Expand Up @@ -112,12 +111,6 @@ def _send_deploy_status_stats(self, deploy_report):
tags['stage_name'] = self._response.deployGoal.stageName
if deploy_report.status_code:
tags['status_code'] = deploy_report.status_code
if deploy_report.output_msg:
if check_telefig_unavailable_error(deploy_report.output_msg):
tags['error_source'] = DeployErrorSource.TELEFIG
tags['error'] = DeployError.TELEFIG_UNAVAILABLE
elif deploy_report.output_msg.find("teletraan_config_manager") != -1:
tags['error_source'] = DeployErrorSource.TELEFIG

create_sc_increment('deployd.stats.deploy.status', tags=tags)

Expand Down Expand Up @@ -483,9 +476,9 @@ def main():
is_serverless_mode = AgentRunMode.is_serverless(args.mode)
if args.daemon and is_serverless_mode:
raise ValueError("daemon and serverless mode is mutually exclusive.")

config = Config(args.config_file)
utils.run_prereqs(config)


if IS_PINTEREST:
import pinlogger

Expand All @@ -496,6 +489,10 @@ def main():
logging.basicConfig(filename=log_filename, level=config.get_log_level(),
format='%(asctime)s %(name)s:%(lineno)d %(levelname)s %(message)s')

if not utils.check_prereqs(config):
log.warning("Deploy agent cannot start because the prerequisites on puppet did not meet.")
sys.exit(0)

log.info("Start to run deploy-agent.")
# timing stats - agent start time
create_sc_timing('deployd.stats.internal.time_start_sec',
Expand Down
3 changes: 2 additions & 1 deletion deploy-agent/deployd/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ def _read_host_info(self):
# Note: on U14, facter -p ec2_tags.Autoscaling does not work.
# so need to read ec2_tags from facter and parse Autoscaling tag to cover this case
if not self._autoscaling_group:
self._autoscaling_group = facter_data.get(ec2_tags_key, {}).get(asg_tag_key, None)
ec2_tags = facter_data.get(ec2_tags_key)
self._autoscaling_group = ec2_tags.get(asg_tag_key) if ec2_tags else None

if not self._stage_type and not self._stage_type_fetched:
self._stage_type = facter_data.get(stage_type_key, None)
Expand Down
8 changes: 7 additions & 1 deletion deploy-agent/deployd/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,15 @@ def get_num_builds_retain(self):
def respect_puppet(self):
    """Return the ``respect_puppet`` setting, read via ``get_intvar`` with 0 as the default."""
    return self.get_intvar("respect_puppet", 0)

def get_puppet_file_path(self):
def get_puppet_state_file_path(self):
    """Return the configured puppet state file path, or None when it is not set."""
    # NOTE: the lookup key is "puppet_file_path", not "puppet_state_file_path";
    # the method name and the config key differ.
    return self.get_var("puppet_file_path", None)

def get_puppet_summary_file_path(self):
    """Return the puppet last-run summary YAML path.

    Reads the ``puppet_summary_file_path`` variable, passing
    ``/var/cache/puppet/state/last_run_summary.yaml`` as the default.
    """
    return self.get_var("puppet_summary_file_path", "/var/cache/puppet/state/last_run_summary.yaml")

def get_puppet_exit_code_file_path(self):
    """Return the path of the file holding the puppet exit code.

    Reads the ``puppet_exit_code_file_path`` variable, passing
    ``/var/log/puppet/puppet_exit_code`` as the default.
    """
    return self.get_var("puppet_exit_code_file_path", "/var/log/puppet/puppet_exit_code")

def get_daemon_sleep_time(self):
    """Return the ``daemon_sleep_time`` setting, read via ``get_intvar`` with 30 as the default."""
    # NOTE(review): the unit is presumably seconds -- confirm against the daemon loop.
    return self.get_intvar("daemon_sleep_time", 30)

Expand Down
2 changes: 1 addition & 1 deletion deploy-agent/deployd/common/single_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(self):
try:
fcntl.lockf(lockfile_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
except IOError:
print(('Error: {0} may already be running. Only one instance of it '
log.error(('Error: {0} may already be running. Only one instance of it '
'can run at a time.').format(appname))
# noinspection PyTypeChecker
os.close(lockfile_fd)
Expand Down
18 changes: 16 additions & 2 deletions deploy-agent/deployd/common/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@
# limitations under the License.

import logging
from deployd import __version__, IS_PINTEREST, METRIC_PORT_HEALTH, METRIC_CACHE_PATH
from deployd import __version__, IS_PINTEREST, METRIC_PORT_HEALTH, METRIC_CACHE_PATH, STATSBOARD_URL
import timeit
import socket
import json
import os
import requests

if IS_PINTEREST:
from pinstatsd.statsd import sc, sc_v2
Expand Down Expand Up @@ -106,7 +107,20 @@ def create_sc_gauge(name, value, sample_rate=1.0, tags=None):
else:
return


def send_statsboard_metric(name, value, tags=None, timeout=5):
    """Push one metric data point to Statsboard over HTTP (best effort).

    Issues a PUT to ``{STATSBOARD_URL}put/{name}`` with ``value`` and a
    comma-separated ``key=value`` tag list in the query string. A ``host``
    tag holding this machine's hostname is always added (overriding any
    ``host`` entry supplied by the caller).

    :param name: metric name, appended to the URL path
    :param value: metric value placed in the ``value`` query parameter
    :param tags: optional dict of tag name -> tag value; it is not modified
    :param timeout: seconds to wait for Statsboard before giving up
    :return: None; request failures are logged, never raised
    """
    # Work on a copy: the caller's dict stays untouched and the default
    # ``tags=None`` no longer fails on item assignment.
    all_tags = dict(tags) if tags else {}
    all_tags['host'] = socket.gethostname()
    tags_str = ",".join(f"{tag}={tag_value}" for tag, tag_value in all_tags.items())
    url = (
        f"{STATSBOARD_URL}put/"
        f"{name}?value={value}"
        f"&tags={tags_str}"
    )

    try:
        # Without a timeout a stalled endpoint would block the agent forever.
        resp = requests.put(url, timeout=timeout)
    except requests.RequestException as e:
        # Metric emission must not take the deploy agent down.
        log.warning(f"Failed to send the metric {name} to statsboard: {e}")
        return

    if resp.status_code == 200:
        log.info("Successfully send the metric to statsboard")
    else:
        log.warning(f"Statsboard returned status {resp.status_code} for metric {name}")

class MetricCacheConfigurationError(ValueError):
""" Raised when MetricCache has missing configuration """
def __init__(self, name, value):
Expand Down
116 changes: 94 additions & 22 deletions deploy-agent/deployd/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -23,14 +23,17 @@
import traceback
import subprocess
import yaml


import json
from deployd import IS_PINTEREST, TELEFIG_BINARY
from deployd.common.stats import TimeElapsed, create_sc_increment, create_sc_timing
from deployd import IS_PINTEREST, PUPPET_SUCCESS_EXIT_CODES
from deployd.common.stats import TimeElapsed, create_sc_increment, create_sc_timing, send_statsboard_metric

log = logging.getLogger(__name__)


# noinspection PyProtectedMember


def exit_abruptly(status=0):
"""Exit method that just quits abruptly.
Expand Down Expand Up @@ -92,6 +95,7 @@ def mkdir_p(path):
else:
raise


def uptime():
""" return int: seconds of uptime in int, default 0 """
sec = 0
Expand All @@ -101,36 +105,109 @@ def uptime():
sec = int(float(line[0]))
return sec


def ensure_dirs(config):
    """Make sure the deployd builds, agent and log directories exist.

    Each directory path is obtained from ``config`` and handed to
    ``mkdir_p``, in that order.
    """
    directory_getters = (
        "get_builds_directory",
        "get_agent_directory",
        "get_log_directory",
    )
    for getter_name in directory_getters:
        # Resolve and call each getter lazily so the directories are still
        # processed strictly one after another.
        mkdir_p(getattr(config, getter_name)())


def run_prereqs(config):
# check if the puppet has finished or not
def is_first_run(config):
    """Tell whether the agent has not run before on this host.

    :param config: object exposing ``get_env_status_fn()``
    :return: True when the env status file does not exist yet, else False
    """
    status_file_present = os.path.exists(config.get_env_status_fn())
    return not status_file_present


def check_prereqs(config):
    """Check the prerequisites that must hold before the deploy agent runs.

    When ``IS_PINTEREST`` is set and ``config.respect_puppet()`` is truthy,
    the agent must wait for puppet: the puppet state file (if a path is
    configured) has to exist, and the first puppet run has to have
    succeeded. Once the checks pass, the deployd directories are created.

    :param config: agent configuration object
    :return: True when all conditions are met, otherwise False
    """
    if IS_PINTEREST and config.respect_puppet():
        # Puppet has not produced its state file yet -> it has not finished.
        puppet_state_file_path = config.get_puppet_state_file_path()
        if puppet_state_file_path and not os.path.exists(puppet_state_file_path):
            log.error("Waiting for first puppet run.")
            return False

        # Puppet finished; make sure the first run actually succeeded.
        if not check_first_puppet_run_success(config):
            log.error("First puppet run failed.")
            return False

    ensure_dirs(config)
    return True


def get_puppet_exit_code(config):
    """Read the puppet exit code recorded in the exit-code file.

    The first line of the file is parsed as an integer so the result can be
    compared against integer exit-code lists such as
    ``PUPPET_SUCCESS_EXIT_CODES``.

    :param config: object exposing ``get_puppet_exit_code_file_path()``
    :return: the exit code as an int, or 999 when the file cannot be read
             or does not contain an integer
    """
    puppet_exit_code_file = config.get_puppet_exit_code_file_path()
    try:
        with open(puppet_exit_code_file, "rt") as f:
            # int() is essential: a raw string like "0" never matches the
            # integer success codes.
            return int(f.readline().strip())
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: empty or non-numeric
        # content; TypeError: no path configured.
        log.warning(f"Could not read {puppet_exit_code_file} file: {e}")
        return 999


def load_puppet_summary(config):
    """Load and parse the puppet ``last_run_summary`` YAML file.

    :param config: object exposing ``get_puppet_summary_file_path()``
    :return: dict built from the summary file; an empty dict when the file
             is missing, unreadable, malformed, empty, or does not contain
             a mapping
    """
    summary_file = config.get_puppet_summary_file_path()
    try:
        # Open directly instead of checking existence first, so a file that
        # disappears between the check and the open cannot crash the agent.
        with open(summary_file) as f:
            summary = yaml.safe_load(f)
    except FileNotFoundError:
        log.warning(f"{summary_file} does not exist. This could be the first puppet run")
        return {}
    except (OSError, yaml.YAMLError) as e:
        log.warning(f"Could not load {summary_file}: {e}")
        return {}

    # safe_load yields None for an empty document and may yield a list or
    # scalar; callers expect a dict they can call .get() on.
    if not isinstance(summary, dict):
        return {}
    return summary


def check_first_puppet_run_success(config):
    """Decide whether the first puppet run on this host succeeded.

    Hosts that are past their first agent run are always treated as
    successful. Otherwise the recorded puppet exit code is consulted first,
    and the failure count in the last run summary is used as a second
    opinion. A statsboard metric is sent when the run is judged failed.

    :param config: agent configuration object
    :return: True if the first puppet run succeeded, else False
    """
    if not is_first_run(config):
        return True

    exit_code = get_puppet_exit_code(config)
    if exit_code in PUPPET_SUCCESS_EXIT_CODES:
        return True

    # The exit code says failure; cross-check with the last run summary.
    summary = load_puppet_summary(config)
    if summary:
        failure_count = summary.get('events', {}).get('failure', None)
    else:
        failure_count = None
    log.info(f"Puppet failures: {failure_count}")

    if failure_count == 0:
        return True

    send_statsboard_metric(
        name='deployd.first_puppet_failed',
        value=1,
        tags={"puppet_exit_code": exit_code},
    )
    return False


def get_info_from_facter(keys):
try:
time_facter = TimeElapsed()
# increment stats - facter calls
create_sc_increment('deployd.stats.internal.facter_calls_sum', 1)
log.info("Fetching {} keys from facter".format(keys))
cmd = ['facter', '-p', '-j']
log.info(f"Fetching {keys} keys from facter")
cmd = ['facter', '-jp']
cmd.extend(keys)
output = subprocess.check_output(cmd)
output = subprocess.run(cmd, check=True, stdout=subprocess.PIPE).stdout
# timing stats - facter run time
create_sc_timing('deployd.stats.internal.time_elapsed_facter_calls_sec',
time_facter.get())
Expand All @@ -142,13 +219,8 @@ def get_info_from_facter(keys):
log.error("Failed to get info from facter by keys {}".format(keys))
return None


def check_not_none(arg, msg=None):
    """Return ``arg`` unchanged, rejecting None.

    :param arg: value to validate
    :param msg: message for the raised error
    :return: ``arg`` when it is not None
    :raises ValueError: when ``arg`` is None
    """
    if arg is not None:
        return arg
    raise ValueError(msg)

def check_telefig_unavailable_error(msg):
    """Tell whether ``msg`` reports that the telefig binary is missing.

    :param msg: output text to inspect
    :return: False when ``TELEFIG_BINARY`` is not configured; otherwise True
             if ``msg`` contains "<TELEFIG_BINARY>: No such file or directory"
    """
    if TELEFIG_BINARY:
        needle = f"{TELEFIG_BINARY}: No such file or directory"
        return msg.find(needle) >= 0
    return False
14 changes: 0 additions & 14 deletions deploy-agent/deployd/types/__init__.py

This file was deleted.

7 changes: 7 additions & 0 deletions deploy-agent/tests/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ py_test(
python_version = "PY3",
)

py_test(
name = "test_utils",
srcs = ['unit/deploy/common/test_utils.py'],
deps = ["test_lib"],
python_version = "PY3",
)

py_library(
name = "test_lib",
srcs = ["__init__.py"],
Expand Down
Loading

0 comments on commit d8d74f5

Please sign in to comment.