From d8d74f50819e8857fc6b166211a5f4b9a985683c Mon Sep 17 00:00:00 2001 From: Anh Nguyen <50259686+ntascii@users.noreply.github.com> Date: Wed, 21 Jun 2023 15:21:51 -0700 Subject: [PATCH] Deploy agent wait for first successful puppet run (#1208) * Deploy agent prerun conditions reenforcement * added UT and bazel target * Update deploy-agent/deployd/client/client.py Co-authored-by: Tyler Ouyang --- deploy-agent/deployd/__init__.py | 10 +- deploy-agent/deployd/agent.py | 21 ++-- deploy-agent/deployd/client/client.py | 3 +- deploy-agent/deployd/common/config.py | 8 +- .../deployd/common/single_instance.py | 2 +- deploy-agent/deployd/common/stats.py | 18 ++- deploy-agent/deployd/common/utils.py | 116 ++++++++++++++---- deploy-agent/deployd/types/__init__.py | 14 --- deploy-agent/tests/BUILD.bazel | 7 ++ .../tests/unit/deploy/common/test_utils.py | 87 +++++++++++++ .../tests/unit/deploy/server/test_agent.py | 54 -------- deploy-agent/thirdparty/requirements.txt | 10 +- 12 files changed, 236 insertions(+), 114 deletions(-) delete mode 100644 deploy-agent/deployd/types/__init__.py create mode 100644 deploy-agent/tests/unit/deploy/common/test_utils.py diff --git a/deploy-agent/deployd/__init__.py b/deploy-agent/deployd/__init__.py index e544347fe4..7786299ce4 100644 --- a/deploy-agent/deployd/__init__.py +++ b/deploy-agent/deployd/__init__.py @@ -18,6 +18,12 @@ IS_PINTEREST = True if os.getenv("IS_PINTEREST", "false") == "true" else False METRIC_PORT_HEALTH = int(os.getenv('METRIC_PORT_HEALTH')) if os.getenv('METRIC_PORT_HEALTH', False) else None METRIC_CACHE_PATH = os.getenv('METRIC_CACHE_PATH', None) -TELEFIG_BINARY = os.getenv('TELEFIG_BINARY', "") +TELEFIG_BINARY = os.getenv('TELEFIG_BINARY', None) +MAIN_LOGGER = "deployd" +STATSBOARD_URL=os.getenv('STATSBOARD_URL', "https://statsboard.pinadmin.com/api/v1/") -__version__ = '1.2.42' +# 0: puppet applied successfully with no changes +# 2: puppet applied successfully with changes +PUPPET_SUCCESS_EXIT_CODES = [0, 2] + +__version__ = '1.2.43' diff --git a/deploy-agent/deployd/agent.py b/deploy-agent/deployd/agent.py index abd284a9db..c0685fa0bb 100644 --- a/deploy-agent/deployd/agent.py +++ b/deploy-agent/deployd/agent.py @@ -16,6 +16,7 @@ import daemon import logging import os +import sys from random import randrange import time import traceback @@ -32,11 +33,9 @@ from deployd.common.executor import Executor from deployd.common.types import DeployReport, PingStatus, DeployStatus, OpCode, \ DeployError, DeployErrorSource, DeployStage, AgentStatus -from deployd.common.utils import check_telefig_unavailable_error -from deployd import IS_PINTEREST - -log = logging.getLogger(__name__) +from deployd import IS_PINTEREST, MAIN_LOGGER +log = logging.getLogger(MAIN_LOGGER) class PingServer(object): def __init__(self, ag): @@ -112,12 +111,6 @@ def _send_deploy_status_stats(self, deploy_report): tags['stage_name'] = self._response.deployGoal.stageName if deploy_report.status_code: tags['status_code'] = deploy_report.status_code - if deploy_report.output_msg: - if check_telefig_unavailable_error(deploy_report.output_msg): - tags['error_source'] = DeployErrorSource.TELEFIG - tags['error'] = DeployError.TELEFIG_UNAVAILABLE - elif deploy_report.output_msg.find("teletraan_config_manager") != -1: - tags['error_source'] = DeployErrorSource.TELEFIG create_sc_increment('deployd.stats.deploy.status', tags=tags) @@ -483,9 +476,9 @@ def main(): is_serverless_mode = AgentRunMode.is_serverless(args.mode) if args.daemon and is_serverless_mode: raise ValueError("daemon and serverless mode is mutually exclusive.") + config = Config(args.config_file) - utils.run_prereqs(config) - + if IS_PINTEREST: import pinlogger @@ -496,6 +489,10 @@ def main(): logging.basicConfig(filename=log_filename, level=config.get_log_level(), format='%(asctime)s %(name)s:%(lineno)d %(levelname)s %(message)s') + if not utils.check_prereqs(config): + log.warning("Deploy agent cannot start because the prerequisites on puppet did not meet.") + sys.exit(0) + log.info("Start to run deploy-agent.") # timing stats - agent start time create_sc_timing('deployd.stats.internal.time_start_sec', diff --git a/deploy-agent/deployd/client/client.py b/deploy-agent/deployd/client/client.py index a9c5f64b30..d57ec60b47 100644 --- a/deploy-agent/deployd/client/client.py +++ b/deploy-agent/deployd/client/client.py @@ -164,7 +164,8 @@ def _read_host_info(self): # Note: on U14, facter -p ec2_tags.Autoscaling does not work. # so need to read ec2_tags from facter and parse Autoscaling tag to cover this case if not self._autoscaling_group: - self._autoscaling_group = facter_data.get(ec2_tags_key, {}).get(asg_tag_key, None) + ec2_tags = facter_data.get(ec2_tags_key) + self._autoscaling_group = ec2_tags.get(asg_tag_key) if ec2_tags else None if not self._stage_type and not self._stage_type_fetched: self._stage_type = facter_data.get(stage_type_key, None) diff --git a/deploy-agent/deployd/common/config.py b/deploy-agent/deployd/common/config.py index d2b1501234..c966bc9995 100644 --- a/deploy-agent/deployd/common/config.py +++ b/deploy-agent/deployd/common/config.py @@ -233,9 +233,15 @@ def get_num_builds_retain(self): def respect_puppet(self): return self.get_intvar("respect_puppet", 0) - def get_puppet_file_path(self): + def get_puppet_state_file_path(self): return self.get_var("puppet_file_path", None) + def get_puppet_summary_file_path(self): + return self.get_var("puppet_summary_file_path", "/var/cache/puppet/state/last_run_summary.yaml") + + def get_puppet_exit_code_file_path(self): + return self.get_var("puppet_exit_code_file_path", "/var/log/puppet/puppet_exit_code") + def get_daemon_sleep_time(self): return self.get_intvar("daemon_sleep_time", 30) diff --git a/deploy-agent/deployd/common/single_instance.py b/deploy-agent/deployd/common/single_instance.py index 78438a512c..6ecd173029 100644 --- a/deploy-agent/deployd/common/single_instance.py +++ b/deploy-agent/deployd/common/single_instance.py @@ -48,7 +48,7 @@ def __init__(self): try: fcntl.lockf(lockfile_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) except IOError: - print(('Error: {0} may already be running. Only one instance of it ' + log.error(('Error: {0} may already be running. Only one instance of it ' 'can run at a time.').format(appname)) # noinspection PyTypeChecker os.close(lockfile_fd) diff --git a/deploy-agent/deployd/common/stats.py b/deploy-agent/deployd/common/stats.py index 3bea707f47..bdaa4d0378 100644 --- a/deploy-agent/deployd/common/stats.py +++ b/deploy-agent/deployd/common/stats.py @@ -13,11 +13,12 @@ # limitations under the License. import logging -from deployd import __version__, IS_PINTEREST, METRIC_PORT_HEALTH, METRIC_CACHE_PATH +from deployd import __version__, IS_PINTEREST, METRIC_PORT_HEALTH, METRIC_CACHE_PATH, STATSBOARD_URL import timeit import socket import json import os +import requests if IS_PINTEREST: from pinstatsd.statsd import sc, sc_v2 @@ -106,7 +107,20 @@ def create_sc_gauge(name, value, sample_rate=1.0, tags=None): else: return - +def send_statsboard_metric(name, value, tags=None): + tags['host'] = socket.gethostname() + tags_params = [f"{tag}={tags[tag]}" for tag in tags] + tags_str = ",".join(tags_params) + url = ( + f"{STATSBOARD_URL}put/" + f"{name}?value={value}" + f"&tags={tags_str}" + ) + + resp = requests.put(url) + if resp.status_code == 200: + log.info("Successfully send the metric to statsboard") + class MetricCacheConfigurationError(ValueError): """ Raised when MetricCache has missing configuration """ def __init__(self, name, value): diff --git a/deploy-agent/deployd/common/utils.py b/deploy-agent/deployd/common/utils.py index fa5dc4f486..e658c819cd 100644 --- a/deploy-agent/deployd/common/utils.py +++ b/deploy-agent/deployd/common/utils.py @@ -4,9 +4,9 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,14 +23,17 @@ import traceback import subprocess import yaml + + import json -from deployd import IS_PINTEREST, TELEFIG_BINARY -from deployd.common.stats import TimeElapsed, create_sc_increment, create_sc_timing +from deployd import IS_PINTEREST, PUPPET_SUCCESS_EXIT_CODES +from deployd.common.stats import TimeElapsed, create_sc_increment, create_sc_timing, send_statsboard_metric log = logging.getLogger(__name__) - # noinspection PyProtectedMember + + def exit_abruptly(status=0): """Exit method that just quits abruptly. @@ -92,6 +95,7 @@ def mkdir_p(path): else: raise + def uptime(): """ return int: seconds of uptime in int, default 0 """ sec = 0 @@ -101,6 +105,7 @@ def uptime(): sec = int(float(line[0])) return sec + def ensure_dirs(config): # make sure deployd directories exist mkdir_p(config.get_builds_directory()) @@ -108,18 +113,90 @@ def ensure_dirs(config): mkdir_p(config.get_log_directory()) -def run_prereqs(config): - # check if the puppet has finished or not +def is_first_run(config): + env_status_file = config.get_env_status_fn() + return not os.path.exists(env_status_file) + + +def check_prereqs(config): + """ + Check prerequisites before deploy agent can run + + :return: True all conditions meet else False + """ if IS_PINTEREST: respect_puppet = config.respect_puppet() - puppet_file_path = config.get_puppet_file_path() - if respect_puppet and \ - puppet_file_path is not None and \ - not os.path.exists(puppet_file_path): - print("Waiting for first puppet run.") - sys.exit(0) + # check if the puppet has finished successfully or not + if respect_puppet: + puppet_state_file_path = config.get_puppet_state_file_path() + if puppet_state_file_path and (not os.path.exists(puppet_state_file_path)): + log.error("Waiting for first puppet run.") + return False + if not check_first_puppet_run_success(config): + log.error("First puppet run failed.") + return False ensure_dirs(config) + return True + + +def get_puppet_exit_code(config): + """ + Get puppet exit code from the corresponding file + + :return: puppet exit code or 999 if file doesn't exist + """ + puppet_exit_code_file = config.get_puppet_exit_code_file_path() + try: + with open(puppet_exit_code_file, "rt") as f: + exit_code = f.readline().strip() + except Exception as e: + log.warning(f"Could not read {puppet_exit_code_file} file: {e}") + exit_code = 999 + + return exit_code + + +def load_puppet_summary(config): + """ + Load last_run_summary yaml file, parse results + + :return: returns a dict constructed from for the puppet summary file + """ + summary_file = config.get_puppet_summary_file_path() + summary = {} + if not os.path.exists(summary_file): + log.warning(f"{summary_file} does not exist. This could be the first puppet run") + return summary + + with open(summary_file) as f: + summary = yaml.safe_load(f) + return summary + + +def check_first_puppet_run_success(config): + """ + Check first puppet run success from exit code and last run summary + + :return: returns True if success else False + """ + if not is_first_run(config): + return True + + puppet_exit_code = get_puppet_exit_code(config) + if puppet_exit_code in PUPPET_SUCCESS_EXIT_CODES: + return True + + # If failed, double check with puppet last summary + puppet_summary = load_puppet_summary(config) + puppet_failures = puppet_summary.get('events', {}).get( + 'failure', None) if puppet_summary else None + log.info(f"Puppet failures: {puppet_failures}") + + if puppet_failures != 0: + send_statsboard_metric(name='deployd.first_puppet_failed', value=1, + tags={"puppet_exit_code": puppet_exit_code}) + return puppet_failures == 0 def get_info_from_facter(keys): @@ -127,10 +204,10 @@ def get_info_from_facter(keys): time_facter = TimeElapsed() # increment stats - facter calls create_sc_increment('deployd.stats.internal.facter_calls_sum', 1) - log.info("Fetching {} keys from facter".format(keys)) - cmd = ['facter', '-p', '-j'] + log.info(f"Fetching {keys} keys from facter") + cmd = ['facter', '-jp'] cmd.extend(keys) - output = subprocess.check_output(cmd) + output = subprocess.run(cmd, check=True, stdout=subprocess.PIPE).stdout # timing stats - facter run time create_sc_timing('deployd.stats.internal.time_elapsed_facter_calls_sec', time_facter.get()) @@ -142,13 +219,8 @@ def get_info_from_facter(keys): log.error("Failed to get info from facter by keys {}".format(keys)) return None + def check_not_none(arg, msg=None): if arg is None: raise ValueError(msg) return arg - -def check_telefig_unavailable_error(msg): - if not TELEFIG_BINARY: - return False - telefig_unavailable_error = "{}: No such file or directory".format(TELEFIG_BINARY) - return msg.find(telefig_unavailable_error) != -1 diff --git a/deploy-agent/deployd/types/__init__.py b/deploy-agent/deployd/types/__init__.py deleted file mode 100644 index ba524f8ad3..0000000000 --- a/deploy-agent/deployd/types/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2016 Pinterest, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - diff --git a/deploy-agent/tests/BUILD.bazel b/deploy-agent/tests/BUILD.bazel index d897996b92..6814450914 100644 --- a/deploy-agent/tests/BUILD.bazel +++ b/deploy-agent/tests/BUILD.bazel @@ -21,6 +21,13 @@ py_test( python_version = "PY3", ) +py_test( + name = "test_utils", + srcs = ['unit/deploy/common/test_utils.py'], + deps = ["test_lib"], + python_version = "PY3", +) + py_library( name = "test_lib", srcs = ["__init__.py"], diff --git a/deploy-agent/tests/unit/deploy/common/test_utils.py b/deploy-agent/tests/unit/deploy/common/test_utils.py new file mode 100644 index 0000000000..8f6238ff4d --- /dev/null +++ b/deploy-agent/tests/unit/deploy/common/test_utils.py @@ -0,0 +1,87 @@ +# Copyright 2016 Pinterest, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from unittest import mock +from tests import TestCase + +from deployd.common.utils import ensure_dirs, check_prereqs, check_first_puppet_run_success + +@mock.patch('deployd.common.utils.send_statsboard_metric', new=mock.Mock(return_value= None)) +class TestCommonUtils(TestCase): + + @classmethod + def setUpClass(cls): + cls.estatus = mock.Mock() + cls.estatus.load_envs = mock.Mock(return_value=None) + cls.config = mock.Mock() + cls.config.load_env_and_configs = mock.Mock() + cls.config.get_agent_directory = mock.Mock(return_value='/tmp/deployd/') + cls.config.get_builds_directory = mock.Mock(return_value='/tmp/deployd/builds/') + cls.config.get_log_directory = mock.Mock(return_value='/tmp/logs/') + cls.config.respect_puppet = mock.Mock(return_value=True) + cls.config.get_puppet_state_file_path = mock.Mock(return_value='/tmp/deployd') + ensure_dirs(cls.config) + + @mock.patch('deployd.common.utils.IS_PINTEREST', False) + def test_check_prereqs_not_pins(self): + result = check_prereqs(self.config) + self.assertTrue(result) + + @mock.patch('deployd.common.utils.IS_PINTEREST', True) + @mock.patch('deployd.common.utils.is_first_run', new=mock.Mock(return_value=True)) + @mock.patch('deployd.common.utils.load_puppet_summary', new=mock.Mock(return_value= {'events': {'failure': 0}})) + @mock.patch('deployd.common.utils.get_puppet_exit_code', new=mock.Mock(return_value= 5)) + def test_check_prereqs_no_failures(self): + first_puppet_run_result = check_first_puppet_run_success(self.config) + self.assertTrue(first_puppet_run_result) + result = check_prereqs(self.config) + self.assertTrue(result) + + @mock.patch('deployd.common.utils.IS_PINTEREST', True) + @mock.patch('deployd.common.utils.is_first_run', new=mock.Mock(return_value=True)) + @mock.patch('deployd.common.utils.load_puppet_summary', new=mock.Mock(return_value= {'events': {'failure': 0}})) + @mock.patch('deployd.common.utils.get_puppet_exit_code', new=mock.Mock(return_value= 999)) + def test_check_prereqs_no_exit_code_file(self): + result = check_prereqs(self.config) + self.assertTrue(result) + + @mock.patch('deployd.common.utils.IS_PINTEREST', True) + @mock.patch('deployd.common.utils.is_first_run', new=mock.Mock(return_value=True)) + @mock.patch('deployd.common.utils.load_puppet_summary', new=mock.Mock(return_value= {'events': {'failure': 3}})) + @mock.patch('deployd.common.utils.get_puppet_exit_code', new=mock.Mock(return_value= 999)) + def test_check_prereqs_with_failures(self): + first_puppet_run_result = check_first_puppet_run_success(self.config) + self.assertFalse(first_puppet_run_result) + result = check_prereqs(self.config) + self.assertFalse(result) + + @mock.patch('deployd.common.utils.IS_PINTEREST', True) + @mock.patch('deployd.common.utils.is_first_run', new=mock.Mock(return_value=False)) + @mock.patch('deployd.common.utils.load_puppet_summary', new=mock.Mock(return_value= {'events': {'failure': 2}})) + @mock.patch('deployd.common.utils.get_puppet_exit_code', new=mock.Mock(return_value= 999)) + def test_check_prereqs_no_state_file(self): + self.config.get_puppet_state_file_path = mock.Mock(return_value=None) + result = check_prereqs(self.config) + self.assertTrue(result) + + @mock.patch('deployd.common.utils.IS_PINTEREST', True) + @mock.patch('deployd.common.utils.is_first_run', new=mock.Mock(return_value=False)) + def test_check_prereqs_not_first_run(self): + result = check_prereqs(self.config) + self.assertTrue(result) + + +if __name__ == '__main__': + unittest.main() diff --git a/deploy-agent/tests/unit/deploy/server/test_agent.py b/deploy-agent/tests/unit/deploy/server/test_agent.py index 706df2eeb9..8c661a29ae 100644 --- a/deploy-agent/tests/unit/deploy/server/test_agent.py +++ b/deploy-agent/tests/unit/deploy/server/test_agent.py @@ -518,60 +518,6 @@ def test_send_deploy_status(self, mock_create_sc): self.assertEqual(agent._curr_report.report.deployStage, DeployStage.PRE_DOWNLOAD) self.assertEqual(agent._curr_report.report.status, AgentStatus.SUCCEEDED) - @mock.patch('deployd.agent.create_sc_increment') - def test_send_deploy_status_with_errors(self, mock_create_sc): - status = DeployStatus() - status.report = PingReport(jsonValue=self.deploy_goal1) - - envs = {'abc': status} - client = mock.Mock() - estatus = mock.Mock() - estatus.load_envs = mock.Mock(return_value=envs) - ping_response_list = [ - PingResponse(jsonValue=self.ping_response1), - PingResponse(jsonValue=self.ping_noop_response) - ] - mock_create_sc.return_value = None - - client = mock.Mock() - client.send_reports = mock.Mock(side_effect=ping_response_list) - agent = DeployAgent(client=client, estatus=estatus, conf=self.config, - executor=self.executor, helper=self.helper) - report = DeployReport(status_code=AgentStatus.UNKNOWN, error_code=1, output_msg="teletraan_config_manager error", retry_times=3) - agent.process_deploy= mock.Mock(side_effect=[report]) - agent.serve_build() - mock_create_sc.assert_called_once_with('deployd.stats.deploy.status', tags={ - 'first_run': False, 'deploy_stage': 'PRE_DOWNLOAD', 'env_name': 'abc', 'stage_name': 'beta', 'status_code': 'UNKNOWN', 'error_source': 'TELEFIG'}) - self.assertEqual(agent._curr_report.report.deployStage, DeployStage.PRE_DOWNLOAD) - self.assertEqual(agent._curr_report.report.status, AgentStatus.UNKNOWN) - - @mock.patch('deployd.agent.create_sc_increment') - @mock.patch('deployd.common.utils.TELEFIG_BINARY', "telefig") - def test_send_deploy_status_with_telefig_unavailable_errors(self, mock_create_sc): - status = DeployStatus() - status.report = PingReport(jsonValue=self.deploy_goal1) - - envs = {'abc': status} - client = mock.Mock() - estatus = mock.Mock() - estatus.load_envs = mock.Mock(return_value=envs) - ping_response_list = [ - PingResponse(jsonValue=self.ping_response1), - PingResponse(jsonValue=self.ping_noop_response) - ] - mock_create_sc.return_value = None - client = mock.Mock() - client.send_reports = mock.Mock(side_effect=ping_response_list) - agent = DeployAgent(client=client, estatus=estatus, conf=self.config, - executor=self.executor, helper=self.helper) - report = DeployReport(status_code=AgentStatus.UNKNOWN, error_code=1, output_msg="/usr/local/bin/telefig: No such file or directory", retry_times=3) - agent.process_deploy= mock.Mock(side_effect=[report]) - agent.serve_build() - mock_create_sc.assert_called_once_with('deployd.stats.deploy.status', tags={ - 'first_run': False, 'deploy_stage': 'PRE_DOWNLOAD', 'env_name': 'abc', 'stage_name': 'beta', 'status_code': 'UNKNOWN', 'error_source': 'TELEFIG', 'error':'TELEFIG_UNAVAILABLE'}) - self.assertEqual(agent._curr_report.report.deployStage, DeployStage.PRE_DOWNLOAD) - self.assertEqual(agent._curr_report.report.status, AgentStatus.UNKNOWN) - if __name__ == '__main__': unittest.main() diff --git a/deploy-agent/thirdparty/requirements.txt b/deploy-agent/thirdparty/requirements.txt index 7957ffd06b..0c8953df1d 100644 --- a/deploy-agent/thirdparty/requirements.txt +++ b/deploy-agent/thirdparty/requirements.txt @@ -1,11 +1,11 @@ -argparse==1.2.1 PyYAML==5.3.1 +argparse==1.2.1 configparser==4.0.2 +lockfile==0.10.2 pinlogger==0.53.0 pinstatsd==1.4.6 python-daemon==2.3.2 -requests==2.20.0 -setuptools==54.2.0 +requests==2.27.1 +setuptools==59.0.1 strictyaml==1.6.1 -zipp==1.2.0 -lockfile==0.10.2 \ No newline at end of file +zipp==1.2.0 \ No newline at end of file