Deploy agent wait for first successful puppet run (#1208)

* Reinforce the deploy agent pre-run conditions
* Add unit tests and a Bazel target
* Update deploy-agent/deployd/client/client.py

Co-authored-by: Tyler Ouyang <[email protected]>
pinterest · Jun 21, 2023 · d8d74f5 · d8d74f5
1 parent 66f1179
commit d8d74f5
Show file tree

Hide file tree

Showing 12 changed files with 236 additions and 114 deletions.
diff --git a/deploy-agent/deployd/__init__.py b/deploy-agent/deployd/__init__.py
@@ -18,6 +18,12 @@
 IS_PINTEREST = True if os.getenv("IS_PINTEREST", "false") == "true" else False
 METRIC_PORT_HEALTH = int(os.getenv('METRIC_PORT_HEALTH')) if os.getenv('METRIC_PORT_HEALTH', False) else None
 METRIC_CACHE_PATH = os.getenv('METRIC_CACHE_PATH', None)
-TELEFIG_BINARY = os.getenv('TELEFIG_BINARY', "")
+TELEFIG_BINARY = os.getenv('TELEFIG_BINARY', None)
+MAIN_LOGGER = "deployd"
+STATSBOARD_URL=os.getenv('STATSBOARD_URL', "https://statsboard.pinadmin.com/api/v1/")
 
-__version__ = '1.2.42'
+# 0: puppet applied successfully with no changes
+# 2: puppet applied successfully with changes
+PUPPET_SUCCESS_EXIT_CODES = [0, 2]
+
+__version__ = '1.2.43'
diff --git a/deploy-agent/deployd/agent.py b/deploy-agent/deployd/agent.py
@@ -16,6 +16,7 @@
 import daemon
 import logging
 import os
+import sys
 from random import randrange
 import time
 import traceback
@@ -32,11 +33,9 @@
 from deployd.common.executor import Executor
 from deployd.common.types import DeployReport, PingStatus, DeployStatus, OpCode, \
     DeployError, DeployErrorSource, DeployStage, AgentStatus
-from deployd.common.utils import check_telefig_unavailable_error
-from deployd import IS_PINTEREST
-
-log = logging.getLogger(__name__)
+from deployd import IS_PINTEREST, MAIN_LOGGER
 
+log = logging.getLogger(MAIN_LOGGER)
 
 class PingServer(object):
     def __init__(self, ag):
@@ -112,12 +111,6 @@ def _send_deploy_status_stats(self, deploy_report):
             tags['stage_name'] = self._response.deployGoal.stageName
         if deploy_report.status_code:
             tags['status_code'] = deploy_report.status_code
-        if deploy_report.output_msg: 
-            if check_telefig_unavailable_error(deploy_report.output_msg):
-                tags['error_source'] = DeployErrorSource.TELEFIG
-                tags['error'] = DeployError.TELEFIG_UNAVAILABLE
-            elif deploy_report.output_msg.find("teletraan_config_manager") != -1:
-                tags['error_source'] = DeployErrorSource.TELEFIG
 
         create_sc_increment('deployd.stats.deploy.status', tags=tags)
 
@@ -483,9 +476,9 @@ def main():
     is_serverless_mode = AgentRunMode.is_serverless(args.mode)
     if args.daemon and is_serverless_mode:
         raise ValueError("daemon and serverless mode is mutually exclusive.")
+
     config = Config(args.config_file)
-    utils.run_prereqs(config)
-
+
     if IS_PINTEREST:
         import pinlogger
 
@@ -496,6 +489,10 @@ def main():
         logging.basicConfig(filename=log_filename, level=config.get_log_level(),
                             format='%(asctime)s %(name)s:%(lineno)d %(levelname)s %(message)s')
 
+    if not utils.check_prereqs(config): 
+        log.warning("Deploy agent cannot start because the prerequisites on puppet did not meet.")
+        sys.exit(0)
+
     log.info("Start to run deploy-agent.")
     # timing stats - agent start time
     create_sc_timing('deployd.stats.internal.time_start_sec',

diff --git a/deploy-agent/deployd/client/client.py b/deploy-agent/deployd/client/client.py
@@ -164,7 +164,8 @@ def _read_host_info(self):
             # Note: on U14, facter -p ec2_tags.Autoscaling does not work.
             # so need to read ec2_tags from facter and parse Autoscaling tag to cover this case
             if not self._autoscaling_group:
-                self._autoscaling_group = facter_data.get(ec2_tags_key, {}).get(asg_tag_key, None)
+                ec2_tags = facter_data.get(ec2_tags_key)
+                self._autoscaling_group = ec2_tags.get(asg_tag_key) if ec2_tags else None
 
             if not self._stage_type and not self._stage_type_fetched:
                 self._stage_type = facter_data.get(stage_type_key, None)

diff --git a/deploy-agent/deployd/common/config.py b/deploy-agent/deployd/common/config.py
@@ -233,9 +233,15 @@ def get_num_builds_retain(self):
     def respect_puppet(self):
         return self.get_intvar("respect_puppet", 0)
 
-    def get_puppet_file_path(self):
+    def get_puppet_state_file_path(self):
+        # Still reads the "puppet_file_path" config key used before this
+        # method was renamed; None is passed as the default value.
         return self.get_var("puppet_file_path", None)
 
+    def get_puppet_summary_file_path(self):
+        """Return the configured path of the puppet last-run summary file.
+
+        Looks up the "puppet_summary_file_path" variable, passing the
+        last_run_summary.yaml location below as the default.
+        """
+        default_path = "/var/cache/puppet/state/last_run_summary.yaml"
+        return self.get_var("puppet_summary_file_path", default_path)
+
+    def get_puppet_exit_code_file_path(self):
+        """Return the configured path of the file holding puppet's exit code.
+
+        Looks up the "puppet_exit_code_file_path" variable, passing the
+        /var/log/puppet location below as the default.
+        """
+        default_path = "/var/log/puppet/puppet_exit_code"
+        return self.get_var("puppet_exit_code_file_path", default_path)
+
     def get_daemon_sleep_time(self):
         return self.get_intvar("daemon_sleep_time", 30)
 

diff --git a/deploy-agent/deployd/common/single_instance.py b/deploy-agent/deployd/common/single_instance.py
@@ -48,7 +48,7 @@ def __init__(self):
         try:
             fcntl.lockf(lockfile_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
         except IOError:
-            print(('Error: {0} may already be running. Only one instance of it '
+            log.error(('Error: {0} may already be running. Only one instance of it '
                    'can run at a time.').format(appname))
             # noinspection PyTypeChecker
             os.close(lockfile_fd)

diff --git a/deploy-agent/deployd/common/stats.py b/deploy-agent/deployd/common/stats.py
@@ -13,11 +13,12 @@
 # limitations under the License.
 
 import logging
-from deployd import __version__, IS_PINTEREST, METRIC_PORT_HEALTH, METRIC_CACHE_PATH
+from deployd import __version__, IS_PINTEREST, METRIC_PORT_HEALTH, METRIC_CACHE_PATH, STATSBOARD_URL
 import timeit
 import socket
 import json
 import os
+import requests 
 
 if IS_PINTEREST:
     from pinstatsd.statsd import sc, sc_v2
@@ -106,7 +107,20 @@ def create_sc_gauge(name, value, sample_rate=1.0, tags=None):
     else:
         return
 
-
+def send_statsboard_metric(name, value, tags=None):
+    """Send a single metric data point to statsboard over HTTP PUT.
+
+    Sending is best effort: connection problems and non-200 responses are
+    logged and never raised to the caller.
+
+    :param name: metric name, placed in the URL path after "put/"
+    :param value: metric value, sent as the "value" query parameter
+    :param tags: optional dict of tag names to values; a "host" tag holding
+        this machine's hostname is always added. The caller's dict is not
+        modified.
+    """
+    # Work on a copy: tags may be None, and the caller's dict must not gain
+    # a "host" key as a side effect.
+    all_tags = dict(tags) if tags else {}
+    all_tags['host'] = socket.gethostname()
+    tags_str = ",".join(f"{tag}={tag_value}" for tag, tag_value in all_tags.items())
+    url = (
+        f"{STATSBOARD_URL}put/"
+        f"{name}?value={value}"
+        f"&tags={tags_str}"
+    )
+
+    # Bound the wait and keep a statsboard outage from propagating into the
+    # code path that merely wanted to report a metric.
+    try:
+        resp = requests.put(url, timeout=10)
+    except requests.RequestException as e:
+        log.warning(f"Failed to send metric {name} to statsboard: {e}")
+        return
+
+    if resp.status_code == 200:
+        log.info("Successfully send the metric to statsboard")
+    else:
+        log.warning(f"Statsboard returned status {resp.status_code} for metric {name}")
+
 class MetricCacheConfigurationError(ValueError):
     """ Raised when MetricCache has missing configuration """
     def __init__(self, name, value):

diff --git a/deploy-agent/deployd/common/utils.py b/deploy-agent/deployd/common/utils.py
@@ -4,9 +4,9 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#  
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-#    
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,14 +23,17 @@
 import traceback
 import subprocess
 import yaml
+
+
 import json
-from deployd import IS_PINTEREST, TELEFIG_BINARY
-from deployd.common.stats import TimeElapsed, create_sc_increment, create_sc_timing
+from deployd import IS_PINTEREST, PUPPET_SUCCESS_EXIT_CODES
+from deployd.common.stats import TimeElapsed, create_sc_increment, create_sc_timing, send_statsboard_metric
 
 log = logging.getLogger(__name__)
 
-
 # noinspection PyProtectedMember
+
+
 def exit_abruptly(status=0):
     """Exit method that just quits abruptly.
 
@@ -92,6 +95,7 @@ def mkdir_p(path):
         else:
             raise
 
+
 def uptime():
     """ return int: seconds of uptime in int, default 0 """
     sec = 0
@@ -101,36 +105,109 @@ def uptime():
             sec = int(float(line[0]))
     return sec
 
+
 def ensure_dirs(config):
     # make sure deployd directories exist
     mkdir_p(config.get_builds_directory())
     mkdir_p(config.get_agent_directory())
     mkdir_p(config.get_log_directory())
 
 
-def run_prereqs(config):
-    # check if the puppet has finished or not
+def is_first_run(config):
+    """
+    Tell whether the env status file has not been created yet
+
+    :return: True if the path from config.get_env_status_fn() does not exist
+    """
+    if os.path.exists(config.get_env_status_fn()):
+        return False
+    return True
+
+
+def check_prereqs(config):
+    """
+    Check the prerequisites that must hold before the deploy agent can run
+
+    When IS_PINTEREST is set and config.respect_puppet() is truthy, the
+    puppet state file (if a path is configured) must exist and
+    check_first_puppet_run_success() must pass. The deployd directories are
+    created through ensure_dirs() only when no check has failed.
+
+    :return: True if all conditions are met, else False
+    """
     if IS_PINTEREST:
         respect_puppet = config.respect_puppet()
-        puppet_file_path = config.get_puppet_file_path()
-        if respect_puppet and \
-           puppet_file_path is not None and \
-           not os.path.exists(puppet_file_path):
-            print("Waiting for first puppet run.")
-            sys.exit(0)
+        # check if the puppet has finished successfully or not
+        if respect_puppet:
+            puppet_state_file_path = config.get_puppet_state_file_path()
+            # a configured state file that is missing is treated as puppet
+            # not having completed its first run yet
+            if puppet_state_file_path and (not os.path.exists(puppet_state_file_path)):
+                log.error("Waiting for first puppet run.")
+                return False
+            if not check_first_puppet_run_success(config):
+                log.error("First puppet run failed.")
+                return False
 
     ensure_dirs(config)
+    return True
+
+
+def get_puppet_exit_code(config):
+    """
+    Get puppet exit code from the corresponding file
+
+    The code is returned as an int so it can be compared against integer
+    exit codes such as PUPPET_SUCCESS_EXIT_CODES.
+
+    :return: puppet exit code as an int, or 999 if the file cannot be read
+             or its first line is not an integer
+    """
+    puppet_exit_code_file = config.get_puppet_exit_code_file_path()
+    try:
+        with open(puppet_exit_code_file, "rt") as f:
+            # The file holds text; without the conversion "0" would never
+            # compare equal to the integer 0.
+            exit_code = int(f.readline().strip())
+    except (OSError, ValueError, TypeError) as e:
+        log.warning(f"Could not read {puppet_exit_code_file} file: {e}")
+        exit_code = 999
+
+    return exit_code
+
+
+def load_puppet_summary(config):
+    """
+    Load and parse the puppet last_run_summary yaml file
+
+    :return: a dict built from the puppet summary file; an empty dict if the
+             file does not exist or does not contain a yaml mapping
+    """
+    summary_file = config.get_puppet_summary_file_path()
+    if not os.path.exists(summary_file):
+        log.warning(f"{summary_file} does not exist. This could be the first puppet run")
+        return {}
+
+    with open(summary_file) as f:
+        summary = yaml.safe_load(f)
+
+    # safe_load yields None for an empty file and may yield a scalar or a
+    # list for unexpected content; callers expect a dict back.
+    if not isinstance(summary, dict):
+        log.warning(f"{summary_file} does not contain a yaml mapping")
+        return {}
+    return summary
+
+
+def check_first_puppet_run_success(config):
+    """
+    Decide whether the first puppet run succeeded
+
+    Anything other than a first run counts as success. On a first run the
+    puppet exit code is checked first; if it is not a success code, the
+    failure count from the last run summary decides, and a statsboard
+    metric is sent when that count is not 0.
+
+    :return: returns True if success else False
+    """
+    if not is_first_run(config):
+        return True
+
+    exit_code = get_puppet_exit_code(config)
+    if exit_code in PUPPET_SUCCESS_EXIT_CODES:
+        return True
+
+    # Exit code did not indicate success: fall back to the last run summary
+    summary = load_puppet_summary(config)
+    if summary:
+        failure_count = summary.get('events', {}).get('failure', None)
+    else:
+        failure_count = None
+    log.info(f"Puppet failures: {failure_count}")
+
+    succeeded = failure_count == 0
+    if not succeeded:
+        send_statsboard_metric(name='deployd.first_puppet_failed', value=1,
+                               tags={"puppet_exit_code": exit_code})
+    return succeeded
 
 
 def get_info_from_facter(keys):
     try:
         time_facter = TimeElapsed()
         # increment stats - facter calls
         create_sc_increment('deployd.stats.internal.facter_calls_sum', 1)
-        log.info("Fetching {} keys from facter".format(keys))
-        cmd = ['facter', '-p', '-j']
+        log.info(f"Fetching {keys} keys from facter")
+        cmd = ['facter', '-jp']
         cmd.extend(keys)
-        output = subprocess.check_output(cmd)
+        output = subprocess.run(cmd, check=True, stdout=subprocess.PIPE).stdout
         # timing stats - facter run time
         create_sc_timing('deployd.stats.internal.time_elapsed_facter_calls_sec',
                          time_facter.get())
@@ -142,13 +219,8 @@ def get_info_from_facter(keys):
         log.error("Failed to get info from facter by keys {}".format(keys))
         return None
 
+
 def check_not_none(arg, msg=None):
     if arg is None:
         raise ValueError(msg)
     return arg
-
-def check_telefig_unavailable_error(msg):
-    if not TELEFIG_BINARY:
-        return False
-    telefig_unavailable_error = "{}: No such file or directory".format(TELEFIG_BINARY)
-    return msg.find(telefig_unavailable_error) != -1
diff --git a/deploy-agent/deployd/types/__init__.py b/deploy-agent/deployd/types/__init__.py
diff --git a/deploy-agent/tests/BUILD.bazel b/deploy-agent/tests/BUILD.bazel
@@ -21,6 +21,13 @@ py_test(
     python_version = "PY3",
 )
 
+py_test(
+    name = "test_utils",
+    srcs = ['unit/deploy/common/test_utils.py'],
+    deps = ["test_lib"],
+    python_version = "PY3",
+)
+
 py_library(
     name = "test_lib",
     srcs = ["__init__.py"],