Emit launch latency (#1696)

## Changes

## TAS

Previously, launch latency was emitted by a Rodimus worker. This PR implements the same measurement in the TAS. Launch latency is defined as the duration from host launch to the completion of the first deployment. It is measured for all environments deployed on new hosts. The configuration **Launch grace period** is the user-set threshold for this latency, and it is used by AgentJanitor and the Rodimus health check.

Another fix: I found that all first-deploy metrics were being reported as success, so I updated the condition under which the `first_deploy` flag is turned off.

## UI

While updating the group status page to include the launch latency, I realized I had forgotten that the original plan was to create a dashboard. So I removed the links added in #1694 and moved those changes into a new Teletraan user dashboard. I also fixed the launch failure rate graph and updated some metrics calculations.

<img width="1641" alt="image" src="https://github.com/user-attachments/assets/ef62bb3b-7748-4afa-b66a-f0a89111f8b6">

## Test plan

### TAS

1. Deploy this PR to TAS dev1
2. Launch a new host in tyler/test
   - [x] Launch latency should be emitted
   - [x] First deploy counter should increment with success=true.
3. Deploy a bad build to tyler/test
4. Launch a new host
   - [x] Launch latency should be emitted
   - [x] First deploy counter should increment with success=false.

### UI

1. Deploy this PR to deploy-board dev1
2. Visit group status page /groups/tyler-test/
   - [x] Verify the link is updated and working
pinterest · Aug 16, 2024 · 5de012e · 5de012e
1 parent 2e49483
commit 5de012e
Show file tree

Hide file tree

Showing 7 changed files with 139 additions and 115 deletions.
diff --git a/deploy-board/deploy_board/templates/groups/group_details.html b/deploy-board/deploy_board/templates/groups/group_details.html
@@ -87,29 +87,12 @@ <h4 class="panel-title pull-left pointer-cursor">
 
     <div id="metricStatId" class="collapse in panel-body">
         <div id="tsdLinksId" style="text-align: left;">
-            <h4>Check out these links for better visualization</h4>
+            <h4>Check out the dashboard for better visualization</h4>
             <a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
-                href="{{ group_size_url }}"
-                title="" data-original-title="Click to see more group size information in TSDB">
-                <strong>Group Size</strong>
+                href="{{ teletraan_user_dashboard_url }}"
+                title="" data-original-title="Click to see more in TSDB">
+                <strong>Teletraan user dashboard</strong>
             </a>
-            <a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
-                href="{{ provision_latency_url }}"
-                title="" data-original-title="Click to see more provision latency information in TSDB">
-                <strong>Provision Latency</strong>
-            </a>
-            {% for env in envs %}
-                <a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
-                    href="{{ env.firstDeployLatencyLink }}"
-                    title="" data-original-title="Click to see more first deploy latency information in TSDB">
-                    <strong>Deploy Latency for {{ env.envName }}</strong>
-                </a>
-                <a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
-                    href="{{ env.firstDeploySRLink }}"
-                    title="" data-original-title="Click to see more first deploy success rate information in TSDB">
-                    <strong>First deploy SR for {{ env.envName }}</strong>
-                </a>
-            {% endfor %}
         </div>
         <div id="groupStatsId" class="collapse in panel-body" style="text-align: center;">
             <div id="container" class="chartContainer">

diff --git a/deploy-board/deploy_board/templates/groups/launch_rate.tmpl b/deploy-board/deploy_board/templates/groups/launch_rate.tmpl
@@ -43,7 +43,7 @@
                         lineWidth: 2}
                 },
                 vAxes: {
-                    0: {title: 'Success Rate',
+                    0: {title: 'Failure Rate',
                         titleTextStyle: {italic: false}}
                 },
                 interpolateNulls: true,
@@ -65,7 +65,6 @@
             if (metric_names != null) {
                 for (var i = 0; i < 1; ++i) {
                     var metric_name = metric_names[i];
-                    data.addColumn("number", "Launch Failure Rate");
                     data_list = response[metric_name];
                     for (j = 0; j < data_list.length; ++j) {
                         var d = new Date(data_list[j][0]);

diff --git a/deploy-board/deploy_board/webapp/group_view.py b/deploy-board/deploy_board/webapp/group_view.py
@@ -1210,7 +1210,7 @@ class GroupDetailView(View):
         "begin": "1w",
         "reducer_interval": "10m",
     }
-    base_metric_url = "https://statsboard.pinadmin.com/build3?"
+    base_dashboard_url = "https://statsboard.pinadmin.com/d/teletraan_user/main"
 
     def get(self, request, group_name):
         autoscaling_summary = autoscaling_groups_helper.get_autoscaling_summary(request, group_name)
@@ -1222,10 +1222,6 @@ def get(self, request, group_name):
         disabled_actions = autoscaling_groups_helper.get_disabled_asg_actions(request, group_name)
         pas_config = autoscaling_groups_helper.get_pas_config(request, group_name)
 
-        for env in envs:
-            env['firstDeploySRLink'] = self.generate_first_deploy_success_rate_link(env)
-            env['firstDeployLatencyLink'] = self.generate_deploy_latency_link(env)
-
         if "Terminate" in disabled_actions:
             scaling_down_event_enabled = False
         else:
@@ -1247,60 +1243,21 @@ def get(self, request, group_name):
             "launch_config": launch_config,
             "pas_enabled": pas_config['pas_state'] if pas_config else False,
             "disallow_autoscaling": _disallow_autoscaling(curr_image),
-            "group_size_url": self.generate_group_size_url(group_name),
-            "provision_latency_url": self.generate_provision_latency_url(group_name),
+            "teletraan_user_dashboard_url": self.generate_dashboard_url(group_name, envs, group_info),
         })
 
-    def generate_deploy_latency_link(self, env):
-        params = {
-            "metrics": (
-                '{"cmd":"sd=(s-s.timeShift(1h)).nonNegative()\\nctd=(ct-ct.timeShift(1h)).nonNegative()\\nmean=sd/ctd\\nreturn max,mean","metrics":'
-                f'[{{"aggregator":"zimsum","alias":"s","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.deploy_latency.sum"}},'
-                f'{{"aggregator":"zimsum","alias":"ct","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.deploy_latency.count"}},'
-                f'{{"aggregator":"mimmax","alias":"max","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.deploy_latency.max"}}]}}'
-            ),
-            "settings": (
-                '{"appearance":{"mean":{"color":"#0000ff"},"max":{"color":"#ff8000"}},"title":"mean & max first deploy latency [1h window]","y_axis_label":"Latency","y_min":0,'
-                '"note":"The deploy latency is measured from the first deploy start to finish on a single host."}'
-            ),
-        }
-        params.update(self.default_params)
-        return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"
-
-    def generate_provision_latency_url(self, group_name):
+    def generate_dashboard_url(self, group, envs, group_info):
+        launch_latency_th = (
+            group_info.get("groupInfo", {}).get("launchLatencyTh", 600)
+            if group_info
+            else 600
+        )
+        env_arg = '|'.join([f'{env.get("envName")}.{env.get("stageName")}' for env in envs])
         params = {
-            "metrics": (
-                '{"cmd":"sd=(s-s.timeShift(1h)).nonNegative()\\nctd=(ct-ct.timeShift(1h)).nonNegative()\\nmean=sd/ctd\\nreturn max,mean","metrics":'
-                f'[{{"aggregator":"zimsum","alias":"s","metric":"teletraan.{group_name}.provision_latency.sum"}},'
-                f'{{"aggregator":"zimsum","alias":"ct","metric":"teletraan.{group_name}.provision_latency.count"}},'
-                f'{{"aggregator":"mimmax","alias":"max","metric":"teletraan.{group_name}.provision_latency.max"}}]}}'
-            ),
-            "settings": '{"appearance":{"mean":{"color":"#0000ff"},"max":{"color":"#ff8000"}},"title":"mean & max provision latency [1h window]","y_axis_label":"Latency","y_min":0,'
-            '"note":"The provision latency is measured from the host launch to the first Teletraan ping."}',
+            "tags": f"cluster={group},envs={env_arg},th={launch_latency_th}",
         }
         params.update(self.default_params)
-        return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"
-
-    def generate_first_deploy_success_rate_link(self, env):
-        params = {
-            "metrics": (
-                '{"cmd":"sd=(suc-suc.timeShift(1h)).nonNegative()\\ntotd=(tot-tot.timeShift(1h)).nonNegative()\\nsr=sd/totd*100\\nreturn sr","metrics":'
-                f'[{{"aggregator":"zimsum","alias":"suc","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.first_deploy","tags":{{"success":"true"}}}},'
-                f'{{"aggregator":"zimsum","alias":"tot","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.first_deploy"}}]}}'
-            ),
-            "settings": '{"appearance":{"sr":{"disabled":false,"stroke_style":"solid","color":"#00ff00"}},"title":"First deploy success rate [1h window]","y_max":105,"y_min":0}',
-        }
-        params.update(self.default_params)
-        return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"
-
-    def generate_group_size_url(self, group_name):
-        params = {
-            "metrics": f'{{"metrics":[{{"aggregator":"zimavg","alias":"size","metric":"autoscaling.{group_name}.size"}}]}}',
-            "settings": '{"appearance":{"d":{"color":"dodgerblue"}},"renderer":"line","title":"Group size","y_axis_label":"Group size","y_min":0}',
-        }
-        params.update(self.default_params)
-        return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"
-
+        return f"{self.base_dashboard_url}?{urllib.parse.urlencode(params)}"
 
 # generate aws related settings
 def get_aws_settings(request):

diff --git a/deploy-board/deploy_board/webapp/util_views.py b/deploy-board/deploy_board/webapp/util_views.py
@@ -160,7 +160,7 @@ def get_latency_metrics(request, group_name):
                                                                              "LAUNCH", settings.DEFAULT_START_TIME)
             json_data = []
             for data_point in launch_data_points:
-                timestamp, value = data_point["timestamp"], data_point["value"] / 1000
+                timestamp, value = data_point["timestamp"], data_point["value"]
                 json_data.append([timestamp, value])
             util_data[metric_name1] = json_data
 
@@ -169,7 +169,7 @@ def get_latency_metrics(request, group_name):
                                                                              "DEPLOY", settings.DEFAULT_START_TIME)
             json_data2 = []
             for data_point in deploy_data_points:
-                timestamp, value = data_point["timestamp"], data_point["value"] / 1000
+                timestamp, value = data_point["timestamp"], data_point["value"]
                 json_data2.append([timestamp, value])
             util_data[metric_name2] = json_data2
 
@@ -189,13 +189,13 @@ def get_launch_rate(request, group_name):
     try:
         util_data["metric_names"] = []
         for env in envs:
-            metric_name = "mimmax:rate:teletraan.{}.{}.first_deploy{{success=false}}".format(
+            metric_name = "zimsum:rate:teletraan.{}.{}.first_deploy{{success=false}}".format(
                 env["envName"], env["stageName"])
             rate_data_points = autoscaling_metrics_helper.get_raw_metrics(request, metric_name,
                                                                           settings.DEFAULT_START_TIME)
             json_data = []
             for data_point in rate_data_points:
-                timestamp, value = data_point["timestamp"], data_point["value"]
+                timestamp, value = data_point["timestamp"], data_point["value"] * 60
                 json_data.append([timestamp, value])
 
             util_data[metric_name] = json_data

diff --git a/deploy-service/common/src/main/java/com/pinterest/deployservice/handler/GoalAnalyst.java b/deploy-service/common/src/main/java/com/pinterest/deployservice/handler/GoalAnalyst.java
@@ -27,13 +27,15 @@
 import com.pinterest.deployservice.bean.DeployStage;
 import com.pinterest.deployservice.bean.DeployType;
 import com.pinterest.deployservice.bean.EnvironBean;
+import com.pinterest.deployservice.bean.HostBean;
 import com.pinterest.deployservice.bean.HostTagBean;
 import com.pinterest.deployservice.bean.PingReportBean;
 import com.pinterest.deployservice.common.Constants;
 import com.pinterest.deployservice.common.StateMachines;
 import com.pinterest.deployservice.dao.DeployConstraintDAO;
 import com.pinterest.deployservice.dao.DeployDAO;
 import com.pinterest.deployservice.dao.EnvironDAO;
+import com.pinterest.deployservice.dao.HostDAO;
 import com.pinterest.deployservice.dao.HostTagDAO;
 import io.micrometer.core.instrument.Metrics;
 import java.time.Duration;
@@ -56,6 +58,8 @@ public class GoalAnalyst {
     private static final int ROLL_BACK_PRIORITY = DeployPriority.HIGHER.getValue() - 10;
     private static final String DEPLOY_LATENCY_TIMER_NAME =
             CUSTOM_NAME_PREFIX + "teletraan.%s.%s.deploy_latency";
+    private static final String LAUNCH_LATENCY_TIMER_NAME =
+            CUSTOM_NAME_PREFIX + "teletraan.%s.%s.launch_latency";
     private static final String FIRST_DEPLOY_COUNTER_NAME =
             CUSTOM_NAME_PREFIX + "teletraan.%s.%s.first_deploy";
 
@@ -64,6 +68,7 @@ public class GoalAnalyst {
     private DeployDAO deployDAO;
     private HostTagDAO hostTagDAO;
     private DeployConstraintDAO deployConstraintDAO;
+    private HostDAO hostDAO;
 
     private String ec2Tags;
 
@@ -226,6 +231,7 @@ public String toString() {
             HostTagDAO hostTagDAO,
             DeployDAO deployDAO,
             EnvironDAO environDAO,
+            HostDAO hostDAO,
             String host,
             String host_id,
             Map<String, EnvironBean> envs,
@@ -241,6 +247,7 @@ public String toString() {
         this.ec2Tags = ec2Tags;
         this.hostTagDAO = hostTagDAO;
         this.deployConstraintDAO = deployConstraintDAO;
+        this.hostDAO = hostDAO;
 
         for (Map.Entry<String, AgentBean> entry : agents.entrySet()) {
             try {
@@ -428,7 +435,9 @@ AgentBean genUpdateBeanByReport(PingReportBean report, AgentBean agent) {
             updateBean.setStart_date(agent.getStart_date());
         }
 
-        if (report.getDeployStage() == DeployStage.SERVING_BUILD && updateBean.getFirst_deploy()) {
+        if (Boolean.TRUE.equals(updateBean.getFirst_deploy())
+                && (DeployStage.SERVING_BUILD.equals(report.getDeployStage())
+                        || AgentState.PAUSED_BY_SYSTEM.equals(updateBean.getState()))) {
             // turn off first deploy flag
             updateBean.setFirst_deploy(false);
             updateBean.setFirst_deploy_time(currentTime);
@@ -457,13 +466,38 @@ private void emitMetrics(AgentBean updateBean) {
                                     env.getEnv_name(),
                                     env.getStage_name()),
                             "success",
-                            String.valueOf(updateBean.getStatus().equals(AgentStatus.SUCCEEDED)))
+                            String.valueOf(AgentStatus.SUCCEEDED.equals(updateBean.getStatus())))
                     .increment();
+
+            Long hostStartTime = estimateHostStartTime(updateBean);
+            if (hostStartTime != null) {
+                Metrics.timer(
+                                String.format(
+                                        LAUNCH_LATENCY_TIMER_NAME,
+                                        env.getEnv_name(),
+                                        env.getStage_name()))
+                        .record(
+                                Duration.ofMillis(
+                                        updateBean.getFirst_deploy_time() - hostStartTime));
+            }
         } catch (Exception ex) {
             LOG.warn("Failed to emit metrics of {}", updateBean, ex);
         }
     }
 
+    private Long estimateHostStartTime(AgentBean agent) {
+        try {
+            List<HostBean> hosts = hostDAO.getHostsByHostId(agent.getHost_id());
+            if (!hosts.isEmpty()) {
+                return hosts.get(0).getCreate_date();
+            }
+        } catch (Exception ex) {
+            LOG.warn("Failed to get host with id {}", agent.getHost_id(), ex);
+        }
+
+        return null;
+    }
+
     // Generate new agent bean based on the report & current agent record,
     // This is intended to be used for deploy goal to install next stage
     AgentBean genNextStageUpdateBean(EnvironBean env, PingReportBean report, AgentBean agent) {

diff --git a/deploy-service/common/src/main/java/com/pinterest/deployservice/handler/PingHandler.java b/deploy-service/common/src/main/java/com/pinterest/deployservice/handler/PingHandler.java
@@ -861,6 +861,7 @@ public PingResult ping(PingRequestBean pingRequest, boolean rate_limited) throws
                         hostTagDAO,
                         deployDAO,
                         environDAO,
+                        hostDAO,
                         hostName,
                         hostId,
                         envs,