Skip to content

Commit

Permalink
Emit launch latency (#1696)
Browse files Browse the repository at this point in the history
## Changes

## TAS
Previously launch latency was emitted by a Rodimus worker. This PR implements the same measure in the TAS. 

Launch latency is defined as the duration from host launch to the completion of the first deployment. It is measured for all environments deployed on new hosts. The **Launch grace period** configuration is the user-set threshold for this latency, and it is used by the AgentJanitor and Rodimus health checks.

Another fix: I found that all first-deploy metrics were being reported as successes, so I updated the condition under which the `first_deploy` flag is turned off.

## UI
While updating the group status page to include the launch latency, I realized I had forgotten that the original plan was to create a dashboard. So I removed the links added in #1694 and created a new Teletraan user dashboard that includes those changes.

Fixed launch failure rate graph and updated some metrics calculations.

<img width="1641" alt="image" src="https://github.com/user-attachments/assets/ef62bb3b-7748-4afa-b66a-f0a89111f8b6">

## Test plan
### TAS
1. Deploy this PR to TAS dev1
2. Launch a new host in tyler/test
    - [x] Launch latency should be emitted 
    - [x] First deploy counter should increment with success=true.
3. Deploy a bad build to tyler/test
4. Launch a new host
    - [x] Launch latency should be emitted
    - [x] First deploy counter should increment with success=false.

### UI
1. Deploy this PR to deploy-board dev1
2. Visit group status page /groups/tyler-test/
    - [x] verify the link is updated and working
  • Loading branch information
tylerwowen authored Aug 16, 2024
1 parent 2e49483 commit 5de012e
Show file tree
Hide file tree
Showing 7 changed files with 139 additions and 115 deletions.
25 changes: 4 additions & 21 deletions deploy-board/deploy_board/templates/groups/group_details.html
Original file line number Diff line number Diff line change
Expand Up @@ -87,29 +87,12 @@ <h4 class="panel-title pull-left pointer-cursor">

<div id="metricStatId" class="collapse in panel-body">
<div id="tsdLinksId" style="text-align: left;">
<h4>Check out these links for better visualization</h4>
<h4>Check out the dashboard for better visualization</h4>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ group_size_url }}"
title="" data-original-title="Click to see more group size information in TSDB">
<strong>Group Size</strong>
href="{{ teletraan_user_dashboard_url }}"
title="" data-original-title="Click to see more in TSDB">
<strong>Teletraan user dashboard</strong>
</a>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ provision_latency_url }}"
title="" data-original-title="Click to see more provision latency information in TSDB">
<strong>Provision Latency</strong>
</a>
{% for env in envs %}
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ env.firstDeployLatencyLink }}"
title="" data-original-title="Click to see more first deploy latency information in TSDB">
<strong>Deploy Latency for {{ env.envName }}</strong>
</a>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ env.firstDeploySRLink }}"
title="" data-original-title="Click to see more first deploy success rate information in TSDB">
<strong>First deploy SR for {{ env.envName }}</strong>
</a>
{% endfor %}
</div>
<div id="groupStatsId" class="collapse in panel-body" style="text-align: center;">
<div id="container" class="chartContainer">
Expand Down
3 changes: 1 addition & 2 deletions deploy-board/deploy_board/templates/groups/launch_rate.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
lineWidth: 2}
},
vAxes: {
0: {title: 'Success Rate',
0: {title: 'Failure Rate',
titleTextStyle: {italic: false}}
},
interpolateNulls: true,
Expand All @@ -65,7 +65,6 @@
if (metric_names != null) {
for (var i = 0; i < 1; ++i) {
var metric_name = metric_names[i];
data.addColumn("number", "Launch Failure Rate");
data_list = response[metric_name];
for (j = 0; j < data_list.length; ++j) {
var d = new Date(data_list[j][0]);
Expand Down
65 changes: 11 additions & 54 deletions deploy-board/deploy_board/webapp/group_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,7 +1210,7 @@ class GroupDetailView(View):
"begin": "1w",
"reducer_interval": "10m",
}
base_metric_url = "https://statsboard.pinadmin.com/build3?"
base_dashboard_url = "https://statsboard.pinadmin.com/d/teletraan_user/main"

def get(self, request, group_name):
autoscaling_summary = autoscaling_groups_helper.get_autoscaling_summary(request, group_name)
Expand All @@ -1222,10 +1222,6 @@ def get(self, request, group_name):
disabled_actions = autoscaling_groups_helper.get_disabled_asg_actions(request, group_name)
pas_config = autoscaling_groups_helper.get_pas_config(request, group_name)

for env in envs:
env['firstDeploySRLink'] = self.generate_first_deploy_success_rate_link(env)
env['firstDeployLatencyLink'] = self.generate_deploy_latency_link(env)

if "Terminate" in disabled_actions:
scaling_down_event_enabled = False
else:
Expand All @@ -1247,60 +1243,21 @@ def get(self, request, group_name):
"launch_config": launch_config,
"pas_enabled": pas_config['pas_state'] if pas_config else False,
"disallow_autoscaling": _disallow_autoscaling(curr_image),
"group_size_url": self.generate_group_size_url(group_name),
"provision_latency_url": self.generate_provision_latency_url(group_name),
"teletraan_user_dashboard_url": self.generate_dashboard_url(group_name, envs, group_info),
})

def generate_deploy_latency_link(self, env):
params = {
"metrics": (
'{"cmd":"sd=(s-s.timeShift(1h)).nonNegative()\\nctd=(ct-ct.timeShift(1h)).nonNegative()\\nmean=sd/ctd\\nreturn max,mean","metrics":'
f'[{{"aggregator":"zimsum","alias":"s","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.deploy_latency.sum"}},'
f'{{"aggregator":"zimsum","alias":"ct","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.deploy_latency.count"}},'
f'{{"aggregator":"mimmax","alias":"max","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.deploy_latency.max"}}]}}'
),
"settings": (
'{"appearance":{"mean":{"color":"#0000ff"},"max":{"color":"#ff8000"}},"title":"mean & max first deploy latency [1h window]","y_axis_label":"Latency","y_min":0,'
'"note":"The deploy latency is measured from the first deploy start to finish on a single host."}'
),
}
params.update(self.default_params)
return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_provision_latency_url(self, group_name):
def generate_dashboard_url(self, group, envs, group_info):
launch_latency_th = (
group_info.get("groupInfo", {}).get("launchLatencyTh", 600)
if group_info
else 600
)
env_arg = '|'.join([f'{env.get("envName")}.{env.get("stageName")}' for env in envs])
params = {
"metrics": (
'{"cmd":"sd=(s-s.timeShift(1h)).nonNegative()\\nctd=(ct-ct.timeShift(1h)).nonNegative()\\nmean=sd/ctd\\nreturn max,mean","metrics":'
f'[{{"aggregator":"zimsum","alias":"s","metric":"teletraan.{group_name}.provision_latency.sum"}},'
f'{{"aggregator":"zimsum","alias":"ct","metric":"teletraan.{group_name}.provision_latency.count"}},'
f'{{"aggregator":"mimmax","alias":"max","metric":"teletraan.{group_name}.provision_latency.max"}}]}}'
),
"settings": '{"appearance":{"mean":{"color":"#0000ff"},"max":{"color":"#ff8000"}},"title":"mean & max provision latency [1h window]","y_axis_label":"Latency","y_min":0,'
'"note":"The provision latency is measured from the host launch to the first Teletraan ping."}',
"tags": f"cluster={group},envs={env_arg},th={launch_latency_th}",
}
params.update(self.default_params)
return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_first_deploy_success_rate_link(self, env):
params = {
"metrics": (
'{"cmd":"sd=(suc-suc.timeShift(1h)).nonNegative()\\ntotd=(tot-tot.timeShift(1h)).nonNegative()\\nsr=sd/totd*100\\nreturn sr","metrics":'
f'[{{"aggregator":"zimsum","alias":"suc","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.first_deploy","tags":{{"success":"true"}}}},'
f'{{"aggregator":"zimsum","alias":"tot","metric":"teletraan.{env.get("envName")}.{env.get("stageName")}.first_deploy"}}]}}'
),
"settings": '{"appearance":{"sr":{"disabled":false,"stroke_style":"solid","color":"#00ff00"}},"title":"First deploy success rate [1h window]","y_max":105,"y_min":0}',
}
params.update(self.default_params)
return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_group_size_url(self, group_name):
params = {
"metrics": f'{{"metrics":[{{"aggregator":"zimavg","alias":"size","metric":"autoscaling.{group_name}.size"}}]}}',
"settings": '{"appearance":{"d":{"color":"dodgerblue"}},"renderer":"line","title":"Group size","y_axis_label":"Group size","y_min":0}',
}
params.update(self.default_params)
return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

return f"{self.base_dashboard_url}?{urllib.parse.urlencode(params)}"

# generate aws related settings
def get_aws_settings(request):
Expand Down
8 changes: 4 additions & 4 deletions deploy-board/deploy_board/webapp/util_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def get_latency_metrics(request, group_name):
"LAUNCH", settings.DEFAULT_START_TIME)
json_data = []
for data_point in launch_data_points:
timestamp, value = data_point["timestamp"], data_point["value"] / 1000
timestamp, value = data_point["timestamp"], data_point["value"]
json_data.append([timestamp, value])
util_data[metric_name1] = json_data

Expand All @@ -169,7 +169,7 @@ def get_latency_metrics(request, group_name):
"DEPLOY", settings.DEFAULT_START_TIME)
json_data2 = []
for data_point in deploy_data_points:
timestamp, value = data_point["timestamp"], data_point["value"] / 1000
timestamp, value = data_point["timestamp"], data_point["value"]
json_data2.append([timestamp, value])
util_data[metric_name2] = json_data2

Expand All @@ -189,13 +189,13 @@ def get_launch_rate(request, group_name):
try:
util_data["metric_names"] = []
for env in envs:
metric_name = "mimmax:rate:teletraan.{}.{}.first_deploy{{success=false}}".format(
metric_name = "zimsum:rate:teletraan.{}.{}.first_deploy{{success=false}}".format(
env["envName"], env["stageName"])
rate_data_points = autoscaling_metrics_helper.get_raw_metrics(request, metric_name,
settings.DEFAULT_START_TIME)
json_data = []
for data_point in rate_data_points:
timestamp, value = data_point["timestamp"], data_point["value"]
timestamp, value = data_point["timestamp"], data_point["value"] * 60
json_data.append([timestamp, value])

util_data[metric_name] = json_data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@
import com.pinterest.deployservice.bean.DeployStage;
import com.pinterest.deployservice.bean.DeployType;
import com.pinterest.deployservice.bean.EnvironBean;
import com.pinterest.deployservice.bean.HostBean;
import com.pinterest.deployservice.bean.HostTagBean;
import com.pinterest.deployservice.bean.PingReportBean;
import com.pinterest.deployservice.common.Constants;
import com.pinterest.deployservice.common.StateMachines;
import com.pinterest.deployservice.dao.DeployConstraintDAO;
import com.pinterest.deployservice.dao.DeployDAO;
import com.pinterest.deployservice.dao.EnvironDAO;
import com.pinterest.deployservice.dao.HostDAO;
import com.pinterest.deployservice.dao.HostTagDAO;
import io.micrometer.core.instrument.Metrics;
import java.time.Duration;
Expand All @@ -56,6 +58,8 @@ public class GoalAnalyst {
private static final int ROLL_BACK_PRIORITY = DeployPriority.HIGHER.getValue() - 10;
private static final String DEPLOY_LATENCY_TIMER_NAME =
CUSTOM_NAME_PREFIX + "teletraan.%s.%s.deploy_latency";
private static final String LAUNCH_LATENCY_TIMER_NAME =
CUSTOM_NAME_PREFIX + "teletraan.%s.%s.launch_latency";
private static final String FIRST_DEPLOY_COUNTER_NAME =
CUSTOM_NAME_PREFIX + "teletraan.%s.%s.first_deploy";

Expand All @@ -64,6 +68,7 @@ public class GoalAnalyst {
private DeployDAO deployDAO;
private HostTagDAO hostTagDAO;
private DeployConstraintDAO deployConstraintDAO;
private HostDAO hostDAO;

private String ec2Tags;

Expand Down Expand Up @@ -226,6 +231,7 @@ public String toString() {
HostTagDAO hostTagDAO,
DeployDAO deployDAO,
EnvironDAO environDAO,
HostDAO hostDAO,
String host,
String host_id,
Map<String, EnvironBean> envs,
Expand All @@ -241,6 +247,7 @@ public String toString() {
this.ec2Tags = ec2Tags;
this.hostTagDAO = hostTagDAO;
this.deployConstraintDAO = deployConstraintDAO;
this.hostDAO = hostDAO;

for (Map.Entry<String, AgentBean> entry : agents.entrySet()) {
try {
Expand Down Expand Up @@ -428,7 +435,9 @@ AgentBean genUpdateBeanByReport(PingReportBean report, AgentBean agent) {
updateBean.setStart_date(agent.getStart_date());
}

if (report.getDeployStage() == DeployStage.SERVING_BUILD && updateBean.getFirst_deploy()) {
if (Boolean.TRUE.equals(updateBean.getFirst_deploy())
&& (DeployStage.SERVING_BUILD.equals(report.getDeployStage())
|| AgentState.PAUSED_BY_SYSTEM.equals(updateBean.getState()))) {
// turn off first deploy flag
updateBean.setFirst_deploy(false);
updateBean.setFirst_deploy_time(currentTime);
Expand Down Expand Up @@ -457,13 +466,38 @@ private void emitMetrics(AgentBean updateBean) {
env.getEnv_name(),
env.getStage_name()),
"success",
String.valueOf(updateBean.getStatus().equals(AgentStatus.SUCCEEDED)))
String.valueOf(AgentStatus.SUCCEEDED.equals(updateBean.getStatus())))
.increment();

Long hostStartTime = estimateHostStartTime(updateBean);
if (hostStartTime != null) {
Metrics.timer(
String.format(
LAUNCH_LATENCY_TIMER_NAME,
env.getEnv_name(),
env.getStage_name()))
.record(
Duration.ofMillis(
updateBean.getFirst_deploy_time() - hostStartTime));
}
} catch (Exception ex) {
LOG.warn("Failed to emit metrics of {}", updateBean, ex);
}
}

/**
 * Estimates when the agent's host started, based on the stored host record.
 *
 * <p>Queries {@code hostDAO} for the records matching the agent's host id and returns the create
 * date of the first record. A failing lookup is logged as a warning and is not propagated.
 *
 * @param agent the agent whose host id is used for the lookup
 * @return the create date of the first matching host record, or {@code null} when no record is
 *     found or the lookup throws
 */
private Long estimateHostStartTime(AgentBean agent) {
    Long startTime = null;
    try {
        List<HostBean> hostRecords = hostDAO.getHostsByHostId(agent.getHost_id());
        if (hostRecords.size() > 0) {
            // Only the first returned record is consulted.
            startTime = hostRecords.get(0).getCreate_date();
        }
    } catch (Exception ex) {
        LOG.warn("Failed to get host with id {}", agent.getHost_id(), ex);
    }
    return startTime;
}

// Generate new agent bean based on the report & current agent record,
// This is intended to be used for deploy goal to install next stage
AgentBean genNextStageUpdateBean(EnvironBean env, PingReportBean report, AgentBean agent) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,7 @@ public PingResult ping(PingRequestBean pingRequest, boolean rate_limited) throws
hostTagDAO,
deployDAO,
environDAO,
hostDAO,
hostName,
hostId,
envs,
Expand Down
Loading

0 comments on commit 5de012e

Please sign in to comment.