Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix and improve group status metrics graphs and dashboard URLs #1694

Merged
merged 4 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 29 additions & 28 deletions deploy-board/deploy_board/templates/groups/group_details.html
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ <h4 class="panel-title pull-left">Environments</h4>
{% endblock %}

{% block main %}
<!---- Group Metrics Panel --->
<!-- Group Metrics Panel -->
<div class="panel panel-default">
<div class="panel-heading clearfix">
<h4 class="panel-title pull-left pointer-cursor">
Expand All @@ -86,7 +86,32 @@ <h4 class="panel-title pull-left pointer-cursor">
</script>

<div id="metricStatId" class="collapse in panel-body">
<div align="center" id="groupStatsId" class="collapse in panel-body">
<div id="tsdLinksId" style="text-align: left;">
<h4>Check out these links for better visualization</h4>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ group_size_url }}"
title="" data-original-title="Click to see more group size information in TSDB">
<strong>Group Size</strong>
</a>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ provision_latency_url }}"
title="" data-original-title="Click to see more provision latency information in TSDB">
<strong>Provision Latency</strong>
</a>
{% for env in envs %}
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ env.firstDeployLatencyLink }}"
title="" data-original-title="Click to see more first deploy latency information in TSDB">
<strong>Deploy Latency for {{ env.envName }}</strong>
</a>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ env.firstDeploySRLink }}"
title="" data-original-title="Click to see more first deploy success rate information in TSDB">
<strong>First deploy SR for {{ env.envName }}</strong>
</a>
{% endfor %}
</div>
<div id="groupStatsId" class="collapse in panel-body" style="text-align: center;">
<div id="container" class="chartContainer">
<div id="line_latencystats"></div>
<div id="launch_rate_id"></div>
Expand All @@ -108,37 +133,13 @@ <h4 class="panel-title pull-left pointer-cursor">
</div>
<div id="loadGroupInfo"></div>
</div>
<div align="left" id="tsdLinksId">
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{group_size_url}}"
title="" data-original-title="Click to see more group size information in TSDB">
<strong>Group Size</strong>
</a>
{% for env in envs %}

<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{env.launchlatencylink}}"
title="" data-original-title="Click to see more launch latency information in TSDB">
<strong>Launch Latency for {{ env.envName }}</strong>
</a>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{env.deploylatencylink}}"
title="" data-original-title="Click to see more deploy latency information in TSDB">
<strong>Deploy Latency for {{ env.envName }}</strong>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{env.deployfailedlink}}"
title="" data-original-title="Click to see more launch failed count information in TSDB">
<strong>Launch failed count for {{ env.envName }}</strong>
</a>
{% endfor %}
</div>
</div>
</div>

<!--- launch instances button dialog-->
{% include "message_banner.tmpl" %}

<!---- Group Details Panel --->
<!--- Group Details Panel -->
{% if not scaling_down_event_enabled and asg_status == "ENABLED" %}
<div class="panel panel-warning">
{% elif asg_status == "DISABLED" %}
Expand All @@ -162,7 +163,7 @@ <h4 class="panel-title pull-left pointer-cursor">
</a>
</h4>

<!---- Buttons --->
<!--- Buttons -->
{% if not scaling_down_event_enabled and asg_status == "ENABLED" %}
<div class="btn-group pull-right">
<button type="button" class="deployToolTip btn btn-default btn-sm"
Expand Down
6 changes: 3 additions & 3 deletions deploy-board/deploy_board/templates/groups/launch_rate.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
var data = new google.visualization.DataTable();
data.addColumn("datetime", "Date");

var failure_count_name = "Launch Failure Count";
var failure_count_name = "Launch Failure Rate";
data.addColumn("number", failure_count_name);

var options = {
title: 'Launch Failure Count',
title: 'Launch Failure Rate',
titleFontSize: 15,
height: 300,
min: 0,
Expand Down Expand Up @@ -65,7 +65,7 @@
if (metric_names != null) {
for (var i = 0; i < 1; ++i) {
var metric_name = metric_names[i];
data.addColumn("number", "Launch Failure Count");
data.addColumn("number", "Launch Failure Rate");
data_list = response[metric_name];
for (j = 0; j < data_list.length; ++j) {
var d = new Date(data_list[j][0]);
Expand Down
85 changes: 60 additions & 25 deletions deploy-board/deploy_board/webapp/group_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,6 +1206,12 @@ def get(self, request, group_name):


class GroupDetailView(View):
# Query parameters merged into every URL built by the generate_* methods
# (applied last via params.update, so they override same-named keys).
default_params = {
"begin": "1w",
"reducer_interval": "10m",
}
# Prefix the URL-encoded query string is appended to; note it already ends in "?".
base_metric_url = "https://statsboard.pinadmin.com/build3?"

def get(self, request, group_name):
autoscaling_summary = autoscaling_groups_helper.get_autoscaling_summary(request, group_name)
if autoscaling_summary is None:
Expand All @@ -1215,32 +1221,10 @@ def get(self, request, group_name):
envs = environs_helper.get_all_envs_by_group(request, group_name)
disabled_actions = autoscaling_groups_helper.get_disabled_asg_actions(request, group_name)
pas_config = autoscaling_groups_helper.get_pas_config(request, group_name)
base_metric_url = "https://statsboard.pinadmin.com/build?"

group_size_url = base_metric_url+'''
{"renderer":"line","title":"Fleet Size", "yAxisLabel":"Group Size", "ymin":"0","from":"1w",
"metrics":[{"agg":"avg", "color":"dodgerblue","db":"tsdb", "dsValue":"10m", "renderer":"line",
"metric":"autoscaling.%s.size"}]}
''' % group_name

for env in envs:
env['launchlatencylink'] = base_metric_url + '''
{"renderer":"line", "yAxisLabel":"Launch Latency","ymin":"0","from":"1w",
"metrics":[{"agg":"avg", "color":"dodgerblue","db":"tsdb", "dsValue":"10m", "renderer":"line",
"metric":"autoscaling.%s.%s.launchlatency"}]}
''' % (env.get('envName'), env.get('stageName'))

env['deploylatencylink'] = base_metric_url + '''
{"renderer":"line", "yAxisLabel":"Deploy Latency", "ymin":"0","from":"1w",
"metrics":[{"agg":"avg", "color":"dodgerblue","db":"tsdb", "dsValue":"10m", "renderer":"line",
"metric":"autoscaling.%s.%s.deploylatency"}]}
''' % (env.get('envName'), env.get('stageName'))

env['deployfailedlink'] = base_metric_url + '''
{"renderer":"line", "yAxisLabel":"Launch Failed", "ymin":"0","from":"1w",
"metrics":[{"agg":"mimmax", "color":"dodgerblue","db":"tsdb", "dsValue":"10m", "renderer":"line",
"metric":"autoscaling.%s.%s.first_deploy.failed"}]}
''' % (env.get('envName'), env.get('stageName'))
env['firstDeploySRLink'] = self.generate_first_deploy_success_rate_link(env)
env['firstDeployLatencyLink'] = self.generate_deploy_latency_link(env)

if "Terminate" in disabled_actions:
scaling_down_event_enabled = False
Expand All @@ -1263,9 +1247,60 @@ def get(self, request, group_name):
"launch_config": launch_config,
"pas_enabled": pas_config['pas_state'] if pas_config else False,
"disallow_autoscaling": _disallow_autoscaling(curr_image),
"group_size_url": group_size_url,
"group_size_url": self.generate_group_size_url(group_name),
"provision_latency_url": self.generate_provision_latency_url(group_name),
})

def generate_deploy_latency_link(self, env):
    """Build the dashboard URL charting deploy latency for one environment.

    The query reads the ``teletraan.<envName>.<stageName>.deploy_latency``
    ``.sum``, ``.count`` and ``.max`` series and returns ``max`` and
    ``mean`` as computed by the ``cmd`` expression.

    The query JSON is serialized with ``json.dumps`` rather than assembled
    by string interpolation, so names containing quotes or backslashes
    still yield valid JSON. For ordinary names the output is identical to
    the hand-built string.

    Args:
        env: Mapping providing ``envName`` and ``stageName`` (read with
            ``.get``; a missing key is rendered as ``None``).

    Returns:
        ``self.base_metric_url`` followed by the URL-encoded parameters,
        with ``self.default_params`` merged in last.
    """
    import json  # local import keeps this method self-contained

    env_name = env.get("envName")
    stage_name = env.get("stageName")
    metric_prefix = f"teletraan.{env_name}.{stage_name}.deploy_latency"

    query = {
        "cmd": (
            "sd=(s-s.timeShift(1h)).nonNegative()\n"
            "ctd=(ct-ct.timeShift(1h)).nonNegative()\n"
            "mean=sd/ctd\n"
            "return max,mean"
        ),
        "metrics": [
            {"aggregator": "zimsum", "alias": "s", "metric": f"{metric_prefix}.sum"},
            {"aggregator": "zimsum", "alias": "ct", "metric": f"{metric_prefix}.count"},
            {"aggregator": "mimmax", "alias": "max", "metric": f"{metric_prefix}.max"},
        ],
    }
    params = {
        # Compact separators and ensure_ascii=False reproduce the exact
        # text the hand-written JSON used to produce.
        "metrics": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        # Static JSON with no interpolated values, kept as a literal.
        "settings": (
            '{"appearance":{"mean":{"color":"#0000ff"},"max":{"color":"#ff8000"}},"title":"mean & max first deploy latency [1h window]","y_axis_label":"Latency","y_min":0,'
            '"note":"The deploy latency is measured from the first deploy start to finish on a single host."}'
        ),
    }
    params.update(self.default_params)
    return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_provision_latency_url(self, group_name):
    """Build the dashboard URL charting provision latency for a group.

    The query reads the ``teletraan.<group_name>.provision_latency``
    ``.sum``, ``.count`` and ``.max`` series and returns ``max`` and
    ``mean`` as computed by the ``cmd`` expression.

    The query JSON is serialized with ``json.dumps`` rather than assembled
    by string interpolation, so a group name containing quotes or
    backslashes still yields valid JSON. For ordinary names the output is
    identical to the hand-built string.

    Args:
        group_name: Name interpolated into the metric names.

    Returns:
        ``self.base_metric_url`` followed by the URL-encoded parameters,
        with ``self.default_params`` merged in last.
    """
    import json  # local import keeps this method self-contained

    metric_prefix = f"teletraan.{group_name}.provision_latency"

    query = {
        "cmd": (
            "sd=(s-s.timeShift(1h)).nonNegative()\n"
            "ctd=(ct-ct.timeShift(1h)).nonNegative()\n"
            "mean=sd/ctd\n"
            "return max,mean"
        ),
        "metrics": [
            {"aggregator": "zimsum", "alias": "s", "metric": f"{metric_prefix}.sum"},
            {"aggregator": "zimsum", "alias": "ct", "metric": f"{metric_prefix}.count"},
            {"aggregator": "mimmax", "alias": "max", "metric": f"{metric_prefix}.max"},
        ],
    }
    params = {
        # Compact separators and ensure_ascii=False reproduce the exact
        # text the hand-written JSON used to produce.
        "metrics": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        # Static JSON with no interpolated values, kept as a literal.
        "settings": (
            '{"appearance":{"mean":{"color":"#0000ff"},"max":{"color":"#ff8000"}},"title":"mean & max provision latency [1h window]","y_axis_label":"Latency","y_min":0,'
            '"note":"The provision latency is measured from the host launch to the first Teletraan ping."}'
        ),
    }
    params.update(self.default_params)
    return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_first_deploy_success_rate_link(self, env):
    """Build the dashboard URL charting first-deploy success rate.

    The query reads ``teletraan.<envName>.<stageName>.first_deploy`` twice:
    once filtered by the tag ``success=true`` (alias ``suc``) and once
    unfiltered (alias ``tot``). It returns ``sr`` as computed by the
    ``cmd`` expression.

    The query JSON is serialized with ``json.dumps`` rather than assembled
    by string interpolation, so names containing quotes or backslashes
    still yield valid JSON. For ordinary names the output is identical to
    the hand-built string.

    Args:
        env: Mapping providing ``envName`` and ``stageName`` (read with
            ``.get``; a missing key is rendered as ``None``).

    Returns:
        ``self.base_metric_url`` followed by the URL-encoded parameters,
        with ``self.default_params`` merged in last.
    """
    import json  # local import keeps this method self-contained

    env_name = env.get("envName")
    stage_name = env.get("stageName")
    metric_name = f"teletraan.{env_name}.{stage_name}.first_deploy"

    query = {
        "cmd": (
            "sd=(suc-suc.timeShift(1h)).nonNegative()\n"
            "totd=(tot-tot.timeShift(1h)).nonNegative()\n"
            "sr=sd/totd*100\n"
            "return sr"
        ),
        "metrics": [
            {
                "aggregator": "zimsum",
                "alias": "suc",
                "metric": metric_name,
                "tags": {"success": "true"},
            },
            {"aggregator": "zimsum", "alias": "tot", "metric": metric_name},
        ],
    }
    params = {
        # Compact separators and ensure_ascii=False reproduce the exact
        # text the hand-written JSON used to produce.
        "metrics": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        # Static JSON with no interpolated values, kept as a literal.
        "settings": '{"appearance":{"sr":{"disabled":false,"stroke_style":"solid","color":"#00ff00"}},"title":"First deploy success rate [1h window]","y_max":105,"y_min":0}',
    }
    params.update(self.default_params)
    return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_group_size_url(self, group_name):
    """Build the dashboard URL charting the size of an autoscaling group.

    The query reads the ``autoscaling.<group_name>.size`` series with the
    ``zimavg`` aggregator under the alias ``size``.

    The query JSON is serialized with ``json.dumps`` rather than assembled
    by string interpolation, so a group name containing quotes or
    backslashes still yields valid JSON. For ordinary names the output is
    identical to the hand-built string.

    Args:
        group_name: Name interpolated into the metric name.

    Returns:
        ``self.base_metric_url`` followed by the URL-encoded parameters,
        with ``self.default_params`` merged in last.
    """
    import json  # local import keeps this method self-contained

    query = {
        "metrics": [
            {
                "aggregator": "zimavg",
                "alias": "size",
                "metric": f"autoscaling.{group_name}.size",
            },
        ],
    }
    params = {
        # Compact separators and ensure_ascii=False reproduce the exact
        # text the hand-written JSON used to produce.
        "metrics": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        # NOTE(review): the appearance key here is "d" while the series alias
        # above is "size"; the sibling generators key appearance by the
        # returned series name. Confirm whether "d" is intentional.
        "settings": '{"appearance":{"d":{"color":"dodgerblue"}},"renderer":"line","title":"Group size","y_axis_label":"Group size","y_min":0}',
    }
    params.update(self.default_params)
    return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"


# generate aws related settings
def get_aws_settings(request):
Expand Down
2 changes: 1 addition & 1 deletion deploy-board/deploy_board/webapp/util_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def get_launch_rate(request, group_name):
try:
util_data["metric_names"] = []
for env in envs:
metric_name = "mimmax:autoscaling.{}.{}.first_deploy.failed".format(
metric_name = "mimmax:rate:teletraan.{}.{}.first_deploy{{success=false}}".format(
env["envName"], env["stageName"])
rate_data_points = autoscaling_metrics_helper.get_raw_metrics(request, metric_name,
settings.DEFAULT_START_TIME)
Expand Down
Loading