Skip to content

Commit

Permalink
crf: break metrics and track the early start instead of success
Browse files Browse the repository at this point in the history
  • Loading branch information
fantix committed Oct 23, 2024
1 parent 04ae6e7 commit 6c25862
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 36 deletions.
40 changes: 32 additions & 8 deletions edb/server/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,14 +215,38 @@
'Total number of the main multi-tenant config file reload errors.',
)

mt_tenant_successful_actions = registry.new_labeled_counter(
'mt_tenant_successful_actions_total',
'Number of successful tenant actions (add/remove/reload).',
labels=("tenant", "action"),
mt_tenant_add_total = registry.new_labeled_counter(
'mt_tenant_add_total',
'Total number of new tenants the server attempted to add.',
labels=("tenant",),
)

mt_tenant_add_errors = registry.new_labeled_counter(
'mt_tenant_add_errors_total',
'Total number of tenants the server failed to add.',
labels=("tenant",),
)

mt_tenant_remove_total = registry.new_labeled_counter(
'mt_tenant_remove_total',
'Total number of tenants the server attempted to remove.',
labels=("tenant",),
)

mt_tenant_action_errors = registry.new_labeled_counter(
'mt_tenant_action_errors_total',
'Number of failed tenant actions (add/remove/reload).',
labels=("tenant", "action"),
mt_tenant_remove_errors = registry.new_labeled_counter(
'mt_tenant_remove_errors_total',
'Total number of tenants the server failed to remove.',
labels=("tenant",),
)

mt_tenant_reload_total = registry.new_labeled_counter(
'mt_tenant_reload_total',
'Total number of tenants the server attempted to reload.',
labels=("tenant",),
)

mt_tenant_reload_errors = registry.new_labeled_counter(
'mt_tenant_reload_errors_total',
'Total number of tenants the server failed to reload.',
labels=("tenant",),
)
28 changes: 12 additions & 16 deletions edb/server/multitenant.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ def _warn(e):

async def _add_tenant():
current_tenant.set(conf["instance-name"])
metrics.mt_tenant_add_total.inc(1.0, current_tenant.get())
rloop = retryloop.RetryLoop(
backoff=retryloop.exp_backoff(),
timeout=300,
Expand All @@ -320,9 +321,6 @@ async def _add_tenant():
tenant = await self._create_tenant(conf)
self._tenants[sni] = tenant
metrics.mt_tenants_total.inc()
metrics.mt_tenant_successful_actions.inc(
1.0, tenant.get_instance_name(), 'add'
)
logger.info("Added Tenant %s", sni)
self._tenants_serial[sni] = serial

Expand All @@ -338,9 +336,7 @@ async def _add_tenant():
async with self._tenants_lock[sni]:
if serial > self._tenants_serial.get(sni, 0):
self._tenants_conf.pop(sni, None)
metrics.mt_tenant_action_errors.inc(
1.0, conf["instance-name"], 'add'
)
metrics.mt_tenant_add_errors.inc(1.0, conf["instance-name"])

async def _remove_tenant(self, serial: int, sni: str):
tenant = None
Expand All @@ -349,19 +345,19 @@ async def _remove_tenant(self, serial: int, sni: str):
if serial > self._tenants_serial.get(sni, 0):
if sni in self._tenants:
tenant = self._tenants.pop(sni)
metrics.mt_tenant_remove_total.inc(
1.0, tenant.get_instance_name()
)
current_tenant.set(tenant.get_instance_name())
await self._destroy_tenant(tenant)
metrics.mt_tenants_total.dec()
metrics.mt_tenant_successful_actions.inc(
1.0, tenant.get_instance_name(), 'remove'
)
logger.info("Removed Tenant %s", sni)
self._tenants_serial[sni] = serial
except Exception:
logger.critical("Failed to remove Tenant %s", sni, exc_info=True)
if tenant is not None:
metrics.mt_tenant_action_errors.inc(
1.0, tenant.get_instance_name(), 'remove'
metrics.mt_tenant_remove_errors.inc(
1.0, tenant.get_instance_name(),
)

async def _reload_tenant(self, serial: int, sni: str, conf: TenantConfig):
Expand All @@ -370,6 +366,9 @@ async def _reload_tenant(self, serial: int, sni: str, conf: TenantConfig):
async with self._tenants_lock[sni]:
if serial > self._tenants_serial.get(sni, 0):
if tenant := self._tenants.get(sni):
metrics.mt_tenant_reload_total.inc(
1.0, tenant.get_instance_name()
)
current_tenant.set(tenant.get_instance_name())

orig = self._last_tenants_conf.get(sni, {})
Expand Down Expand Up @@ -402,9 +401,6 @@ async def _reload_tenant(self, serial: int, sni: str, conf: TenantConfig):
return

tenant.reload()
metrics.mt_tenant_successful_actions.inc(
1.0, tenant.get_instance_name(), 'reload'
)
logger.info("Reloaded Tenant %s", sni)

# GOTCHA: reloading tenant doesn't increase the tenant
Expand All @@ -413,8 +409,8 @@ async def _reload_tenant(self, serial: int, sni: str, conf: TenantConfig):
except Exception:
logger.critical("Failed to reload Tenant %s", sni, exc_info=True)
if tenant is not None:
metrics.mt_tenant_action_errors.inc(
1.0, tenant.get_instance_name(), 'reload'
metrics.mt_tenant_reload_errors.inc(
1.0, tenant.get_instance_name()
)

def get_debug_info(self):
Expand Down
24 changes: 12 additions & 12 deletions tests/test_server_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1369,13 +1369,13 @@ async def _test_server_ops_multi_tenant_3(self, mtargs: MultiTenantArgs):
'\nedgedb_server_mt_config_reload_errors_total 0.0\n', data
)
self.assertIn(
'\nedgedb_server_mt_tenant_successful_actions_total'
'{tenant="localtest1",action="add"} 1.0\n',
'\nedgedb_server_mt_tenant_add_total'
'{tenant="localtest1"} 1.0\n',
data,
)
self.assertNotIn(
'\nedgedb_server_mt_tenant_successful_actions_total'
'{tenant="localtest1",action="remove"} 1.0\n',
'\nedgedb_server_mt_tenant_remove_total'
'{tenant="localtest1"} 1.0\n',
data,
)

Expand All @@ -1400,13 +1400,13 @@ async def _test_server_ops_multi_tenant_3(self, mtargs: MultiTenantArgs):
data,
)
self.assertIn(
'\nedgedb_server_mt_tenant_successful_actions_total'
'{tenant="localtest1",action="add"} 1.0\n',
'\nedgedb_server_mt_tenant_add_total'
'{tenant="localtest1"} 1.0\n',
data,
)
self.assertIn(
'\nedgedb_server_mt_tenant_successful_actions_total'
'{tenant="localtest1",action="remove"} 1.0\n',
'\nedgedb_server_mt_tenant_remove_total'
'{tenant="localtest1"} 1.0\n',
data,
)

Expand All @@ -1431,13 +1431,13 @@ async def _test_server_ops_multi_tenant_3(self, mtargs: MultiTenantArgs):
data,
)
self.assertIn(
'\nedgedb_server_mt_tenant_successful_actions_total'
'{tenant="localtest1",action="add"} 2.0\n',
'\nedgedb_server_mt_tenant_add_total'
'{tenant="localtest1"} 2.0\n',
data,
)
self.assertIn(
'\nedgedb_server_mt_tenant_successful_actions_total'
'{tenant="localtest1",action="remove"} 1.0\n',
'\nedgedb_server_mt_tenant_remove_total'
'{tenant="localtest1"} 1.0\n',
data,
)

Expand Down

0 comments on commit 6c25862

Please sign in to comment.