DAOS-16557 test: Add debug to NvmeEnospace ftest (#15559)
Add aggregation debugging information on the state of the pool to allow debugging when an ENOSPACE error happens unexpectedly.

Signed-off-by: Cedric Koch-Hofer <[email protected]>
knard38 authored Feb 11, 2025
1 parent 1e76f62 commit 32cf3f9
Showing 1 changed file with 216 additions and 49 deletions.
265 changes: 216 additions & 49 deletions src/tests/ftest/nvme/enospace.py
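
Note for reviewers: the new get_pool_space_metrics() and get_pool_aggr_metrics() helpers in the diff below both walk the payload returned by self.telemetry.get_metrics(). A rough sketch of the nesting they appear to assume is given here; the hostname, pool UUID, description text, and values are invented purely for illustration and are not taken from a real run.

# Hypothetical telemetry payload shape assumed by the new helpers (illustrative only).
sample_metrics = {
    "wolf-1": {                                       # hostname
        "engine_pool_vos_space_nvme_used": {          # metric name
            "description": "NVMe space used by the pool (illustrative text)",
            "metrics": [
                {
                    # labels identify the pool, engine rank and target
                    "labels": {
                        "pool": "00000000-1111-2222-3333-444444444444",
                        "rank": "0",
                        "target": "1",
                    },
                    "value": 536870912,
                },
            ],
        },
    },
}

# get_pool_space_metrics() regroups this into metric -> host -> rank -> target:
#   {"engine_pool_vos_space_nvme_used": {
#        "description": "...",
#        "hosts": {"wolf-1": {"0": {"1": 536870912}}}}}
# while get_pool_aggr_metrics() keeps descriptions and per-target values in two
# separate sub-dictionaries ("metric_descriptions" and "metric_values").
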
@@ -1,5 +1,6 @@
'''
(C) Copyright 2020-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP
SPDX-License-Identifier: BSD-2-Clause-Patent
'''
@@ -33,7 +34,42 @@ def __init__(self, *args, **kwargs):
"""Initialize a NvmeEnospace object."""
super().__init__(*args, **kwargs)

self.metric_names = ['engine_pool_vos_space_scm_used', 'engine_pool_vos_space_nvme_used']
self.space_metric_names = [
'engine_pool_vos_space_scm_used',
'engine_pool_vos_space_nvme_used'
]
self.aggr_metric_names = [
# -- Merged records --
"engine_pool_vos_aggregation_merged_size",
"engine_pool_vos_aggregation_merged_recs",
# -- Deleted records --
"engine_pool_vos_aggregation_deleted_ev",
"engine_pool_vos_aggregation_deleted_sv",
# -- Errors --
"engine_pool_vos_aggregation_fail_count",
"engine_pool_vos_aggregation_csum_errors",
"engine_pool_vos_aggregation_uncommitted",
"engine_pool_vos_aggregation_agg_blocked",
"engine_pool_vos_aggregation_discard_blocked",
# -- Details stat counter --
"engine_pool_vos_aggregation_obj_deleted",
"engine_pool_vos_aggregation_obj_scanned",
"engine_pool_vos_aggregation_obj_skipped",
"engine_pool_vos_aggregation_akey_deleted",
"engine_pool_vos_aggregation_akey_scanned",
"engine_pool_vos_aggregation_akey_skipped",
"engine_pool_vos_aggregation_dkey_deleted",
"engine_pool_vos_aggregation_dkey_scanned",
"engine_pool_vos_aggregation_dkey_skipped",
# -- Duration --
"engine_pool_vos_aggregation_epr_duration",
"engine_pool_vos_aggregation_epr_duration_max",
"engine_pool_vos_aggregation_epr_duration_mean",
"engine_pool_vos_aggregation_epr_duration_min",
"engine_pool_vos_aggregation_epr_duration_stddev"
]
self.metric_names = self.space_metric_names + self.aggr_metric_names

self.media_names = ['SCM', 'NVMe']
self.expected_errors = [self.DER_NOSPACE, self.DER_TIMEDOUT]

@@ -55,26 +91,31 @@ def setUp(self):
self.daos_cmd = DaosCommand(self.bin)
self.create_pool_max_size()

def get_pool_space_metrics(self, pool_uuid):
def get_pool_space_metrics(self, pool, metrics):
"""Return the metrics on space usage of a given pool.
Args:
pool_uuid (str): Unique id of a pool.
pool (TestPool): target TestPool.
metrics (dict): telemetry metrics.
Returns:
dict: metrics on space usage.
"""
metrics = {}
for hostname, data in self.telemetry.get_metrics(",".join(self.metric_names)).items():
pool_uuid = pool.uuid
space_metrics = {}
for hostname, data in metrics.items():
for metric_name, entry in data.items():
if metric_name not in metrics:
metrics[metric_name] = {
if metric_name not in self.space_metric_names:
continue

if metric_name not in space_metrics:
space_metrics[metric_name] = {
"description": entry['description'],
"hosts": {}
}

hosts = metrics[metric_name]["hosts"]
hosts = space_metrics[metric_name]["hosts"]
for metric in entry['metrics']:
if metric['labels']['pool'].casefold() != pool_uuid.casefold():
continue
@@ -89,11 +130,60 @@ def get_pool_space_metrics(self, pool_uuid):
target = metric['labels']['target']
hosts[hostname][rank][target] = metric['value']

return metrics
return space_metrics

def get_pool_aggr_metrics(self, pool, metrics):
"""Return the metrics on aggregation counters and gauges.
Args:
pool (TestPool): target TestPool.
metrics (dict): telemetry metrics.
Returns:
dict: metrics on aggregation.
"""
pool_uuid = pool.uuid
aggr_metrics = {
"metric_descriptions": {},
"metric_values": {}
}
for hostname, data in metrics.items():
if hostname not in aggr_metrics["metric_values"]:
aggr_metrics["metric_values"][hostname] = {}
hosts = aggr_metrics["metric_values"][hostname]

for metric_name, entry in data.items():
if metric_name not in self.aggr_metric_names:
continue

if metric_name not in aggr_metrics["metric_descriptions"]:
aggr_metrics["metric_descriptions"][metric_name] = entry["description"]

for metric in entry['metrics']:
if metric['labels']['pool'].casefold() != pool_uuid.casefold():
continue

rank = metric['labels']['rank']
if rank not in hosts:
hosts[rank] = {}
ranks = hosts[rank]

target = metric['labels']['target']
if target not in ranks:
ranks[target] = {}
targets = ranks[target]

targets[metric_name] = metric['value']

return aggr_metrics

def get_pool_usage(self, pool_space):
"""Get the pool storage used % for SCM and NVMe.
Args:
pool_space (object): space usage information of a pool.
Returns:
list: a list of SCM/NVMe pool space usage in %(float)
@@ -106,14 +196,55 @@ def get_pool_usage(self, pool_space):

return pool_usage

def display_pool_stats(self, pool_space, pool_space_metrics):
"""Display statistics on pool usage.
def display_table(self, title, table, align_idx):
"""Pretty print table content.
Args:
title (str): Title of the table.
table (list): Table to print on stdout.
align_idx (int): Last column to left align.
"""
cols_size = [
max(i) for i in [[len(row[j]) for row in table] for j in range(len(table[0]))]]
line_size = sum(cols_size) + 3 * (len(cols_size) - 1)

self.log.debug("")
line = f"{' ' + title + ' ':-^{line_size}}"
self.log.debug(line)

line = ""
for idx, elt in enumerate(table[0]):
line += f"{elt:^{cols_size[idx]}}"
if idx + 1 != len(table[0]):
line += " | "
self.log.debug(line)

line = ""
for idx, size in enumerate(cols_size):
line += '-' * size
if idx + 1 != len(cols_size):
line += "-+-"
self.log.debug(line)

for row in table[1:]:
line = ""
for idx, elt in enumerate(row):
align_op = "<"
if idx > align_idx:
align_op = ">"
line += f"{elt:{align_op}{cols_size[idx]}}"
if idx + 1 != len(row):
line += " | "
self.log.debug(line)

def display_pool_space(self, pool_space, pool_space_metrics):
"""Display space usage statistics of a given pool.
Args:
pool_space (object): space usage information of a pool.
pool_space_metrics (dict): dict of metrics on space usage of a pool.
"""

self.log.debug("")
title = f"{' Pool Space Usage ':-^80}"
self.log.debug(title)

@@ -135,34 +266,65 @@ def display_pool_stats(self, pool_space, pool_space_metrics):

for metric in pool_space_metrics.values():
table = [["Hostname", "Rank", "Target", "Size"]]
cols_size = []
for cell in table[0]:
cols_size.append(len(cell))
for hostname, ranks in metric['hosts'].items():
for rank, targets in ranks.items():
for target, size in targets.items():
row = [hostname, rank, target, get_display_size(size)]
table.append(row)
for idx, elt in enumerate(cols_size):
cols_size[idx] = max(elt, len(row[idx]))
hostname = ""
rank = ""

for idx, elt in enumerate(table[0]):
table[0][idx] = f"{elt:^{cols_size[idx]}}"
row = ' | '.join(table[0])
title = f"{' ' + metric['description'] + ' ':-^{len(row)}}"
self.log.debug("")
self.log.debug(title)
self.log.debug(row)
self.log.debug("-" * len(row))
for row in table[1:]:
for idx, elt in enumerate(row):
align_op = "<"
if idx + 1 == len(row):
align_op = ">"
row[idx] = f"{elt:{align_op}{cols_size[idx]}}"
self.log.debug(" | ".join(row))
self.display_table(metric['description'], table, 2)

def display_pool_aggregation(self, metrics):
"""Display record aggregation statistics of a given pool.
Args:
metrics (dict): dict of metrics on pool aggregation.
"""
table = [["Hostname", "Rank", "Target"]]
for it in self.aggr_metric_names:
table[0].append(metrics["metric_descriptions"][it])

for hostname in sorted(metrics["metric_values"]):
row = [hostname]

for rank in sorted(metrics["metric_values"][hostname]):
if not row:
row = [""]
row.append(rank)

for target in sorted(metrics["metric_values"][hostname][rank]):
if not row:
row = ["", ""]
row.append(target)

idx = 3
for metric_name in self.aggr_metric_names:
value = metrics["metric_values"][hostname][rank][target][metric_name]
if metric_name == "engine_pool_vos_aggregation_merged_size":
row.append(get_display_size(value))
else:
row.append(str(value))
idx += 1

table.append(row)
row = None

self.display_table('Pool Aggregation stats', table, 2)

def display_stats(self):
"""Display usage statistics of the tested pool."""
self.pool.get_info()
metrics = self.telemetry.get_metrics(",".join(self.metric_names))

pool_space = self.pool.info.pi_space
pool_space_metrics = self.get_pool_space_metrics(self.pool, metrics)
self.display_pool_space(pool_space, pool_space_metrics)

pool_aggr_metrics = self.get_pool_aggr_metrics(self.pool, metrics)
self.display_pool_aggregation(pool_aggr_metrics)
self.log.debug("")

def verify_enospace_log(self, log_file):
"""Function checking logs consistency.
@@ -207,10 +369,14 @@ def err_to_str(err_no):
"Number of errors %s (%s) is > 0: got=%d",
err_to_str(error), error, errors_count[error])

def delete_all_containers(self):
"""Delete all the containers."""
def delete_all_containers(self, pool):
"""Delete all the containers of a given pool.
Args:
pool (TestPool): target TestPool.
"""
# List all the containers
kwargs = {"pool": self.pool.uuid}
kwargs = {"pool": pool.uuid}
data = self.daos_cmd.container_list(**kwargs)
containers = [uuid_label["uuid"] for uuid_label in data["response"]]

@@ -291,17 +457,22 @@ def run_enospace_foreground(self, log_file):
log_file (str): name prefix of the log files to check.
"""
self.log.info('----Starting main IOR load----')
self.display_stats()

# Fill 75% of current SCM free space. Aggregation is Enabled so NVMe space will
# start to fill up.
self.log.info('--Filling 75% of the current SCM free space--')
self.start_ior_load(storage='SCM', operation="Auto_Write", percent=75)
self.log.info(self.pool.pool_percentage_used())
try:
self.start_ior_load(storage='SCM', operation="Auto_Write", percent=75)
finally:
self.display_stats()

# Fill 50% of current SCM free space. Aggregation is Enabled so NVMe space will
# continue to fill up.
self.start_ior_load(storage='SCM', operation="Auto_Write", percent=50)
self.log.info(self.pool.pool_percentage_used())
try:
self.start_ior_load(storage='SCM', operation="Auto_Write", percent=50)
finally:
self.display_stats()

# Fill 60% of current SCM free space. This time, NVMe will be Full so data will
# not be moved to NVMe and continue to fill up SCM. SCM will be full and this
Expand All @@ -314,18 +485,14 @@ def run_enospace_foreground(self, log_file):
self.log.info('Test is expected to fail because of DER_NOSPACE')
else:
self.fail('This test is supposed to FAIL because of DER_NOSPACE but it Passed')

# Display the pool statistics
self.pool.get_info()
pool_space = self.pool.info.pi_space
pool_space_metrics = self.get_pool_space_metrics(self.pool.uuid)
self.display_pool_stats(pool_space, pool_space_metrics)
finally:
self.display_stats()

# verify the DER_NO_SPACE error count is expected and no other Error in client log
self.verify_enospace_log(log_file)

# Check both NVMe and SCM are full.
pool_usage = self.get_pool_usage(pool_space)
pool_usage = self.get_pool_usage(self.pool.info.pi_space)
for idx, elt in enumerate(self.media_names):
if pool_usage[idx] >= self.pool_usage_min[idx]:
continue
@@ -413,7 +580,7 @@ def test_enospace_lazy_with_fg(self):
log_file = f"-loop_{_loop}".join(os.path.splitext(self.client_log))
self.run_enospace_foreground(log_file)
# Delete all the containers
self.delete_all_containers()
self.delete_all_containers(self.pool)
# Delete container will take some time to release the space
time.sleep(60)

@@ -475,7 +642,7 @@ def test_enospace_time_with_fg(self):
log_file = f"-loop_{_loop}".join(os.path.splitext(self.client_log))
self.run_enospace_with_bg_job(log_file)
# Delete all the containers
self.delete_all_containers()
self.delete_all_containers(self.pool)
# Delete container will take some time to release the space
time.sleep(60)

@@ -571,7 +738,7 @@ def test_enospace_no_aggregation(self):
self.verify_enospace_log(log_file)

# Delete all the containers
self.delete_all_containers()
self.delete_all_containers(self.pool)

# Wait for the SCM space to be released. (Usage goes below 60%)
scm_released = False
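
To make the formatting logic in the new display_table() helper easier to follow, here is a minimal standalone sketch of the same column-sizing and alignment rules; it prints to stdout instead of going through self.log.debug, and the table rows are made-up sample data, not output from an actual test run.

def display_table(title, table, align_idx):
    """Standalone sketch of the helper: columns up to align_idx are left aligned."""
    # Each column is as wide as its widest cell.
    cols_size = [max(len(row[j]) for row in table) for j in range(len(table[0]))]
    line_size = sum(cols_size) + 3 * (len(cols_size) - 1)
    print(f"{' ' + title + ' ':-^{line_size}}")                       # centered title banner
    print(" | ".join(f"{elt:^{cols_size[i]}}" for i, elt in enumerate(table[0])))
    print("-+-".join("-" * size for size in cols_size))               # header separator
    for row in table[1:]:
        cells = [f"{elt:{'<' if i <= align_idx else '>'}{cols_size[i]}}"
                 for i, elt in enumerate(row)]
        print(" | ".join(cells))

# Made-up rows mirroring the "Hostname | Rank | Target | Size" layout used above.
display_table(
    "Pool Space Usage",
    [["Hostname", "Rank", "Target", "Size"],
     ["wolf-1", "0", "1", "512.00 MiB"],
     ["", "", "2", "498.25 MiB"]],
    align_idx=2)
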
