From 1077e94dfc84548b287929cc8f28e7686d28884c Mon Sep 17 00:00:00 2001 From: "chentianyi.cty" Date: Fri, 11 Oct 2024 15:44:25 +0800 Subject: [PATCH 1/7] done hang check basic impl and test --- .../check_training_hang_operator.py | 104 ++++++- dlrover/python/master/diagnosis/diagnosis.py | 54 +++- dlrover/python/master/dist_master.py | 2 +- .../python/master/node/dist_job_manager.py | 12 + dlrover/python/master/node/job_manager.py | 2 +- .../data/xpu_timer/hang/xpu_timer_metric_all | 193 +++++++++++++ .../data/xpu_timer/hang/xpu_timer_metric_some | 193 +++++++++++++ .../normal/xpu_timer_metric_0} | 0 .../data/xpu_timer/xpu_timer_metric_single | 193 +++++++++++++ dlrover/python/tests/test_diagnosis_agent.py | 2 +- dlrover/python/tests/test_inference_chain.py | 256 +++++++++++++++++- dlrover/python/tests/test_job_manager.py | 13 + 12 files changed, 1000 insertions(+), 24 deletions(-) create mode 100644 dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all create mode 100644 dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some rename dlrover/python/tests/data/{xpu_timer_metrics => xpu_timer/normal/xpu_timer_metric_0} (100%) create mode 100644 dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py index 1bcfd0c55..55a9024b5 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py @@ -11,8 +11,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List +import re +import sys +from typing import Dict, List, Tuple +from dlrover.python.common.log import default_logger as logger from dlrover.python.diagnosis.common.constants import DiagnosisDataType from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData from dlrover.python.diagnosis.common.inference_chain import ( @@ -46,7 +49,14 @@ def is_compatible(self, inference: Inference) -> bool: return False def infer(self, inferences: List[Inference]) -> List[Inference]: - if not self.data_manager: + if ( + not self.data_manager + or not self.data_manager.with_runtime_context() + ): + logger.info( + "Skip training-hang inference for there is " + "no diagnosis data reference." + ) return [ Inference( name=InferenceName.TRAINING, @@ -60,6 +70,7 @@ def infer(self, inferences: List[Inference]) -> List[Inference]: ) if diagnosis_data and self.is_hang(diagnosis_data): + logger.warning("Training might hanged.") return [ Inference( name=InferenceName.TRAINING, @@ -77,17 +88,100 @@ def infer(self, inferences: List[Inference]) -> List[Inference]: ] def is_hang(self, diagnosis_data: List[DiagnosisData]): - hang_metric = [] + logger.info( + "Hang detection start using diagnosis data, " + f"data number: {len(diagnosis_data)}, " + f"data size: {sys.getsizeof(diagnosis_data)}." + ) + worker_hang_metric: Dict[int, List[Tuple[int, bool]]] = {} if not diagnosis_data: return False for data in diagnosis_data: + # filter hang metric each_metric = [ line for line in data.data_content.splitlines() if line.startswith(HANG_METRIC_PREFIX) ] - hang_metric.append(each_metric) - # TODO: implement the judgement + # if all local rank is hanged, tag worker hang + rank_hang_size = 0 + is_worker_hang = False + for each_rank_metric in each_metric: + match = re.search(r"(\d+)(?!.*\d)", each_rank_metric) + if match and match.group(0) == "1": + rank_hang_size += 1 + if rank_hang_size == len(each_metric): + is_worker_hang = True + + if data.node_rank not in worker_hang_metric: + worker_hang_metric[data.node_rank] = [] + worker_hang_metric[data.node_rank].append( + (data.timestamp, is_worker_hang) + ) + + # hang detection rules: + # 1. 100% worker got hang metric + # 2. last for 10+ minutes + hang_id, hang_last = self._find_hang_intersection(worker_hang_metric) + if hang_id != -1: + logger.info( + f"Got hang worker: {hang_id}, " f"time last: {hang_last}" + ) + return True + return False + + def _find_hang_intersection( + self, worker_hang_metric: Dict[int, List[Tuple[int, bool]]] + ) -> Tuple[int, int]: + """ + Require all workers hang from latest and find the hang intersection. + + Args: + worker_hang_metric (Dict[int, List[Tuple[int, bool]]]): Input + metric. + + Returns: + The hang intersection's id and time last in tuple format. + """ + + worker_hang_length_min = 0 + worker_hang_id = -1 + + # find the intersection from latest + for worker_id, tuple_list in worker_hang_metric.items(): + # sorted by timestamp + tuple_list.sort(key=lambda x: x[0]) + worker_hang_length = 0 + + for tuple_item in reversed(tuple_list): + if tuple_item[1]: + worker_hang_length += 1 + else: + break + + if worker_hang_length > 0: + if worker_hang_length_min == 0: + worker_hang_length_min = worker_hang_length + worker_hang_id = worker_id + elif worker_hang_length < worker_hang_length_min: + worker_hang_length_min = worker_hang_length + worker_hang_id = worker_id + else: + # there is normal worker + return -1, -1 + + # get the intersection's time last + if worker_hang_id != -1 and worker_hang_length_min != 0: + hang_worker_metric = worker_hang_metric[worker_hang_id] + time_last = ( + hang_worker_metric[len(hang_worker_metric) - 1][0] + - hang_worker_metric[ + len(hang_worker_metric) - worker_hang_length_min + ][0] + ) + return worker_hang_id, time_last + + return -1, -1 diff --git a/dlrover/python/master/diagnosis/diagnosis.py b/dlrover/python/master/diagnosis/diagnosis.py index 17dd073ea..5d4a8b87c 100644 --- a/dlrover/python/master/diagnosis/diagnosis.py +++ b/dlrover/python/master/diagnosis/diagnosis.py @@ -11,8 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import threading import time +from collections import deque from datetime import datetime, timedelta from typing import Dict, List @@ -41,9 +43,11 @@ def has_expired(timestamp: float, time_period: int) -> bool: class DiagnosisManager: - def __init__(self): + def __init__(self, job_manager=None): self._is_observing_started = False - self._data_manager: DiagnosisDataManager = DiagnosisDataManager(600) + self._data_manager: DiagnosisDataManager = DiagnosisDataManager( + job_manager, 600 + ) self._diagnostician: Diagnostician = Diagnostician(self._data_manager) def collect_diagnosis_data(self, data: DiagnosisData): @@ -94,6 +98,9 @@ def _diagnose_failures(self): if not self._is_observing_started: logger.info("Stop to diagnose failures for observing.") break + logger.info( + f"Diagnosis data size: {self._data_manager.get_data_size()}." + ) observed_problems = self._diagnostician.observe_training() for problem in observed_problems: @@ -107,28 +114,45 @@ def _diagnose_failures(self): class DiagnosisDataManager: - def __init__(self, expire_time_period): - self.diagnosis_data: Dict[str, List[DiagnosisData]] = {} + def __init__(self, job_manager=None, expire_time_period=600): + self._diagnosis_data: Dict[str, deque[DiagnosisData]] = {} self.expire_time_period = expire_time_period + self._job_manager = job_manager + self._lock = threading.Lock() + + @property + def job_manager(self): + return self._job_manager + + @property + def data(self): + return self._diagnosis_data + + def with_runtime_context(self) -> bool: + return self.job_manager is not None def store_data(self, data: DiagnosisData): data_type = data.data_type - if data_type not in self.diagnosis_data: - logger.debug(f"{data_type} is not found in the store") - self.diagnosis_data[data_type] = [] - self.diagnosis_data[data_type].append(data) - self._clean_diagnosis_data(data_type) + with self._lock: + if data_type not in self.data: + self.data[data_type] = deque(maxlen=100000) + self.data[data_type].append(data) + self._clean_diagnosis_data(data_type) def get_data(self, data_type: str) -> List[DiagnosisData]: - if data_type not in self.diagnosis_data: - return [] - return self.diagnosis_data[data_type] + with self._lock: + if data_type not in self.data: + return [] + return list(self.data[data_type]) + + def get_data_size(self): + return sys.getsizeof(self.data) def _clean_diagnosis_data(self, data_type: str): - if data_type not in self.diagnosis_data: + if data_type not in self.data: return - data = self.diagnosis_data[data_type] + data = self.data[data_type] n = 0 for d in data: if has_expired(d.timestamp, self.expire_time_period): @@ -136,7 +160,7 @@ def _clean_diagnosis_data(self, data_type: str): else: break - self.diagnosis_data[data_type] = data[n:] + self.data[data_type] = data[n:] class Diagnostician: diff --git a/dlrover/python/master/dist_master.py b/dlrover/python/master/dist_master.py index c68942e2c..cdff7d61f 100644 --- a/dlrover/python/master/dist_master.py +++ b/dlrover/python/master/dist_master.py @@ -143,7 +143,7 @@ def __init__( error_monitor ), } - self.diagnosis_manager = DiagnosisManager() + self.diagnosis_manager = DiagnosisManager(self.job_manager) self.job_metric_collector = self._create_metric_collector_if_needed( args ) diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py index 6cc00cea2..aaf019ca2 100644 --- a/dlrover/python/master/node/dist_job_manager.py +++ b/dlrover/python/master/node/dist_job_manager.py @@ -1135,6 +1135,18 @@ def collect_node_heart_beat(self, node_type, node_id, timestamp): def update_node_required_info_callback(self): self._worker_manager.update_node_required_info(self._nodes_required) + def get_node_required_info(self): + return self._nodes_required + + def get_total_node_num_by_type(self, node_type): + if not self._job_nodes: + return 0 + + return len(self._job_nodes[node_type]) + + def get_job_strategy(self): + return self._job_args.distribution_strategy + def create_job_manager(args: JobArgs, speed_monitor) -> DistributedJobManager: critical_worker_index = get_critical_worker_index(args) diff --git a/dlrover/python/master/node/job_manager.py b/dlrover/python/master/node/job_manager.py index 77916a1e7..281af6982 100644 --- a/dlrover/python/master/node/job_manager.py +++ b/dlrover/python/master/node/job_manager.py @@ -54,7 +54,7 @@ def __init__( self._error_monitor: ErrorMonitor = error_monitor self._job_nodes: Dict[str, Dict[int, Node]] = {} - self._nodes_required = (0, 0, 0) + self._nodes_required = (0, 0, 0) # (min-nodes, max-nodes, timeout) self._training_node_config = TrainingNodeConfig(external_config) diff --git a/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all new file mode 100644 index 000000000..e384fb2de --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some new file mode 100644 index 000000000..59142aa4c --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/data/xpu_timer_metrics b/dlrover/python/tests/data/xpu_timer/normal/xpu_timer_metric_0 similarity index 100% rename from dlrover/python/tests/data/xpu_timer_metrics rename to dlrover/python/tests/data/xpu_timer/normal/xpu_timer_metric_0 diff --git a/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single b/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single new file mode 100644 index 000000000..0e646c2ab --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/test_diagnosis_agent.py b/dlrover/python/tests/test_diagnosis_agent.py index 83cd5be83..d1cfe34bb 100644 --- a/dlrover/python/tests/test_diagnosis_agent.py +++ b/dlrover/python/tests/test_diagnosis_agent.py @@ -137,7 +137,7 @@ def test_xpu_timer_metric_collect(self): self.assertEqual(collector.collect_data(), "") - file = "data/xpu_timer_metrics" + file = "data/xpu_timer/xpu_timer_metric_single" file_path = os.path.join(os.path.dirname(__file__), file) with open(file_path, "r", encoding="utf-8") as file: test_metrics = file.read() diff --git a/dlrover/python/tests/test_inference_chain.py b/dlrover/python/tests/test_inference_chain.py index 5a5124997..998588736 100644 --- a/dlrover/python/tests/test_inference_chain.py +++ b/dlrover/python/tests/test_inference_chain.py @@ -13,8 +13,13 @@ import os import unittest +from typing import Dict, List, Tuple -from dlrover.python.diagnosis.common.constants import InferenceConfigKey +from dlrover.python.diagnosis.common.constants import ( + DiagnosisDataType, + InferenceConfigKey, +) +from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.diagnosis.common.inference_chain import ( Inference, InferenceAttribute, @@ -40,7 +45,256 @@ def setUp(self): def tearDown(self): pass + def test_check_training_hang_operator_find_intersection(self): + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [(1, True), (2, False), (3, True), (4, True), (5, True)], + 2: [(1, True), (2, True), (3, True), (4, True), (5, False)], + 3: [(1, False), (2, True), (3, True), (4, True), (5, True)], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual( + operator._find_hang_intersection(test_metric), (-1, -1) + ) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (7, True), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (7, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), + (6, True), + (7, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual(operator._find_hang_intersection(test_metric), (2, 1)) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (8, True), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (8, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), + (6, True), + (8, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual(operator._find_hang_intersection(test_metric), (2, 2)) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (8, False), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (8, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), + (6, True), + (8, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual( + operator._find_hang_intersection(test_metric), (-1, -1) + ) + + def test_check_training_hang_operator_is_hang(self): + operator = CheckTrainingHangOperator(None) + test_data = [] + + # prepare test data + normal_metric, some_abnormal_metric, all_abnormal_metric = "", "", "" + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/normal/xpu_timer_metric_0", + ) + with open(file_path, "r", encoding="utf-8") as file: + normal_metric = file.read() + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/hang/xpu_timer_metric_some", + ) + with open(file_path, "r", encoding="utf-8") as file: + some_abnormal_metric = file.read() + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/hang/xpu_timer_metric_all", + ) + with open(file_path, "r", encoding="utf-8") as file: + all_abnormal_metric = file.read() + + # test data: no worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + self.assertFalse(operator.is_hang(test_data)) + test_data.clear() + + # test data0: 1 of 2 worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + self.assertFalse(operator.is_hang(test_data)) + test_data.clear() + + # test data: 2 of 2 worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + self.assertTrue(operator.is_hang(test_data)) + test_data.clear() + def test_check_training_hang_operator(self): + # no data operator = CheckTrainingHangOperator(None) inf = Inference( name=InferenceName.TRAINING, diff --git a/dlrover/python/tests/test_job_manager.py b/dlrover/python/tests/test_job_manager.py index ac6840a2f..6abfbb2b1 100644 --- a/dlrover/python/tests/test_job_manager.py +++ b/dlrover/python/tests/test_job_manager.py @@ -703,6 +703,19 @@ def test_get_pending_timeout(self): # reset _dlrover_context.seconds_to_wait_pending_pod = 900 + def test_multi_getting(self): + params = MockK8sPSJobArgs() + params.initilize() + manager = create_job_manager(params, SpeedMonitor()) + self.assertEqual(manager.get_total_node_num_by_type(NodeType.PS), 0) + manager._init_nodes() + + self.assertEqual(manager.get_job_strategy(), DistributionStrategy.PS) + self.assertEqual(manager.get_total_node_num_by_type(NodeType.PS), 3) + self.assertEqual(manager.get_node_required_info(), (0, 0, 0)) + manager._nodes_required = (3, 5, 100) + self.assertEqual(manager.get_node_required_info(), (3, 5, 100)) + class LocalJobManagerTest(unittest.TestCase): def test_local_job_manager(self): From 7e3ed0a020edeadbf44db231b55bc86f143e162c Mon Sep 17 00:00:00 2001 From: "chentianyi.cty" Date: Mon, 14 Oct 2024 17:46:22 +0800 Subject: [PATCH 2/7] optimized --- .../check_training_hang_operator.py | 16 +++-- .../python/elastic_agent/torch/training.py | 1 + dlrover/python/master/diagnosis/diagnosis.py | 9 +-- dlrover/python/master/servicer.py | 4 +- dlrover/python/tests/test_diagnosis.py | 4 +- dlrover/python/tests/test_inference_chain.py | 70 ++----------------- dlrover/python/tests/test_servicer.py | 4 +- 7 files changed, 26 insertions(+), 82 deletions(-) diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py index 55a9024b5..6da976e7d 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py @@ -123,16 +123,20 @@ def is_hang(self, diagnosis_data: List[DiagnosisData]): # hang detection rules: # 1. 100% worker got hang metric - # 2. last for 10+ minutes + # 2. last for 5+ minutes hang_id, hang_last = self._find_hang_intersection(worker_hang_metric) - if hang_id != -1: - logger.info( - f"Got hang worker: {hang_id}, " f"time last: {hang_last}" - ) + hang_last_threshold = self._get_hang_time_last_threshold() + if hang_id != -1 and hang_last > hang_last_threshold: + logger.info(f"Got hang worker: {hang_id}, time last: {hang_last}, " + f"threshold: {hang_last_threshold}") return True return False + def _get_hang_time_last_threshold(self): + # set 5 minutes for now(second) + return 5 * 60 + def _find_hang_intersection( self, worker_hang_metric: Dict[int, List[Tuple[int, bool]]] ) -> Tuple[int, int]: @@ -141,7 +145,7 @@ def _find_hang_intersection( Args: worker_hang_metric (Dict[int, List[Tuple[int, bool]]]): Input - metric. + metric in format: node_id: [(timestamp, is_hang), ...] Returns: The hang intersection's id and time last in tuple format. diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py index 29eebd7d4..ad8c47de3 100644 --- a/dlrover/python/elastic_agent/torch/training.py +++ b/dlrover/python/elastic_agent/torch/training.py @@ -852,6 +852,7 @@ def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult: logger.warning(f"Unexpected exception when ending: {e}") finally: self._client.report_succeeded() + logger.info("Succeeded and exit.") return run_result elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED}: diff --git a/dlrover/python/master/diagnosis/diagnosis.py b/dlrover/python/master/diagnosis/diagnosis.py index 5d4a8b87c..4102f20bf 100644 --- a/dlrover/python/master/diagnosis/diagnosis.py +++ b/dlrover/python/master/diagnosis/diagnosis.py @@ -16,6 +16,7 @@ import time from collections import deque from datetime import datetime, timedelta +from itertools import islice from typing import Dict, List from dlrover.python.common.log import default_logger as logger @@ -152,15 +153,15 @@ def _clean_diagnosis_data(self, data_type: str): if data_type not in self.data: return - data = self.data[data_type] + each_data = self.data[data_type] n = 0 - for d in data: + for d in each_data: if has_expired(d.timestamp, self.expire_time_period): n = n + 1 else: break - - self.data[data_type] = data[n:] + if n > 0: + self.data[data_type] = deque(islice(each_data, n, len(each_data))) class Diagnostician: diff --git a/dlrover/python/master/servicer.py b/dlrover/python/master/servicer.py index e8250dc17..e980ea5a3 100644 --- a/dlrover/python/master/servicer.py +++ b/dlrover/python/master/servicer.py @@ -360,7 +360,7 @@ def report(self, request, _): elif isinstance(message, grpc.NodeCheckpointState): success = self._sync_checkpoint(node_type, node_id, message) elif isinstance(message, grpc.DiagnosisReportData): - success = self._report_worker_diagnosis_data(message) + success = self._report_node_diagnosis_data(message) elif isinstance(message, grpc.SucceededRequest): success = self._report_succeeded(node_id, node_type) @@ -618,7 +618,7 @@ def _sync_checkpoint( rdzv_manager = self._rdzv_managers[RendezvousName.ELASTIC_TRAINING] return rdzv_manager.sync_ckpt_nodes(node_id, message.step) - def _report_worker_diagnosis_data(self, message: grpc.DiagnosisReportData): + def _report_node_diagnosis_data(self, message: grpc.DiagnosisReportData): if self._diagnosis_manager: data_cls: Optional[DiagnosisData] = getattr( self._diagnosis_data_module, diff --git a/dlrover/python/tests/test_diagnosis.py b/dlrover/python/tests/test_diagnosis.py index 501dd1d08..64c1364e5 100644 --- a/dlrover/python/tests/test_diagnosis.py +++ b/dlrover/python/tests/test_diagnosis.py @@ -27,7 +27,7 @@ def tearDown(self): pass def test_data_manager(self): - mgr = DiagnosisDataManager(5) + mgr = DiagnosisDataManager(expire_time_period=3) log1 = TrainingLog(0) mgr.store_data(log1) time.sleep(1) @@ -37,7 +37,7 @@ def test_data_manager(self): logs = mgr.get_data(DiagnosisDataType.TRAINING_LOG) self.assertEqual(len(logs), 2) - time.sleep(6) + time.sleep(4) log3 = TrainingLog(0) mgr.store_data(log3) logs = mgr.get_data(DiagnosisDataType.TRAINING_LOG) diff --git a/dlrover/python/tests/test_inference_chain.py b/dlrover/python/tests/test_inference_chain.py index 0bdc4baa1..a44186078 100644 --- a/dlrover/python/tests/test_inference_chain.py +++ b/dlrover/python/tests/test_inference_chain.py @@ -14,22 +14,17 @@ import os import unittest from typing import Dict, List, Tuple +from unittest import mock from unittest.mock import patch -from diagnosis.datacollector.training_log_collector import TrainingLogCollector -from diagnosis.datacollector.xpu_timer_metric_collector import \ - XpuTimerMetricsCollector -from dlrover.python.diagnosis.common.constants import ( - DiagnosisDataType, - InferenceConfigKey, -) -from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.common import env_utils from dlrover.python.common.constants import NodeEnv, NodeType from dlrover.python.diagnosis.common.constants import ( + DiagnosisDataType, EnvConfigKey, InferenceConfigKey, ) +from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.diagnosis.common.inference_chain import ( Inference, InferenceAttribute, @@ -175,7 +170,7 @@ def test_check_training_hang_operator_find_intersection(self): def test_check_training_hang_operator_is_hang(self): operator = CheckTrainingHangOperator(None) - test_data = [] + operator._get_hang_time_last_threshold = mock.MagicMock(return_value=0) # prepare test data normal_metric, some_abnormal_metric, all_abnormal_metric = "", "", "" @@ -400,63 +395,6 @@ def test_inference_chain(self): ) self.assertTrue(is_same_inference(results[0], failure_inf)) - @patch( - "dlrover.python.diagnosis.datacollector.training_log_collector" - ".read_last_n_lines" - ) - def test_log_collect(self, mock_file_util): - mock_file_util.return_value = [ - "test0", - "DLRover agent started with:", - "test1", - ] - training_log_collector = TrainingLogCollector( - log_file="test", n_line=3 - ) - self.assertTrue(training_log_collector.is_enabled()) - result = training_log_collector.collect_data() - self.assertTrue("test0" not in result.logs) - self.assertTrue("test1" in result.logs) - - def test_xpu_timer_metric_collect(self): - collector = XpuTimerMetricsCollector() - self.assertFalse(collector.is_enabled()) - - env_utils.set_env(EnvConfigKey.XPU_TIMER_PORT, 18889) - collector = XpuTimerMetricsCollector() - self.assertTrue(collector.is_enabled()) - - self.assertEqual(collector.collect_data(), "") - - file = "data/xpu_timer/xpu_timer_metric_single" - file_path = os.path.join(os.path.dirname(__file__), file) - with open(file_path, "r", encoding="utf-8") as file: - test_metrics = file.read() - result = collector._preprocess_metrics(test_metrics) - self.assertTrue(result) - if "#" in result or "exposer" in result: - self.fail() - - env_utils.set_env(NodeEnv.NODE_ID, 1) - env_utils.set_env(NodeEnv.NODE_TYPE, NodeType.WORKER) - env_utils.set_env(NodeEnv.NODE_RANK, 1) - agent_xpu_metric = WorkerTrainingMetric( - data_type=DiagnosisDataType.XPU_TIMER_METRIC, - data_content=result, - node_id=env_utils.get_node_id(), - node_type=env_utils.get_node_type(), - node_rank=env_utils.get_node_rank(), - ) - self.assertEqual( - agent_xpu_metric.data_type, - DiagnosisDataType.XPU_TIMER_METRIC, - ) - self.assertEqual(agent_xpu_metric.data_content, result) - self.assertEqual(agent_xpu_metric.node_id, 1) - self.assertEqual(agent_xpu_metric.node_type, NodeType.WORKER) - self.assertEqual(agent_xpu_metric.node_rank, 1) - self.assertTrue(agent_xpu_metric.timestamp > 0) - @patch( "dlrover.python.diagnosis.datacollector.xpu_timer_metric_collector" ".XpuTimerMetricsCollector.collect_data" diff --git a/dlrover/python/tests/test_servicer.py b/dlrover/python/tests/test_servicer.py index b30b6e999..799383825 100644 --- a/dlrover/python/tests/test_servicer.py +++ b/dlrover/python/tests/test_servicer.py @@ -408,7 +408,7 @@ def test_sync_checkpoint(self): success = self.servicer._sync_checkpoint(NodeType.WORKER, 1, message) self.assertTrue(success) - def test_report_worker_diagnosis_data(self): + def test_report_node_diagnosis_data(self): test = WorkerTrainingMetric( data_content="test123", node_id=env_utils.get_node_id(), @@ -422,7 +422,7 @@ def test_report_worker_diagnosis_data(self): test.to_json(), test.node_rank, ) - self.assertTrue(self.servicer._report_worker_diagnosis_data(request)) + self.assertTrue(self.servicer._report_node_diagnosis_data(request)) def test_report_succeeded(self): self.assertTrue(self.servicer._report_succeeded(0, NodeType.WORKER)) From 5f36a45d4840bca8ada59af5d2b676fb7da8aec9 Mon Sep 17 00:00:00 2001 From: "chentianyi.cty" Date: Tue, 15 Oct 2024 10:56:40 +0800 Subject: [PATCH 3/7] optimized --- dlrover/python/common/global_context.py | 1 + .../check_training_hang_operator.py | 14 ++++++++++++-- dlrover/python/master/args.py | 7 +++++++ .../python/tests/test_diagnosis_data_collector.py | 2 +- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/dlrover/python/common/global_context.py b/dlrover/python/common/global_context.py index 5b679066a..fc8ae56d4 100644 --- a/dlrover/python/common/global_context.py +++ b/dlrover/python/common/global_context.py @@ -92,6 +92,7 @@ def __init__(self): self.is_tfv1_ps = False self.master_port = None self.relaunch_always = False + self.hang_detect_strategy = 0 def set_params_from_brain(self): self.train_speed_record_num = self.get_param_value_from_brain( diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py index 6da976e7d..c4f8d5246 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py @@ -15,6 +15,7 @@ import sys from typing import Dict, List, Tuple +from dlrover.python.common.global_context import Context from dlrover.python.common.log import default_logger as logger from dlrover.python.diagnosis.common.constants import DiagnosisDataType from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData @@ -27,6 +28,7 @@ ) HANG_METRIC_PREFIX = "XPU_TIMER_COMMON_HANG" +_dlrover_ctx = Context.singleton_instance() class CheckTrainingHangOperator(InferenceOperator): @@ -127,8 +129,16 @@ def is_hang(self, diagnosis_data: List[DiagnosisData]): hang_id, hang_last = self._find_hang_intersection(worker_hang_metric) hang_last_threshold = self._get_hang_time_last_threshold() if hang_id != -1 and hang_last > hang_last_threshold: - logger.info(f"Got hang worker: {hang_id}, time last: {hang_last}, " - f"threshold: {hang_last_threshold}") + logger.info( + f"Got hang worker: {hang_id}, time last: {hang_last}, " + f"threshold: {hang_last_threshold}" + ) + if _dlrover_ctx.hang_detect_strategy == 1: + # TODO + pass + elif _dlrover_ctx.hang_detect_strategy == 2: + # TODO + pass return True return False diff --git a/dlrover/python/master/args.py b/dlrover/python/master/args.py index 05bf6bd35..7bf4d2040 100644 --- a/dlrover/python/master/args.py +++ b/dlrover/python/master/args.py @@ -86,6 +86,13 @@ def _build_master_args_parser(): type=pos_int, help="The number of nodes", ) + parser.add_argument( + "--hang_detection", + default=0, + type=pos_int, + help="The strategy of 'hang detection', " + "0: log only; 1: notify; 2: with fault tolerance", + ) add_params(parser) return parser diff --git a/dlrover/python/tests/test_diagnosis_data_collector.py b/dlrover/python/tests/test_diagnosis_data_collector.py index 7a6eb10e0..a69ea2daa 100644 --- a/dlrover/python/tests/test_diagnosis_data_collector.py +++ b/dlrover/python/tests/test_diagnosis_data_collector.py @@ -71,7 +71,7 @@ def test_xpu_timer_metric_collector(self): self.assertEqual(collector.collect_data(), "") - file = "data/xpu_timer_metrics" + file = "data/xpu_timer/xpu_timer_metric_single" file_path = os.path.join(os.path.dirname(__file__), file) with open(file_path, "r", encoding="utf-8") as file: test_metrics = file.read() From 3e0d7c3086fce6af7506be158e85707462530487 Mon Sep 17 00:00:00 2001 From: "chentianyi.cty" Date: Wed, 16 Oct 2024 14:50:54 +0800 Subject: [PATCH 4/7] add strategy params --- dlrover/python/common/global_context.py | 5 ++++- .../inferenceoperator/check_training_hang_operator.py | 4 ++-- dlrover/python/master/args.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dlrover/python/common/global_context.py b/dlrover/python/common/global_context.py index fc8ae56d4..c12453ed0 100644 --- a/dlrover/python/common/global_context.py +++ b/dlrover/python/common/global_context.py @@ -51,6 +51,7 @@ class DefaultValues(object): SEC_TO_CHANGE_PS = 3600 # 1h SEC_TO_WAIT_FAILED_PS = 600 # 10min HANG_CPU_USAGE_RATE = 0.05 + HANG_DETECTION = 1 class Context(Singleton): @@ -92,7 +93,9 @@ def __init__(self): self.is_tfv1_ps = False self.master_port = None self.relaunch_always = False - self.hang_detect_strategy = 0 + # The strategy of 'hang detection': + # 0: log only; 1: notify; 2: with fault tolerance + self.hang_detection = DefaultValues.HANG_DETECTION def set_params_from_brain(self): self.train_speed_record_num = self.get_param_value_from_brain( diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py index c4f8d5246..8a9175743 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py @@ -133,10 +133,10 @@ def is_hang(self, diagnosis_data: List[DiagnosisData]): f"Got hang worker: {hang_id}, time last: {hang_last}, " f"threshold: {hang_last_threshold}" ) - if _dlrover_ctx.hang_detect_strategy == 1: + if _dlrover_ctx.hang_detection == 1: # TODO pass - elif _dlrover_ctx.hang_detect_strategy == 2: + elif _dlrover_ctx.hang_detection == 2: # TODO pass return True diff --git a/dlrover/python/master/args.py b/dlrover/python/master/args.py index 7bf4d2040..177c91f71 100644 --- a/dlrover/python/master/args.py +++ b/dlrover/python/master/args.py @@ -88,7 +88,7 @@ def _build_master_args_parser(): ) parser.add_argument( "--hang_detection", - default=0, + default=1, type=pos_int, help="The strategy of 'hang detection', " "0: log only; 1: notify; 2: with fault tolerance", From 24059215ff45f6f1df0abd41d5eb2753458fb412 Mon Sep 17 00:00:00 2001 From: "chentianyi.cty" Date: Thu, 17 Oct 2024 19:17:43 +0800 Subject: [PATCH 5/7] deve --- dlrover/python/diagnosis/common/constants.py | 4 +- .../diagnosis/common/diagnose_action.py | 22 ---- .../diagnosis/common/diagnosis_action.py | 106 ++++++++++++++++++ .../diagnosis/inferencechain/coordinator.py | 6 +- .../diagnosis/diagnosis_agent.py | 10 +- .../python/elastic_agent/torch/training.py | 10 +- dlrover/python/master/diagnosis/diagnosis.py | 12 +- .../python/master/node/dist_job_manager.py | 35 ++++++ dlrover/python/master/node/job_manager.py | 17 ++- dlrover/python/tests/test_diagnosis.py | 38 ++++++- dlrover/python/tests/test_diagnosis_agent.py | 10 +- dlrover/python/tests/test_job_manager.py | 3 + 12 files changed, 224 insertions(+), 49 deletions(-) delete mode 100644 dlrover/python/diagnosis/common/diagnose_action.py create mode 100644 dlrover/python/diagnosis/common/diagnosis_action.py diff --git a/dlrover/python/diagnosis/common/constants.py b/dlrover/python/diagnosis/common/constants.py index 601bb730d..85bec03e4 100644 --- a/dlrover/python/diagnosis/common/constants.py +++ b/dlrover/python/diagnosis/common/constants.py @@ -32,7 +32,9 @@ class DiagnosisDataType(object): XPU_TIMER_METRIC = "XPU_TIMER_METRIC" -class DiagnosisAction(object): +class DiagnosisActionType(object): NO_ACTION = "no_action" RESTART_WORKER = "restart_worker" RELAUNCH_WORKER = "relaunch_worker" + EVENT = "event" + MASTER_RELAUNCH_WORKER = "master_relaunch_worker" diff --git a/dlrover/python/diagnosis/common/diagnose_action.py b/dlrover/python/diagnosis/common/diagnose_action.py deleted file mode 100644 index ea96de464..000000000 --- a/dlrover/python/diagnosis/common/diagnose_action.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 The DLRover Authors. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - - -class DiagnoseAction: - def __init__(self): - self._actions: List[str] = [] - - def add_action(self, action: str): - self._actions.append(action) diff --git a/dlrover/python/diagnosis/common/diagnosis_action.py b/dlrover/python/diagnosis/common/diagnosis_action.py new file mode 100644 index 000000000..98bb723be --- /dev/null +++ b/dlrover/python/diagnosis/common/diagnosis_action.py @@ -0,0 +1,106 @@ +# Copyright 2024 The DLRover Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List + +from dlrover.python.diagnosis.common.constants import DiagnosisActionType + + +class DiagnosisAction: + """ + The action describes the expect operation after diagnostician. + The action can be consumed by the master's job manager or directly used + in training node. + """ + + def __init__( + self, + diagnosis_type: DiagnosisActionType = DiagnosisActionType.NO_ACTION, + action_config={}, + ): + """ + Args: + diagnosis_type (DiagnosisActionType): The action type. + """ + + self._diagnosis_type = diagnosis_type + self._action_config = action_config + + @property + def diagnosis_type(self): + return self._diagnosis_type + + @property + def action_config(self): + return self._action_config + + +class EventAction(DiagnosisAction): + """Output the specified event.""" + + def __init__( + self, + event_type: str = "", + instance: str = "", + action: str = "", + msg: str = "", + labels: Dict[str, str] = {}, + ): + super().__init__(DiagnosisActionType.EVENT) + self._event_type = event_type + self._instance = instance + self._action = action + self._msg = msg + self._labels = labels + + @property + def event_type(self): + return self._event_type + + @property + def instance(self): + return self._instance + + @property + def action(self): + return self._action + + @property + def msg(self): + return self._msg + + @property + def labels(self): + return self._labels + + +class NodeRelaunchAction(DiagnosisAction): + """Relaunch the specified node.""" + + def __init__(self, node_id, node_status, reason): + super().__init__(DiagnosisActionType.MASTER_RELAUNCH_WORKER) + self._node_id = node_id + self._node_status = node_status + self._reason = reason + + @property + def node_id(self): + return self._node_id + + @property + def node_status(self): + return self._node_status + + @property + def reason(self): + return self._reason diff --git a/dlrover/python/diagnosis/inferencechain/coordinator.py b/dlrover/python/diagnosis/inferencechain/coordinator.py index 07cb70326..f092fa95a 100644 --- a/dlrover/python/diagnosis/inferencechain/coordinator.py +++ b/dlrover/python/diagnosis/inferencechain/coordinator.py @@ -13,9 +13,9 @@ from typing import List -from dlrover.python.diagnosis.common.diagnose_action import DiagnoseAction +from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction from dlrover.python.diagnosis.common.inference_chain import Inference -def coordinate_inferences(observations: List[Inference]) -> DiagnoseAction: - return DiagnoseAction() +def coordinate_inferences(observations: List[Inference]) -> DiagnosisAction: + return DiagnosisAction() diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py index 7b1619829..85054a4f3 100644 --- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py +++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py @@ -25,11 +25,11 @@ from dlrover.python.common.singleton import Singleton from dlrover.python.common.worker import WorkerContext from dlrover.python.diagnosis.common.constants import ( - DiagnosisAction, + DiagnosisActionType, DiagnosisConstant, InferenceConfigKey, ) -from dlrover.python.diagnosis.common.diagnose_action import DiagnoseAction +from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.diagnosis.common.inference_chain import ( Inference, @@ -105,7 +105,7 @@ def _observe(self) -> List[Inference]: def _diagnose_observations( self, observations: List[Inference] - ) -> DiagnoseAction: + ) -> DiagnosisAction: conclusions: List[Inference] = [] for ob in observations: ic = InferenceChain([ob], self._diagnosis_operators) @@ -165,7 +165,7 @@ def diagnose_training_failure(self, worker_context: WorkerContext) -> str: f"{worker_context.worker_spec.max_restarts} " f"attempts left; will restart worker group." ) - return DiagnosisAction.RESTART_WORKER + return DiagnosisActionType.RESTART_WORKER else: logger.info( f"[{worker_context.worker_spec.role}] Worker group " @@ -174,7 +174,7 @@ def diagnose_training_failure(self, worker_context: WorkerContext) -> str: f"no attempts({worker_context.worker_spec.max_restarts}) " "left; will relaunch." ) - return DiagnosisAction.RELAUNCH_WORKER + return DiagnosisActionType.RELAUNCH_WORKER def _report_failure_to_master( self, failures: Dict[int, ProcessFailure], restart_count: int diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py index ad8c47de3..6a760820c 100644 --- a/dlrover/python/elastic_agent/torch/training.py +++ b/dlrover/python/elastic_agent/torch/training.py @@ -88,7 +88,7 @@ ) from dlrover.python.common.log import default_logger as logger from dlrover.python.common.worker import WorkerContext -from dlrover.python.diagnosis.common.constants import DiagnosisAction +from dlrover.python.diagnosis.common.constants import DiagnosisActionType from dlrover.python.elastic_agent.config.paral_config_tuner import ( ParalConfigTuner, ) @@ -872,9 +872,9 @@ def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult: except Exception as e: logger.warning(f"Failed to diagnose errors: {e}") if self._remaining_failovers > 0: - action = DiagnosisAction.RESTART_WORKER + action = DiagnosisActionType.RESTART_WORKER else: - action = DiagnosisAction.RELAUNCH_WORKER + action = DiagnosisActionType.RELAUNCH_WORKER self._process_diagnose_action(action) if self._worker_group.state == WorkerState.FAILED: return run_result @@ -887,10 +887,10 @@ def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult: raise Exception(f"[{role}] worker group in {state.name} state") def _process_diagnose_action(self, action: str): - if action == DiagnosisAction.RESTART_WORKER: + if action == DiagnosisActionType.RESTART_WORKER: self._remaining_failovers -= 1 self._restart_workers(self._worker_group) - elif action == DiagnosisAction.RELAUNCH_WORKER: + elif action == DiagnosisActionType.RELAUNCH_WORKER: self._stop_workers(self._worker_group) self._worker_group.state = WorkerState.FAILED diff --git a/dlrover/python/master/diagnosis/diagnosis.py b/dlrover/python/master/diagnosis/diagnosis.py index 4102f20bf..62b49c663 100644 --- a/dlrover/python/master/diagnosis/diagnosis.py +++ b/dlrover/python/master/diagnosis/diagnosis.py @@ -45,12 +45,16 @@ def has_expired(timestamp: float, time_period: int) -> bool: class DiagnosisManager: def __init__(self, job_manager=None): + self._job_manager = job_manager self._is_observing_started = False self._data_manager: DiagnosisDataManager = DiagnosisDataManager( job_manager, 600 ) self._diagnostician: Diagnostician = Diagnostician(self._data_manager) + def is_job_manager_exist(self) -> bool: + return self._job_manager is not None + def collect_diagnosis_data(self, data: DiagnosisData): self._data_manager.store_data(data) @@ -77,8 +81,8 @@ def start_observing(self): try: thread = threading.Thread( - target=self._diagnose_failures(), - name="diagnose_failures", + target=self._diagnose_failures, + name="failure_diagnosis", daemon=True, ) thread.start() @@ -100,12 +104,12 @@ def _diagnose_failures(self): logger.info("Stop to diagnose failures for observing.") break logger.info( - f"Diagnosis data size: {self._data_manager.get_data_size()}." + f"Current diagnosis data size: {self._data_manager.get_data_size()}." ) observed_problems = self._diagnostician.observe_training() for problem in observed_problems: - logger.info(f"observed problems: {problem}") + logger.info(f"Observe problem in diagnosing: {problem}") root_causes = self._diagnostician.diagnose_failure(problem) for root_cause in root_causes: logger.info(f"identify root cause: {root_cause}") diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py index 1d85f8ec3..5f53dbe9f 100644 --- a/dlrover/python/master/node/dist_job_manager.py +++ b/dlrover/python/master/node/dist_job_manager.py @@ -20,6 +20,8 @@ from datetime import datetime from typing import Dict, List, Optional +from diagnosis.common.constants import DiagnosisActionType + from dlrover.python.common.constants import ( DistributionStrategy, ElasticJobLabel, @@ -206,6 +208,11 @@ def start(self): worker_num += plan.node_group_resources[NodeType.CHIEF].count self._speed_monitor.set_target_worker_num(worker_num) self._training_node_config.set_node_num(worker_num) + threading.Thread( + target=self._diagnosis_action_consumer, + name="diagnosis_action_consumer", + daemon=True, + ).start() threading.Thread( target=self._monitor_nodes, name="node_monitor", daemon=True ).start() @@ -415,6 +422,34 @@ def _init_job_auto_scaler(self): "Create job autoscaler: %s", self._job_autoscaler.__class__ ) + def _diagnosis_action_consumer(self): + logger.info("Start consuming diagnosis actions.") + while True: + if self._stopped: + logger.info("Stop consuming diagnosis actions.") + break + try: + if self.get_diagnosis_actions_size() == 0: + time.sleep(5) + continue + + action = self._diagnosis_action_queue.get() + if action.type == DiagnosisActionType.EVENT: + self._report_event( + action.event_type, + action.instance, + action.action, + action.msg, + action.labels, + ) + elif action.type == DiagnosisActionType.MASTER_RELAUNCH_WORKER: + # TODO + pass + except Exception as e: + logger.warning(e) + time.sleep(10) + time.sleep(1) + def _monitor_nodes(self): logger.info("Start monitoring nodes events.") while True: diff --git a/dlrover/python/master/node/job_manager.py b/dlrover/python/master/node/job_manager.py index bc8300ff6..c4a7f0526 100644 --- a/dlrover/python/master/node/job_manager.py +++ b/dlrover/python/master/node/job_manager.py @@ -12,10 +12,13 @@ # limitations under the License. from abc import ABCMeta, abstractmethod +from queue import Queue from typing import Dict from dlrover.python.common.log import default_logger as logger from dlrover.python.common.node import Node +from dlrover.python.diagnosis.common.constants import DiagnosisActionType +from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction from dlrover.python.master.hyperparams.simple_strategy_generator import ( SimpleStrategyGenerator, ) @@ -58,6 +61,8 @@ def __init__( self._training_node_config = TrainingNodeConfig(external_config) + self._diagnosis_action_queue = Queue() + @abstractmethod def start(self): pass @@ -112,9 +117,6 @@ def post_ps_ready(self): def stop(self): pass - def update_node_service_addr(self, node_type, node_id, service_addr): - pass - @abstractmethod def get_cur_cluster_ps(self): pass @@ -199,11 +201,20 @@ def collect_node_heart_beat(self, node_type, node_id, timestamp): """Collect the heart beat message of nodes.""" pass + def put_diagnosis_action(self, diagnosis_action: DiagnosisAction): + self._diagnosis_action_queue.put(diagnosis_action) + + def get_diagnosis_actions_size(self): + return self._diagnosis_action_queue.qsize() + def sync_node_training_port(self, node_id, port) -> SyncNodeTrainingPorts: return self._training_node_config.sync_node_training_port( node_id, port ) + def update_node_service_addr(self, node_type, node_id, service_addr): + pass + def update_node_required_info(self, min_required, max_required, timeout): """ Update the nodes min/max requirements. diff --git a/dlrover/python/tests/test_diagnosis.py b/dlrover/python/tests/test_diagnosis.py index 64c1364e5..6bc1ef171 100644 --- a/dlrover/python/tests/test_diagnosis.py +++ b/dlrover/python/tests/test_diagnosis.py @@ -14,7 +14,16 @@ import time import unittest -from dlrover.python.diagnosis.common.constants import DiagnosisDataType +from dlrover.python.common.constants import NodeStatus +from dlrover.python.diagnosis.common.constants import ( + DiagnosisActionType, + DiagnosisDataType, +) +from dlrover.python.diagnosis.common.diagnosis_action import ( + DiagnosisAction, + EventAction, + NodeRelaunchAction, +) from dlrover.python.diagnosis.common.diagnosis_data import TrainingLog from dlrover.python.master.diagnosis.diagnosis import DiagnosisDataManager @@ -43,6 +52,33 @@ def test_data_manager(self): logs = mgr.get_data(DiagnosisDataType.TRAINING_LOG) self.assertEqual(len(logs), 1) + def test_action_basic(self): + basic_action = DiagnosisAction() + self.assertEqual( + basic_action.diagnosis_type, DiagnosisActionType.NO_ACTION + ) + + event_action = EventAction( + "info", "job", "test", "test123", {"k1": "v1"} + ) + self.assertEqual( + event_action.diagnosis_type, DiagnosisActionType.EVENT + ) + self.assertEqual(event_action.event_type, "info") + self.assertEqual(event_action.instance, "job") + self.assertEqual(event_action.action, "test") + self.assertEqual(event_action.msg, "test123") + self.assertEqual(event_action.labels, {"k1": "v1"}) + + node_relaunch_action = NodeRelaunchAction(1, NodeStatus.FAILED, "hang") + self.assertEqual( + node_relaunch_action.diagnosis_type, + DiagnosisActionType.MASTER_RELAUNCH_WORKER, + ) + self.assertEqual(node_relaunch_action.node_id, 1) + self.assertEqual(node_relaunch_action.node_status, NodeStatus.FAILED) + self.assertEqual(node_relaunch_action.reason, "hang") + if __name__ == "__main__": unittest.main() diff --git a/dlrover/python/tests/test_diagnosis_agent.py b/dlrover/python/tests/test_diagnosis_agent.py index c6770f677..ce9d2a76c 100644 --- a/dlrover/python/tests/test_diagnosis_agent.py +++ b/dlrover/python/tests/test_diagnosis_agent.py @@ -20,7 +20,7 @@ from dlrover.python.common import env_utils from dlrover.python.common.constants import RendezvousName from dlrover.python.common.worker import WorkerContext -from dlrover.python.diagnosis.common.constants import DiagnosisAction +from dlrover.python.diagnosis.common.constants import DiagnosisActionType from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.elastic_agent.diagnosis.diagnosis_agent import ( DiagnosisAgent, @@ -82,21 +82,21 @@ def test_diagnose_training(self): ) action = agent.diagnose_training_failure(wc) - self.assertEqual(action, DiagnosisAction.RESTART_WORKER) + self.assertEqual(action, DiagnosisActionType.RESTART_WORKER) agent._errors = "error code is 507035" action = agent.diagnose_training_failure(wc) - self.assertEqual(action, DiagnosisAction.RELAUNCH_WORKER) + self.assertEqual(action, DiagnosisActionType.RELAUNCH_WORKER) agent._errors = "error code is 11111" wc.remaining_failovers = 0 action = agent.diagnose_training_failure(wc) - self.assertEqual(action, DiagnosisAction.RELAUNCH_WORKER) + self.assertEqual(action, DiagnosisActionType.RELAUNCH_WORKER) agent._errors = " #" wc.remaining_failovers = 2 action = agent.diagnose_training_failure(wc) - self.assertEqual(action, DiagnosisAction.RESTART_WORKER) + self.assertEqual(action, DiagnosisActionType.RESTART_WORKER) def test_worker_training_metric(self): test = WorkerTrainingMetric( diff --git a/dlrover/python/tests/test_job_manager.py b/dlrover/python/tests/test_job_manager.py index 68ccca7e7..1c28871fd 100644 --- a/dlrover/python/tests/test_job_manager.py +++ b/dlrover/python/tests/test_job_manager.py @@ -672,6 +672,7 @@ def test_start_and_stop(self): manager.start() active_threads_name = [t.name for t in threading.enumerate()] + self.assertIn("diagnosis_action_consumer", active_threads_name) self.assertIn("node_monitor", active_threads_name) self.assertIn("node_heart_beat_monitor", active_threads_name) manager.stop() @@ -780,3 +781,5 @@ def test_local_job_manager(self): job_manager.update_succeeded_node(0, "unknown") except Exception: self.fail() + + self.assertEqual(job_manager.get_diagnosis_actions_size(), 0) From d39cf36695485b3884de0785866ed4776b3d6501 Mon Sep 17 00:00:00 2001 From: "chentianyi.cty" Date: Thu, 7 Nov 2024 19:30:56 +0800 Subject: [PATCH 6/7] revert --- .../check_training_hang_operator.py | 2 +- dlrover/python/master/diagnosis/diagnosis.py | 22 +++------- .../python/master/node/dist_job_manager.py | 42 ------------------- 3 files changed, 6 insertions(+), 60 deletions(-) diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py index 8a9175743..9b01cf1e5 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py @@ -134,7 +134,7 @@ def is_hang(self, diagnosis_data: List[DiagnosisData]): f"threshold: {hang_last_threshold}" ) if _dlrover_ctx.hang_detection == 1: - # TODO + # logs only pass elif _dlrover_ctx.hang_detection == 2: # TODO diff --git a/dlrover/python/master/diagnosis/diagnosis.py b/dlrover/python/master/diagnosis/diagnosis.py index 62b49c663..ad0f73a11 100644 --- a/dlrover/python/master/diagnosis/diagnosis.py +++ b/dlrover/python/master/diagnosis/diagnosis.py @@ -35,6 +35,7 @@ from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_training_hang_operator import ( # noqa: E501 CheckTrainingHangOperator, ) +from dlrover.python.master.node.job_context import get_job_context def has_expired(timestamp: float, time_period: int) -> bool: @@ -44,17 +45,11 @@ def has_expired(timestamp: float, time_period: int) -> bool: class DiagnosisManager: - def __init__(self, job_manager=None): - self._job_manager = job_manager + def __init__(self): self._is_observing_started = False - self._data_manager: DiagnosisDataManager = DiagnosisDataManager( - job_manager, 600 - ) + self._data_manager: DiagnosisDataManager = DiagnosisDataManager() self._diagnostician: Diagnostician = Diagnostician(self._data_manager) - def is_job_manager_exist(self) -> bool: - return self._job_manager is not None - def collect_diagnosis_data(self, data: DiagnosisData): self._data_manager.store_data(data) @@ -119,23 +114,16 @@ def _diagnose_failures(self): class DiagnosisDataManager: - def __init__(self, job_manager=None, expire_time_period=600): + def __init__(self, expire_time_period=600): self._diagnosis_data: Dict[str, deque[DiagnosisData]] = {} self.expire_time_period = expire_time_period - self._job_manager = job_manager + self._job_context = get_job_context() self._lock = threading.Lock() - @property - def job_manager(self): - return self._job_manager - @property def data(self): return self._diagnosis_data - def with_runtime_context(self) -> bool: - return self.job_manager is not None - def store_data(self, data: DiagnosisData): data_type = data.data_type with self._lock: diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py index 0caf190bf..61ce60697 100644 --- a/dlrover/python/master/node/dist_job_manager.py +++ b/dlrover/python/master/node/dist_job_manager.py @@ -212,11 +212,6 @@ def start(self): worker_num += plan.node_group_resources[NodeType.CHIEF].count self._speed_monitor.set_target_worker_num(worker_num) self._training_node_config.set_node_num(worker_num) - threading.Thread( - target=self._diagnosis_action_consumer, - name="diagnosis_action_consumer", - daemon=True, - ).start() threading.Thread( target=self._monitor_nodes, name="node_monitor", daemon=True ).start() @@ -421,34 +416,6 @@ def _init_job_auto_scaler(self): "Create job autoscaler: %s", self._job_autoscaler.__class__ ) - def _diagnosis_action_consumer(self): - logger.info("Start consuming diagnosis actions.") - while True: - if self._stopped: - logger.info("Stop consuming diagnosis actions.") - break - try: - if self.get_diagnosis_actions_size() == 0: - time.sleep(5) - continue - - action = self._diagnosis_action_queue.get() - if action.type == DiagnosisActionType.EVENT: - self._report_event( - action.event_type, - action.instance, - action.action, - action.msg, - action.labels, - ) - elif action.type == DiagnosisActionType.MASTER_RELAUNCH_WORKER: - # TODO - pass - except Exception as e: - logger.warning(e) - time.sleep(10) - time.sleep(1) - def _monitor_nodes(self): logger.info("Start monitoring nodes events.") while True: @@ -685,9 +652,6 @@ def _get_pod_unique_labels(self, node: Node): ElasticJobLabel.RANK_INDEX_KEY: node.rank_index, } - def _process_diagnosis_action(self, action: DiagnosisAction): - pass - def _process_event(self, event: NodeEvent): node_type = event.node.type node_status = event.node.status @@ -1274,12 +1238,6 @@ def process_reported_node_event(self, node_event: NodeEvent): def get_node_required_info(self): return self._nodes_required - def get_total_node_num_by_type(self, node_type): - if not self._job_nodes: - return 0 - - return len(self._job_nodes[node_type]) - def get_job_strategy(self): return self._job_args.distribution_strategy From 01e4a5eaae09e485e8d72d3b6c75c5703cf9dcc4 Mon Sep 17 00:00:00 2001 From: "chentianyi.cty" Date: Thu, 7 Nov 2024 20:14:48 +0800 Subject: [PATCH 7/7] lint --- dlrover/python/diagnosis/common/constants.py | 2 +- .../diagnosis/common/diagnosis_action.py | 2 +- .../check_training_hang_operator.py | 4 ++-- dlrover/python/master/diagnosis/diagnosis.py | 3 ++- dlrover/python/master/dist_master.py | 2 +- dlrover/python/master/node/dist_job_manager.py | 18 +++++++++++++++--- dlrover/python/tests/test_diagnosis.py | 8 ++++++-- 7 files changed, 28 insertions(+), 11 deletions(-) diff --git a/dlrover/python/diagnosis/common/constants.py b/dlrover/python/diagnosis/common/constants.py index e190a8298..b12ac485c 100644 --- a/dlrover/python/diagnosis/common/constants.py +++ b/dlrover/python/diagnosis/common/constants.py @@ -24,7 +24,7 @@ class InferenceConfigKey(object): class DiagnosisConstant(object): MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180 AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 60 - MASTER = -1 + MASTER_INSTANCE = -1 ANY_INSTANCE = -2 LOCAL_INSTANCE = -3 ACTION_EXPIRED_TIME_PERIOD_DEFAULT = 60 * 5 diff --git a/dlrover/python/diagnosis/common/diagnosis_action.py b/dlrover/python/diagnosis/common/diagnosis_action.py index 968263859..1025926e4 100644 --- a/dlrover/python/diagnosis/common/diagnosis_action.py +++ b/dlrover/python/diagnosis/common/diagnosis_action.py @@ -43,7 +43,7 @@ class DiagnosisAction(metaclass=ABCMeta): def __init__( self, action_type=DiagnosisActionType.NONE, - instance=DiagnosisConstant.MASTER, + instance=DiagnosisConstant.MASTER_INSTANCE, timestamp=0, expired_time_period=60 * 1000, ): diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py index 9b01cf1e5..14a100f4d 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py @@ -134,10 +134,10 @@ def is_hang(self, diagnosis_data: List[DiagnosisData]): f"threshold: {hang_last_threshold}" ) if _dlrover_ctx.hang_detection == 1: - # logs only + # trigger event action pass elif _dlrover_ctx.hang_detection == 2: - # TODO + # trigger relaunch action pass return True diff --git a/dlrover/python/master/diagnosis/diagnosis.py b/dlrover/python/master/diagnosis/diagnosis.py index ad0f73a11..f2809e2f4 100644 --- a/dlrover/python/master/diagnosis/diagnosis.py +++ b/dlrover/python/master/diagnosis/diagnosis.py @@ -99,7 +99,8 @@ def _diagnose_failures(self): logger.info("Stop to diagnose failures for observing.") break logger.info( - f"Current diagnosis data size: {self._data_manager.get_data_size()}." + "Current diagnosis " + f"data size: {self._data_manager.get_data_size()}." ) observed_problems = self._diagnostician.observe_training() diff --git a/dlrover/python/master/dist_master.py b/dlrover/python/master/dist_master.py index cdff7d61f..c68942e2c 100644 --- a/dlrover/python/master/dist_master.py +++ b/dlrover/python/master/dist_master.py @@ -143,7 +143,7 @@ def __init__( error_monitor ), } - self.diagnosis_manager = DiagnosisManager(self.job_manager) + self.diagnosis_manager = DiagnosisManager() self.job_metric_collector = self._create_metric_collector_if_needed( args ) diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py index 61ce60697..f2d7a0df4 100644 --- a/dlrover/python/master/node/dist_job_manager.py +++ b/dlrover/python/master/node/dist_job_manager.py @@ -20,8 +20,6 @@ from datetime import datetime from typing import Dict, List, Optional -from diagnosis.common.constants import DiagnosisActionType - from dlrover.python.common.constants import ( DistributionStrategy, ElasticJobLabel, @@ -38,6 +36,10 @@ from dlrover.python.common.grpc import ParallelConfig from dlrover.python.common.log import default_logger as logger from dlrover.python.common.node import Node, NodeGroupResource +from dlrover.python.diagnosis.common.constants import ( + DiagnosisActionType, + DiagnosisConstant, +) from dlrover.python.diagnosis.common.diagnosis_action import ( DiagnosisAction, NoAction, @@ -478,7 +480,11 @@ def _diagnose_job(self): logger.warning(e) detail_trace_back = traceback.format_exc() logger.warning(detail_trace_back) - self._process_diagnosis_action(self._job_context.next_action()) + self._process_diagnosis_action( + self._job_context.next_action( + instance=DiagnosisConstant.MASTER_INSTANCE + ) + ) time.sleep(15) def _get_dead_node_event(self, window_interval=900) -> List[NodeEvent]: @@ -652,6 +658,12 @@ def _get_pod_unique_labels(self, node: Node): ElasticJobLabel.RANK_INDEX_KEY: node.rank_index, } + def _process_diagnosis_action(self, action: DiagnosisAction): + if not action or action.action_type == DiagnosisActionType.NONE: + return + + # TODO + def _process_event(self, event: NodeEvent): node_type = event.node.type node_status = event.node.status diff --git a/dlrover/python/tests/test_diagnosis.py b/dlrover/python/tests/test_diagnosis.py index 876dc3070..04586798d 100644 --- a/dlrover/python/tests/test_diagnosis.py +++ b/dlrover/python/tests/test_diagnosis.py @@ -57,13 +57,17 @@ def test_data_manager(self): def test_action_basic(self): basic_action = DiagnosisAction() self.assertEqual(basic_action.action_type, DiagnosisActionType.NONE) - self.assertEqual(basic_action._instance, DiagnosisConstant.MASTER) + self.assertEqual( + basic_action._instance, DiagnosisConstant.MASTER_INSTANCE + ) event_action = EventAction( "info", "job", "test", "test123", {"k1": "v1"} ) self.assertEqual(event_action.action_type, DiagnosisActionType.EVENT) - self.assertEqual(event_action._instance, DiagnosisConstant.MASTER) + self.assertEqual( + event_action._instance, DiagnosisConstant.MASTER_INSTANCE + ) self.assertEqual(event_action.event_type, "info") self.assertEqual(event_action.event_instance, "job") self.assertEqual(event_action.event_action, "test")