From adcec424def828e48f3d173614ae7275ab38c85d Mon Sep 17 00:00:00 2001 From: nguu0123 Date: Wed, 28 Aug 2024 12:13:43 +0300 Subject: [PATCH 1/5] update --- example/docker_report/config/client1.yaml | 38 +++++++++---------- requirements-dev.lock | 2 +- requirements.lock | 2 +- src/qoa4ml/config/configs.py | 10 ++++- src/qoa4ml/connector/debug_connector.py | 8 ++-- src/qoa4ml/lang/datamodel_enum.py | 2 +- src/qoa4ml/probes/docker_monitoring_probe.py | 20 ++++++++-- src/qoa4ml/probes/probe.py | 7 +++- src/qoa4ml/probes/process_monitoring_probe.py | 13 +++++-- src/qoa4ml/probes/system_monitoring_probe.py | 13 +++++-- src/qoa4ml/qoa_client.py | 34 ++++++++++++----- src/qoa4ml/reports/resources_report_model.py | 18 +++++++-- src/qoa4ml/utils/jetson_utils.py | 4 ++ tests/gpu_test.py | 1 + tests/qoaclient_test/config/client.yaml | 2 + tests/qoaclient_test/qoaclient_test.py | 8 +++- tox.ini | 2 +- 17 files changed, 129 insertions(+), 55 deletions(-) diff --git a/example/docker_report/config/client1.yaml b/example/docker_report/config/client1.yaml index a53b65e..8607700 100644 --- a/example/docker_report/config/client1.yaml +++ b/example/docker_report/config/client1.yaml @@ -15,23 +15,23 @@ connector: # exchange_name: test_qoa4ml # exchange_type: topic # out_routing_key: test.client1 -# probes: -# - probe_type: "docker" -# frequency: 9999999 -# require_register: false -# log_latency_flag: false -# environment: Edge -# container_list: ["test"] +probes: + - probe_type: "docker" + frequency: 9999999 + require_register: false + log_latency_flag: false + environment: Edge + container_list: ["test"] -# - probe_type: "system" -# frequency: 1 -# require_register: false -# log_latency_flag: false -# environment: Edge -# node_name: "Edge1" -# -# - probe_type: "process" -# frequency: 1 -# require_register: false -# log_latency_flag: false -# environment: Edge + - probe_type: "system" + frequency: 1 + require_register: false + log_latency_flag: false + environment: Edge + node_name: "Edge1" + + - probe_type: "process" + frequency: 1 + require_register: false + log_latency_flag: false + environment: Edge diff --git a/requirements-dev.lock b/requirements-dev.lock index c7831da..d439db2 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -129,7 +129,7 @@ namex==0.0.8 # via keras nodeenv==1.9.1 # via pre-commit -numpy==1.23.5 +numpy==1.26.4 # via h5py # via keras # via ml-dtypes diff --git a/requirements.lock b/requirements.lock index 44faa64..8ec1803 100644 --- a/requirements.lock +++ b/requirements.lock @@ -79,7 +79,7 @@ ml-dtypes==0.4.0 # via tensorflow namex==0.0.8 # via keras -numpy==1.23.5 +numpy==1.26.4 # via h5py # via keras # via ml-dtypes diff --git a/src/qoa4ml/config/configs.py b/src/qoa4ml/config/configs.py index c1d0679..2ef6dc2 100644 --- a/src/qoa4ml/config/configs.py +++ b/src/qoa4ml/config/configs.py @@ -99,12 +99,20 @@ class KafkaCollectorConfig(BaseModel): poll_inteval: float = 1.0 +class DebugConnectorConfig(BaseModel): + silence: bool + + # TODO: test if loading the config, the type of the config can be found CollectorConfigClass = Union[ AMQPCollectorConfig, SocketCollectorConfig, KafkaCollectorConfig, dict ] ConnectorConfigClass = Union[ - AMQPConnectorConfig, SocketConnectorConfig, KafkaConnectorConfig, dict + AMQPConnectorConfig, + SocketConnectorConfig, + KafkaConnectorConfig, + DebugConnectorConfig, + dict, ] diff --git a/src/qoa4ml/connector/debug_connector.py b/src/qoa4ml/connector/debug_connector.py index 2cfb575..70f414a 100644 --- 
a/src/qoa4ml/connector/debug_connector.py +++ b/src/qoa4ml/connector/debug_connector.py @@ -2,12 +2,14 @@ from devtools import debug +from ..config.configs import DebugConnectorConfig from .base_connector import BaseConnector class DebugConnector(BaseConnector): - def __init__(self): - pass + def __init__(self, config: DebugConnectorConfig): + self.silence = config.silence def send_report(self, body_message: str): - debug(json.loads(body_message)) + if not self.silence: + debug(json.loads(body_message)) diff --git a/src/qoa4ml/lang/datamodel_enum.py b/src/qoa4ml/lang/datamodel_enum.py index 4c3acc6..e0dc4db 100644 --- a/src/qoa4ml/lang/datamodel_enum.py +++ b/src/qoa4ml/lang/datamodel_enum.py @@ -182,7 +182,7 @@ class ReportTypeEnum(Enum): security = "security_report" -class EnvironmentEnum(Enum): +class EnvironmentEnum(str, Enum): hpc = "HPC" edge = "Edge" cloud = "Cloud" diff --git a/src/qoa4ml/probes/docker_monitoring_probe.py b/src/qoa4ml/probes/docker_monitoring_probe.py index 5ef848c..6a7ecc8 100644 --- a/src/qoa4ml/probes/docker_monitoring_probe.py +++ b/src/qoa4ml/probes/docker_monitoring_probe.py @@ -3,7 +3,9 @@ import docker -from ..config.configs import DockerProbeConfig +from qoa4ml.reports.resources_report_model import DockerReport + +from ..config.configs import ClientInfo, DockerProbeConfig from ..connector.base_connector import BaseConnector from ..utils.docker_util import get_docker_stats from ..utils.logger import qoa_logger @@ -11,8 +13,13 @@ class DockerMonitoringProbe(Probe): - def __init__(self, config: DockerProbeConfig, connector: BaseConnector) -> None: - super().__init__(config, connector) + def __init__( + self, + config: DockerProbeConfig, + connector: BaseConnector, + client_info: ClientInfo, + ) -> None: + super().__init__(config, connector, client_info) self.config = config if self.config.require_register: self.obs_service_url = self.config.obs_service_url @@ -21,7 +28,12 @@ def __init__(self, config: DockerProbeConfig, connector: BaseConnector) -> None: def create_report(self): try: reports = get_docker_stats(self.docker_client, self.config.container_list) - reports_dict = [report.model_dump() for report in reports] + docker_report = DockerReport( + metadata=self.client_info, + timestamp=time.time(), + container_reports=reports, + ) + reports_dict = docker_report.model_dump() # NOTE: if the reports dict is empty, the loop will run very fast, so here add 2 seconds as if there is container to report if not reports_dict: time.sleep(2) diff --git a/src/qoa4ml/probes/probe.py b/src/qoa4ml/probes/probe.py index 1a08881..b6b9a41 100644 --- a/src/qoa4ml/probes/probe.py +++ b/src/qoa4ml/probes/probe.py @@ -3,15 +3,18 @@ from abc import ABC, abstractmethod from typing import Any -from ..config.configs import ProbeConfig +from ..config.configs import ClientInfo, ProbeConfig from ..connector.base_connector import BaseConnector from ..utils.qoa_utils import make_folder from ..utils.repeated_timer import RepeatedTimer class Probe(ABC): - def __init__(self, config: ProbeConfig, connector: BaseConnector) -> None: + def __init__( + self, config: ProbeConfig, connector: BaseConnector, client_info: ClientInfo + ) -> None: self.config = config + self.client_info = client_info self.frequency = self.config.frequency self.monitoring_interval = 1.0 / self.frequency self.execution_flag = False diff --git a/src/qoa4ml/probes/process_monitoring_probe.py b/src/qoa4ml/probes/process_monitoring_probe.py index 26e94b2..9a5fb50 100644 --- a/src/qoa4ml/probes/process_monitoring_probe.py +++ 
b/src/qoa4ml/probes/process_monitoring_probe.py @@ -7,7 +7,7 @@ import lazy_import import psutil -from ..config.configs import ProcessProbeConfig +from ..config.configs import ClientInfo, ProcessProbeConfig from ..connector.base_connector import BaseConnector from ..lang.datamodel_enum import EnvironmentEnum from ..utils.qoa_utils import ( @@ -31,8 +31,13 @@ class ProcessMonitoringProbe(Probe): - def __init__(self, config: ProcessProbeConfig, connector: BaseConnector) -> None: - super().__init__(config, connector) + def __init__( + self, + config: ProcessProbeConfig, + connector: BaseConnector, + client_info: ClientInfo, + ) -> None: + super().__init__(config, connector, client_info) self.config = config if self.config.pid is None: self.pid = os.getpid() @@ -48,7 +53,7 @@ def __init__(self, config: ProcessProbeConfig, connector: BaseConnector) -> None self.metadata = {"pid": str(self.pid), "user": self.process.username()} else: self.metadata = resources_report_model.ProcessMetadata( - pid=str(self.pid), user=self.process.username() + pid=str(self.pid), user=self.process.username(), client_info=client_info ) def get_cpu_usage(self): diff --git a/src/qoa4ml/probes/system_monitoring_probe.py b/src/qoa4ml/probes/system_monitoring_probe.py index 34ea101..732f2bb 100644 --- a/src/qoa4ml/probes/system_monitoring_probe.py +++ b/src/qoa4ml/probes/system_monitoring_probe.py @@ -5,7 +5,7 @@ import lazy_import -from ..config.configs import SystemProbeConfig +from ..config.configs import ClientInfo, SystemProbeConfig from ..connector.base_connector import BaseConnector from ..lang.datamodel_enum import EnvironmentEnum from ..utils.gpu_utils import get_sys_gpu_metadata, get_sys_gpu_usage @@ -27,8 +27,13 @@ class SystemMonitoringProbe(Probe): - def __init__(self, config: SystemProbeConfig, connector: BaseConnector) -> None: - super().__init__(config, connector) + def __init__( + self, + config: SystemProbeConfig, + connector: BaseConnector, + client_info: ClientInfo, + ) -> None: + super().__init__(config, connector, client_info) self.config = config if self.config.node_name is None: self.node_name = socket.gethostname().split(".")[0] @@ -91,7 +96,7 @@ def create_report(self): else: report = resources_report_model.SystemReport( metadata=resources_report_model.SystemMetadata( - node_name=self.node_name + node_name=self.node_name, client_info=self.client_info ), timestamp=round(timestamp), cpu=resources_report_model.ResourceReport( diff --git a/src/qoa4ml/qoa_client.py b/src/qoa4ml/qoa_client.py index e80520b..9be6494 100644 --- a/src/qoa4ml/qoa_client.py +++ b/src/qoa4ml/qoa_client.py @@ -19,7 +19,9 @@ from .config.configs import ( AMQPConnectorConfig, ClientConfig, + ClientInfo, ConnectorConfig, + DebugConnectorConfig, DockerProbeConfig, ProbeConfig, ProcessProbeConfig, @@ -143,7 +145,9 @@ def __init__( self.probes_list = None if self.configuration.probes: - self.probes_list = self.init_probes(self.configuration.probes) + self.probes_list = self.init_probes( + self.configuration.probes, self.configuration.client + ) # lock report to guarantee consistency self.lock = threading.Lock() @@ -154,7 +158,9 @@ def registration(self, url: str): "POST", url, headers=headers, data=self.client_config.json() ) - def init_probes(self, probe_config_list: list[ProbeConfig]): + def init_probes( + self, probe_config_list: list[ProbeConfig], client_info: ClientInfo + ): probes_list: list[Probe] = [] # TODO: each probe can have their own connector if self.default_connector: @@ -164,17 +170,20 @@ def init_probes(self, 
probe_config_list: list[ProbeConfig]): selected_connector = DebugConnector() for probe_config in probe_config_list: + # TODO: can be simplify for less duplicate code if isinstance(probe_config, DockerProbeConfig): probes_list.append( - DockerMonitoringProbe(probe_config, selected_connector) + DockerMonitoringProbe(probe_config, selected_connector, client_info) ) elif isinstance(probe_config, ProcessProbeConfig): probes_list.append( - ProcessMonitoringProbe(probe_config, selected_connector) + ProcessMonitoringProbe( + probe_config, selected_connector, client_info + ) ) elif isinstance(probe_config, SystemProbeConfig): probes_list.append( - SystemMonitoringProbe(probe_config, selected_connector) + SystemMonitoringProbe(probe_config, selected_connector, client_info) ) else: raise ValueError( @@ -184,13 +193,14 @@ def init_probes(self, probe_config_list: list[ProbeConfig]): def init_connector(self, configuration: ConnectorConfig) -> BaseConnector: # init connector from configuration - if ( - configuration.connector_class == ServiceAPIEnum.amqp - and type(configuration.config) is AMQPConnectorConfig + if configuration.connector_class == ServiceAPIEnum.amqp and isinstance( + configuration.config, AMQPConnectorConfig ): return AmqpConnector(configuration.config) - elif configuration.connector_class == ServiceAPIEnum.debug: - return DebugConnector() + elif configuration.connector_class == ServiceAPIEnum.debug and isinstance( + configuration.config, DebugConnectorConfig + ): + return DebugConnector(configuration.config) # TODO: MQTT is both connector and collector # @@ -320,6 +330,10 @@ def report( return return_report.model_dump(mode="json") def start_all_probes(self): + """ + Start all probes in the background, will be killed when the main process exited + NOTE: if the probe takes long to report, and the main process exit, no report may be sent + """ if not self.probes_list: raise RuntimeError( "There is no initiated probes, please recheck the config" diff --git a/src/qoa4ml/reports/resources_report_model.py b/src/qoa4ml/reports/resources_report_model.py index 90b2092..13f6fc6 100644 --- a/src/qoa4ml/reports/resources_report_model.py +++ b/src/qoa4ml/reports/resources_report_model.py @@ -2,13 +2,19 @@ from pydantic import BaseModel +from ..config.configs import ClientInfo -class ProcessMetadata(BaseModel): + +class BaseMetadata(BaseModel): + client_info: ClientInfo | None = None + + +class ProcessMetadata(BaseMetadata): pid: str user: str -class SystemMetadata(BaseModel): +class SystemMetadata(BaseMetadata): node_name: str model: str | None = None @@ -34,7 +40,7 @@ class SystemReport(BaseModel): mem: ResourceReport -class DockerContainerMetadata(BaseModel): +class DockerContainerMetadata(BaseMetadata): id: str image: str @@ -45,3 +51,9 @@ class DockerContainerReport(BaseModel): cpu: ResourceReport gpu: ResourceReport | None = None mem: ResourceReport + + +class DockerReport(BaseModel): + metadata: ClientInfo + timestamp: float + container_reports: list[DockerContainerReport] = [] diff --git a/src/qoa4ml/utils/jetson_utils.py b/src/qoa4ml/utils/jetson_utils.py index 0bd0c4a..49b97e8 100644 --- a/src/qoa4ml/utils/jetson_utils.py +++ b/src/qoa4ml/utils/jetson_utils.py @@ -54,6 +54,10 @@ def find_igpu(): return igpu +def find_dgpu(): + pass + + def get_gpu_status(self): gpu_list = {} # Read iGPU frequency diff --git a/tests/gpu_test.py b/tests/gpu_test.py index 11db5b7..82e4c13 100644 --- a/tests/gpu_test.py +++ b/tests/gpu_test.py @@ -1,4 +1,5 @@ from devtools import debug + from 
qoa4ml.utils.jetson_utils import find_igpu debug(find_igpu()) diff --git a/tests/qoaclient_test/config/client.yaml b/tests/qoaclient_test/config/client.yaml index afd04ac..2da8206 100644 --- a/tests/qoaclient_test/config/client.yaml +++ b/tests/qoaclient_test/config/client.yaml @@ -15,3 +15,5 @@ client: connector: - name: debug_connector connector_class: Debug + config: + silence: True diff --git a/tests/qoaclient_test/qoaclient_test.py b/tests/qoaclient_test/qoaclient_test.py index 9099d32..283b68a 100644 --- a/tests/qoaclient_test/qoaclient_test.py +++ b/tests/qoaclient_test/qoaclient_test.py @@ -26,7 +26,13 @@ def test_creating_qoaclient_from_dict(): "run_id": "test1", "custom_info": {"your_custom_info": 1}, }, - "connector": [{"name": "debug_connector", "connector_class": "Debug"}], + "connector": [ + { + "name": "debug_connector", + "connector_class": "Debug", + "config": {"silence": True}, + } + ], } _ = QoaClient(config_dict=config) diff --git a/tox.ini b/tox.ini index 981b797..4575644 100644 --- a/tox.ini +++ b/tox.ini @@ -18,5 +18,5 @@ deps = pytest-sugar devtools commands = - pytest tests {posargs:tests} + pytest -s tests {posargs:tests} From cd6d5e0b73cf53183226a92ae89feda2584e3444 Mon Sep 17 00:00:00 2001 From: nguu0123 Date: Wed, 28 Aug 2024 12:20:41 +0300 Subject: [PATCH 2/5] bump version --- pyproject.toml | 2 +- .../config/client_with_probes.yaml | 40 +++++++++++++++++++ tests/qoaclient_test/probes_test.py | 12 ++++++ 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/qoaclient_test/config/client_with_probes.yaml create mode 100644 tests/qoaclient_test/probes_test.py diff --git a/pyproject.toml b/pyproject.toml index a32cb5e..e32cd1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ include = ["src/qoa4ml"] [project] name = "qoa4ml" -version = "0.2.16" +version = "0.2.17" description = "Quality of Analysis for Machine Learning" readme = "README.md" diff --git a/tests/qoaclient_test/config/client_with_probes.yaml b/tests/qoaclient_test/config/client_with_probes.yaml new file mode 100644 index 0000000..83bde42 --- /dev/null +++ b/tests/qoaclient_test/config/client_with_probes.yaml @@ -0,0 +1,40 @@ +client: + name: qoa_client_test + username: aaltosea + user_id: "1" + instance_name: aaltosea_instance_test1 + instance_id: b6f83293-cf67-44dd-a7b5-77229d384012 + stage_id: gateway + functionality: REST + application_name: test + role: ml + run_id: test1 + custom_info: + your_custom_info: 1 + +connector: + - name: debug_connector + connector_class: Debug + config: + silence: True + +probes: + - probe_type: "docker" + frequency: 1 + require_register: false + log_latency_flag: false + environment: Edge + container_list: ["test"] + + - probe_type: "system" + frequency: 1 + require_register: false + log_latency_flag: false + environment: Edge + node_name: "Edge1" + + - probe_type: "process" + frequency: 1 + require_register: false + log_latency_flag: false + environment: Edge diff --git a/tests/qoaclient_test/probes_test.py b/tests/qoaclient_test/probes_test.py new file mode 100644 index 0000000..ada181d --- /dev/null +++ b/tests/qoaclient_test/probes_test.py @@ -0,0 +1,12 @@ +import os +import time + +from qoa4ml.qoa_client import QoaClient + +dir_path = os.path.dirname(os.path.realpath(__file__)) + + +def test_probes_reporting(): + qoa_client = QoaClient(config_path=f"{dir_path}/config/client_with_probes.yaml") + qoa_client.start_all_probes() + time.sleep(1) From a07b3b56b85f48913e061643b2409ef2493d549d Mon Sep 17 00:00:00 2001 From: nguu0123 
Date: Wed, 28 Aug 2024 16:10:36 +0300 Subject: [PATCH 3/5] fix incorerct id, should be instance_id --- src/qoa4ml/qoa_client.py | 11 ++++---- .../reports/general_application_report.py | 2 +- src/qoa4ml/reports/ml_reports.py | 28 +++++++++++-------- src/qoa4ml/reports/rohe_reports.py | 6 ++-- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/qoa4ml/qoa_client.py b/src/qoa4ml/qoa_client.py index cc2763f..d968b94 100644 --- a/src/qoa4ml/qoa_client.py +++ b/src/qoa4ml/qoa_client.py @@ -11,10 +11,6 @@ import requests from pydantic import create_model -from qoa4ml.probes.docker_monitoring_probe import DockerMonitoringProbe -from qoa4ml.probes.process_monitoring_probe import ProcessMonitoringProbe -from qoa4ml.probes.system_monitoring_probe import SystemMonitoringProbe - # from .connector.mqtt_connector import Mqtt_Connector from .config.configs import ( AMQPConnectorConfig, @@ -37,9 +33,12 @@ ServiceAPIEnum, ServiceMetricNameEnum, ) +from .probes.docker_monitoring_probe import DockerMonitoringProbe from .probes.probe import Probe +from .probes.process_monitoring_probe import ProcessMonitoringProbe +from .probes.system_monitoring_probe import SystemMonitoringProbe from .reports.abstract_report import AbstractReport -from .reports.rohe_reports import RoheReport +from .reports.ml_reports import MLReport from .utils.logger import qoa_logger from .utils.qoa_utils import ( load_config, @@ -57,7 +56,7 @@ class QoaClient(Generic[T]): def __init__( self, # NOTE: use text, number, enum - report_cls: type[T] = RoheReport, + report_cls: type[T] = MLReport, config_dict: Optional[dict] = None, config_path: Optional[str] = None, registration_url: Optional[str] = None, diff --git a/src/qoa4ml/reports/general_application_report.py b/src/qoa4ml/reports/general_application_report.py index f8ee3ca..3ce064a 100644 --- a/src/qoa4ml/reports/general_application_report.py +++ b/src/qoa4ml/reports/general_application_report.py @@ -23,7 +23,7 @@ def reset(self): self.report = GeneralApplicationReportModel() self.execution_instance = MicroserviceInstance( - id=UUID(self.client_config.id), + id=UUID(self.client_config.instance_id), name=self.client_config.name, functionality=self.client_config.functionality, stage=self.client_config.stage_id, diff --git a/src/qoa4ml/reports/ml_reports.py b/src/qoa4ml/reports/ml_reports.py index 3f4cf11..73f3596 100644 --- a/src/qoa4ml/reports/ml_reports.py +++ b/src/qoa4ml/reports/ml_reports.py @@ -74,7 +74,7 @@ def observe_metric(self, report_type, stage, metric: Metric): self.report.service[stage].metrics[metric.metric_name] = {} self.report.service[stage].metrics[metric.metric_name] |= { - UUID(self.client_config.id): metric + UUID(self.client_config.instance_id): metric } elif report_type == ReportTypeEnum.data: @@ -84,32 +84,36 @@ def observe_metric(self, report_type, stage, metric: Metric): self.report.data[stage].metrics[metric.metric_name] = {} self.report.data[stage].metrics[metric.metric_name] |= { - UUID(self.client_config.id): metric + UUID(self.client_config.instance_id): metric } else: raise ValueError(f"Can't handle report type {report_type}") def observe_inference(self, inference_value): - if self.client_config.id in self.report.ml_inference: + if self.client_config.instance_id in self.report.ml_inference: raise RuntimeWarning( "Inference existed, will override the existing inference" ) - self.report.ml_inference[self.client_config.id] = InferenceInstance( + self.report.ml_inference[self.client_config.instance_id] = InferenceInstance( inference_id=uuid4(), 
- instance_id=UUID(self.client_config.id), + instance_id=UUID(self.client_config.instance_id), functionality=self.client_config.functionality, prediction=inference_value, ) def observe_inference_metric(self, metric: Metric): - if self.client_config.id in self.report.ml_inference: - self.report.ml_inference[self.client_config.id].metrics.append(metric) + if self.client_config.instance_id in self.report.ml_inference: + self.report.ml_inference[self.client_config.instance_id].metrics.append( + metric + ) else: - self.report.ml_inference[self.client_config.id] = InferenceInstance( - inference_id=uuid4(), - instance_id=UUID(self.client_config.id), - functionality=self.client_config.functionality, - metrics=[metric], + self.report.ml_inference[self.client_config.instance_id] = ( + InferenceInstance( + inference_id=uuid4(), + instance_id=UUID(self.client_config.instance_id), + functionality=self.client_config.functionality, + metrics=[metric], + ) ) def generate_report( diff --git a/src/qoa4ml/reports/rohe_reports.py b/src/qoa4ml/reports/rohe_reports.py index 45b630d..39ff3f1 100644 --- a/src/qoa4ml/reports/rohe_reports.py +++ b/src/qoa4ml/reports/rohe_reports.py @@ -36,7 +36,7 @@ def reset(self): self.report = RoheReportModel() self.previous_microservice_instance = [] self.execution_instance = MicroserviceInstance( - id=UUID(self.client_config.id), + id=UUID(self.client_config.instance_id), name=self.client_config.name, functionality=self.client_config.functionality, stage=self.client_config.stage_id, @@ -163,7 +163,7 @@ def observe_metric(self, report_type: ReportTypeEnum, stage: str, metric: Metric self.inference_report.service[stage].metrics[metric.metric_name] = {} self.inference_report.service[stage].metrics[metric.metric_name] |= { - UUID(self.client_config.id): metric + UUID(self.client_config.instance_id): metric } elif report_type == ReportTypeEnum.data: @@ -173,7 +173,7 @@ def observe_metric(self, report_type: ReportTypeEnum, stage: str, metric: Metric self.inference_report.data[stage].metrics[metric.metric_name] = {} self.inference_report.data[stage].metrics[metric.metric_name] |= { - UUID(self.client_config.id): metric + UUID(self.client_config.instance_id): metric } else: raise ValueError(f"Can't handle report type {report_type}") From 94b4c7ee66a5f5338c5f3b476c071d8de45c3c63 Mon Sep 17 00:00:00 2001 From: nguu0123 Date: Thu, 29 Aug 2024 11:35:55 +0300 Subject: [PATCH 4/5] update --- .gitignore | 2 +- .python-version | 2 +- pyproject.toml | 23 +++---- requirements-dev.lock | 18 ++++-- requirements.lock | 12 +++- src/qoa4ml/metric_mananger.py | 108 ------------------------------- src/qoa4ml/qoa_client.py | 5 -- src/qoa4ml/utils/jetson_utils.py | 12 +--- 8 files changed, 38 insertions(+), 144 deletions(-) delete mode 100644 src/qoa4ml/metric_mananger.py diff --git a/.gitignore b/.gitignore index 5df29e7..51c1430 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,4 @@ modules/orchestration/temp/ *.pem observability/odop_obs/logs/*.txt observability/odop_obs/tinyflux/*.csv - +.python-version diff --git a/.python-version b/.python-version index 1445aee..8e34c81 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.10.14 +3.9.19 diff --git a/pyproject.toml b/pyproject.toml index e32cd1a..dbd566d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,19 +77,20 @@ AaltoSEA = "https://rdsea.github.io/" [tool.rye] dev-dependencies = [ - "mkdocs-material", - "pre-commit", - "devtools", - "ruff", - "codespell", - "pip>=24.2", - "pytest>=8.3.2", - "tox>=4.18.0", - 
"pytest-sugar>=1.0.0", - "tox-uv>=1.11.2", - "tox-gh-actions>=3.2.0", + "mkdocs-material", + "pre-commit", + "devtools", + "ruff", + "codespell", + "pip>=24.2", + "pytest>=8.3.2", + "tox>=4.18.0", + "pytest-sugar>=1.0.0", + "tox-uv>=1.11.2", + "tox-gh-actions>=3.2.0", ] managed = true +universal = true [tool.ruff.lint] diff --git a/requirements-dev.lock b/requirements-dev.lock index d439db2..b088c39 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -7,7 +7,7 @@ # all-features: true # with-sources: false # generate-hashes: false -# universal: false +# universal: true -e file:. absl-py==2.1.0 @@ -39,7 +39,10 @@ click==8.1.7 # via uvicorn codespell==2.3.0 colorama==0.4.6 + # via click + # via mkdocs # via mkdocs-material + # via pytest # via tox confluent-kafka==2.5.0 # via qoa4ml @@ -53,7 +56,7 @@ docker==7.1.0 # via qoa4ml eval-type-backport==0.2.0 # via qoa4ml -exceptiongroup==1.2.2 +exceptiongroup==1.2.2 ; python_full_version < '3.11' # via anyio # via pytest executing==2.0.1 @@ -87,6 +90,9 @@ idna==3.7 # via anyio # via requests importlib-metadata==8.0.0 + # via markdown + # via mkdocs + # via mkdocs-get-deps # via opentelemetry-api iniconfig==2.0.0 # via pytest @@ -210,6 +216,8 @@ pytest==8.3.2 pytest-sugar==1.0.0 python-dateutil==2.9.0.post0 # via ghp-import +pywin32==306 ; sys_platform == 'win32' + # via docker pyyaml==6.0.2 # via mkdocs # via mkdocs-get-deps @@ -252,14 +260,14 @@ tensorboard-data-server==0.7.2 # via tensorboard tensorflow==2.17.0 # via qoa4ml -tensorflow-io-gcs-filesystem==0.37.1 +tensorflow-io-gcs-filesystem==0.37.1 ; python_full_version < '3.12' # via tensorflow termcolor==2.4.0 # via pytest-sugar # via tensorflow tinyflux==1.0.0 # via qoa4ml -tomli==2.0.1 +tomli==2.0.1 ; python_full_version < '3.11' # via pyproject-api # via pytest # via tox @@ -275,7 +283,9 @@ typing-extensions==4.12.2 # via optree # via pydantic # via pydantic-core + # via starlette # via tensorflow + # via tox-uv # via uvicorn urllib3==2.2.2 # via docker diff --git a/requirements.lock b/requirements.lock index 8ec1803..04d3567 100644 --- a/requirements.lock +++ b/requirements.lock @@ -7,7 +7,7 @@ # all-features: true # with-sources: false # generate-hashes: false -# universal: false +# universal: true -e file:. 
absl-py==2.1.0 @@ -26,6 +26,8 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via uvicorn +colorama==0.4.6 ; platform_system == 'Windows' + # via click confluent-kafka==2.5.0 # via qoa4ml deprecated==1.2.14 @@ -35,7 +37,7 @@ docker==7.1.0 # via qoa4ml eval-type-backport==0.2.0 # via qoa4ml -exceptiongroup==1.2.2 +exceptiongroup==1.2.2 ; python_full_version < '3.11' # via anyio fastapi==0.112.1 # via qoa4ml @@ -59,6 +61,7 @@ idna==3.7 # via anyio # via requests importlib-metadata==8.0.0 + # via markdown # via opentelemetry-api keras==3.5.0 # via tensorflow @@ -130,6 +133,8 @@ pydantic-core==2.18.4 # via pydantic pygments==2.18.0 # via rich +pywin32==306 ; sys_platform == 'win32' + # via docker pyyaml==6.0.2 # via qoa4ml requests==2.32.3 @@ -159,7 +164,7 @@ tensorboard-data-server==0.7.2 # via tensorboard tensorflow==2.17.0 # via qoa4ml -tensorflow-io-gcs-filesystem==0.37.1 +tensorflow-io-gcs-filesystem==0.37.1 ; python_full_version < '3.12' # via tensorflow termcolor==2.4.0 # via tensorflow @@ -172,6 +177,7 @@ typing-extensions==4.12.2 # via optree # via pydantic # via pydantic-core + # via starlette # via tensorflow # via uvicorn urllib3==2.2.2 diff --git a/src/qoa4ml/metric_mananger.py b/src/qoa4ml/metric_mananger.py deleted file mode 100644 index 4ff547e..0000000 --- a/src/qoa4ml/metric_mananger.py +++ /dev/null @@ -1,108 +0,0 @@ -from typing import Optional, Union - -from .config.configs import MetricConfig -from .lang.datamodel_enum import MetricClassEnum, MetricNameEnum -from .metric import Counter, Gauge, Histogram, PrometheusMetric, Summary -from .utils.logger import qoa_logger - - -class MetricManager: - def __init__(self) -> None: - self.metric_list: dict[MetricNameEnum, PrometheusMetric] = {} - - def add_metric(self, metric_configs: list[MetricConfig]): - # Add multiple metrics - for metric_config in metric_configs: - self.metric_list[metric_config.name] = self.init_metric(metric_config) - - def reset_metric(self, key: Optional[Union[list, str]] = None): - # TO DO: - try: - if key is None: - for metric_name in self.metric_list: - self.metric_list[metric_name].reset() - elif isinstance(key, list): - for k in key: - self.metric_list[k].reset() - else: - return self.metric_list[key].reset() - except Exception as e: - qoa_logger.error( - str( - f"[ERROR] - Error {type(e)} when resetting metric in QoA client: {e.__traceback__}" - ) - ) - - def get_metric(self, key: Optional[Union[list, str]] = None): - # TO DO: - try: - if key is None: - # Get all metric - return self.metric_list - elif isinstance(key, list): - # Get a list of metrics - return {k: self.metric_list[k] for k in key} - else: - # Get a specific metric - return self.metric_list[key] - except Exception as e: - qoa_logger.error( - str( - f"[ERROR] - Error {type(e)} when getting metric from QoA client: {e.__traceback__}" - ) - ) - - def init_metric(self, configuration: MetricConfig) -> PrometheusMetric: - # init individual metrics - if configuration.metric_class == MetricClassEnum.gauge: - return Gauge( - configuration.name, - configuration.description, - configuration.default_value, - configuration.category, - ) - elif configuration.metric_class == MetricClassEnum.counter: - return Counter( - configuration.name, - configuration.description, - configuration.default_value, - configuration.category, - ) - elif configuration.metric_class == MetricClassEnum.summary: - return Summary( - configuration.name, - configuration.description, - configuration.default_value, - configuration.category, - ) - elif 
configuration.metric_class == MetricClassEnum.histogram: - return Histogram( - configuration.name, - configuration.description, - configuration.default_value, - configuration.category, - ) - else: - raise ValueError( - f"Metric class {configuration.metric_class} is not supported" - ) - - def observe_metric( - self, - metric_name: MetricNameEnum, - value, - category=0, - metric_class: MetricClassEnum = MetricClassEnum.gauge, - description: str = "", - default_value: int = -1, - ): - if metric_name not in self.metric_list: - metric_config = MetricConfig( - name=metric_name, - category=category, - metric_class=metric_class, - description=description, - default_value=default_value, - ) - self.metric_list[metric_name] = self.init_metric(metric_config) - self.metric_list[metric_name].set(value) diff --git a/src/qoa4ml/qoa_client.py b/src/qoa4ml/qoa_client.py index d968b94..10aae2b 100644 --- a/src/qoa4ml/qoa_client.py +++ b/src/qoa4ml/qoa_client.py @@ -71,7 +71,6 @@ def __init__( self.configuration = ClientConfig.model_validate(load_config(config_path)) self.client_config = self.configuration.client - # self.metric_manager = MetricManager() self.connector_list: dict[str, BaseConnector] = {} self.timer_flag = False self.functionality = self.client_config.functionality @@ -227,10 +226,6 @@ def observe_metric( category: int = 0, description: str = "", ): - # self.metric_manager.observe_metric( - # metric_name, value, category, metric_class, description, default_value - # ) - # metric = self.metric_manager.metric_list[metric_name] if category == 0: report_type = ReportTypeEnum.service elif category == 1: diff --git a/src/qoa4ml/utils/jetson_utils.py b/src/qoa4ml/utils/jetson_utils.py index 49b97e8..6fc7fee 100644 --- a/src/qoa4ml/utils/jetson_utils.py +++ b/src/qoa4ml/utils/jetson_utils.py @@ -41,11 +41,9 @@ def find_igpu(): } qoa_logger.info(f'GPU "{name}" status in {path}') qoa_logger.info(f'GPU "{name}" frq in {frq_path}') - # Check if railgate exist path_railgate = os.path.join(path, "railgate_enable") if os.path.isfile(path_railgate): igpu[name]["railgate"] = path_railgate - # Check if 3d scaling exist path_3d_scaling = os.path.join(path, "enable_3d_scaling") if os.path.isfile(path_3d_scaling): igpu[name]["3d_scaling"] = path_3d_scaling @@ -90,28 +88,20 @@ def meminfo(): def get_memory_status(mem_total): memory = {} - # Measure the largest free bank for 4MB - # Count only the biggest Large free bank (lfb) - # Status Memory status_mem = meminfo() - # Read memory use + # NOTE: Read memory use # NvMapMemUsed: Is the shared memory between CPU and GPU # This key is always available on Jetson (not really always) ram_shared = status_mem.get("NvMapMemUsed", 0) if mem_total: - # Update shared size ram_shared = mem_total if ram_shared == 0 else ram_shared - # Extract memory info ram_total = status_mem.get("MemTotal", 0) ram_free = status_mem.get("MemFree", 0) - # ram_available = status_mem.get('MemAvailable', 0) ram_buffer = status_mem.get("Buffers", 0) ram_cached = status_mem.get("Cached", 0) ram_sreclaimable = status_mem.get("SReclaimable", 0) - # ram_Shmem = status_mem.get('Shmem', 0) total_used_memory = ram_total - ram_free cached_memory = ram_cached + ram_sreclaimable # + ram_Shmem - # Add fields for RAM memory["RAM"] = { "tot": ram_total, "used": total_used_memory - (ram_buffer + ram_cached), From 9511279fee87be33a515d5ae5d9e120a0894be72 Mon Sep 17 00:00:00 2001 From: nguu0123 Date: Thu, 29 Aug 2024 12:09:30 +0300 Subject: [PATCH 5/5] fix incorerct timestamp in metadata when using custom report --- 
.python-version | 2 +- requirements-dev.lock | 5 ----- requirements.lock | 2 -- src/qoa4ml/qoa_client.py | 14 +++++++++----- tests/qoaclient_test/qoaclient_test.py | 11 +++++++++++ 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/.python-version b/.python-version index 8e34c81..1445aee 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.9.19 +3.10.14 diff --git a/requirements-dev.lock b/requirements-dev.lock index b088c39..dbcf04c 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -90,9 +90,6 @@ idna==3.7 # via anyio # via requests importlib-metadata==8.0.0 - # via markdown - # via mkdocs - # via mkdocs-get-deps # via opentelemetry-api iniconfig==2.0.0 # via pytest @@ -283,9 +280,7 @@ typing-extensions==4.12.2 # via optree # via pydantic # via pydantic-core - # via starlette # via tensorflow - # via tox-uv # via uvicorn urllib3==2.2.2 # via docker diff --git a/requirements.lock b/requirements.lock index 04d3567..7d13cbb 100644 --- a/requirements.lock +++ b/requirements.lock @@ -61,7 +61,6 @@ idna==3.7 # via anyio # via requests importlib-metadata==8.0.0 - # via markdown # via opentelemetry-api keras==3.5.0 # via tensorflow @@ -177,7 +176,6 @@ typing-extensions==4.12.2 # via optree # via pydantic # via pydantic-core - # via starlette # via tensorflow # via uvicorn urllib3==2.2.2 diff --git a/src/qoa4ml/qoa_client.py b/src/qoa4ml/qoa_client.py index 10aae2b..a242109 100644 --- a/src/qoa4ml/qoa_client.py +++ b/src/qoa4ml/qoa_client.py @@ -304,14 +304,18 @@ def report( if report is None: return_report = self.qoa_report.generate_report(reset, corr_id=corr_id) else: - user_defined_report_odel = create_model( - "UserDefinedReportModel", metadata=(dict, ...), report=(dict, ...) + user_defined_report_model = create_model( + "UserDefinedReportModel", + metadata=(dict, ...), + timestamp=(float, ...), + report=(dict, ...), ) - return_report = user_defined_report_odel( - report=report, metadata=copy.deepcopy(self.client_config.__dict__) + return_report = user_defined_report_model( + report=report, + metadata=copy.deepcopy(self.client_config.__dict__), + timestamp=time.time(), ) - return_report.metadata["timestamp"] = time.time() if submit: if self.default_connector is not None: sub_thread = Thread( diff --git a/tests/qoaclient_test/qoaclient_test.py b/tests/qoaclient_test/qoaclient_test.py index 283b68a..7063287 100644 --- a/tests/qoaclient_test/qoaclient_test.py +++ b/tests/qoaclient_test/qoaclient_test.py @@ -43,3 +43,14 @@ def test_submiting_report(): ) qoa_client.observe_metric("test", random()) qoa_client.report(submit=True) + + +def test_submiting_custom_report(): + qoa_client = QoaClient( + report_cls=MLReport, config_path=f"{dir_path}/config/client.yaml" + ) + qoa_client.observe_metric("test", random()) + report = { + "test": "123456", + } + qoa_client.report(report=report, submit=True)
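
Usage sketch (not part of the patches): a minimal example of the probe/connector wiring this series introduces, assembled from tests/qoaclient_test/config/client_with_probes.yaml and the tests above. The dict-based form is assumed to validate through the same ClientConfig path as the YAML config; all field names and values below are copied from the example configs in this series.

    import time

    from qoa4ml.qoa_client import QoaClient

    # Client metadata, a silenced debug connector (new DebugConnectorConfig.silence
    # field), and a system probe -- mirroring client_with_probes.yaml from this series.
    config = {
        "client": {
            "name": "qoa_client_test",
            "username": "aaltosea",
            "user_id": "1",
            "instance_name": "aaltosea_instance_test1",
            "instance_id": "b6f83293-cf67-44dd-a7b5-77229d384012",
            "stage_id": "gateway",
            "functionality": "REST",
            "application_name": "test",
            "role": "ml",
            "run_id": "test1",
            "custom_info": {"your_custom_info": 1},
        },
        "connector": [
            {
                "name": "debug_connector",
                "connector_class": "Debug",
                "config": {"silence": True},
            }
        ],
        "probes": [
            {
                "probe_type": "system",
                "frequency": 1,
                "require_register": False,
                "log_latency_flag": False,
                "environment": "Edge",
                "node_name": "Edge1",
            }
        ],
    }

    client = QoaClient(config_dict=config)
    # Probes run as background threads and stop when the main process exits,
    # so keep the process alive long enough for at least one report.
    client.start_all_probes()
    time.sleep(1)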