diff --git a/.env.example b/.env.example index b7244697..8b504ad1 100644 --- a/.env.example +++ b/.env.example @@ -9,6 +9,7 @@ USE_EMBEDDING_CACHE=True LOG_LLM_CHAT_CONTENT=False CHAT_FREQUENCY_PENALTY=0.0 CHAT_PRESENCE_PENALTY=0.0 +LOG_TRACE_PATH=log_traces # embedding model configs: EMBEDDING_OPENAI_API_KEY=your_api_key diff --git a/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py index e92a1f00..de5048d9 100644 --- a/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py +++ b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py @@ -1,22 +1,24 @@ # %% from dotenv import load_dotenv -from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import ( - FactorExperimentLoaderFromPDFfiles, -) -from rdagent.scenarios.qlib.factor_task_implementation import ( - COSTEERFG_QUANT_FACTOR_IMPLEMENTATION, -) +from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import FactorExperimentLoaderFromPDFfiles +from rdagent.scenarios.qlib.factor_task_implementation import QlibFactorCoSTEER +from rdagent.log import rdagent_logger as logger assert load_dotenv() def extract_factors_and_implement(report_file_path: str) -> None: - factor_tasks = FactorExperimentLoaderFromPDFfiles().load(report_file_path) - implementation_result = COSTEERFG_QUANT_FACTOR_IMPLEMENTATION().generate(factor_tasks) + with logger.tag('extract_factors_and_implement'): + with logger.tag('load_factor_tasks'): + factor_tasks = FactorExperimentLoaderFromPDFfiles().load(report_file_path) + with logger.tag('implement_factors'): + implementation_result = QlibFactorCoSTEER().generate(factor_tasks) # Qlib to run the implementation return implementation_result if __name__ == "__main__": extract_factors_and_implement("/home/xuyang1/workspace/report.pdf") + +# %% diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py index 9b8a0fe3..5a2869f6 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py @@ -17,7 +17,7 @@ from rdagent.core.evaluation import Evaluator from rdagent.core.evolving_framework import Feedback, QueriedKnowledge from rdagent.core.experiment import Implementation, Task -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.prompts import Prompts from rdagent.core.utils import multiprocessing_wrapper from rdagent.oai.llm_utils import APIBackend @@ -585,7 +585,7 @@ def evaluate( value_feedback=factor_feedback.factor_value_feedback, code_feedback=factor_feedback.code_feedback, ) - RDAgentLog().info(factor_feedback.final_decision) + logger.info(factor_feedback.final_decision) return factor_feedback @@ -640,7 +640,7 @@ def evaluate( None if single_feedback is None else single_feedback.final_decision for single_feedback in multi_implementation_feedback ] - RDAgentLog().info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}") + logger.info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}") return multi_implementation_feedback diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolvable_subjects.py b/rdagent/components/coder/factor_coder/CoSTEER/evolvable_subjects.py index 1bf47073..9af63e5f 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/evolvable_subjects.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/evolvable_subjects.py @@ -4,7 +4,7 @@ FileBasedFactorImplementation, ) from rdagent.core.evolving_framework import EvolvableSubjects -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger class FactorEvolvingItem(FactorExperiment, EvolvableSubjects): @@ -23,7 +23,7 @@ def __init__( sub_gt_implementations, ) != len(self.sub_tasks): self.sub_gt_implementations = None - RDAgentLog().warning( + logger.warning( "The length of sub_gt_implementations is not equal to the length of sub_tasks, set sub_gt_implementations to None", ) else: diff --git a/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py index b18529ae..8cbf4095 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py @@ -28,7 +28,7 @@ RAGStrategy, ) from rdagent.core.experiment import Implementation -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.prompts import Prompts from rdagent.oai.llm_utils import ( APIBackend, @@ -336,7 +336,7 @@ def analyze_component( )["component_no_list"] return [all_component_nodes[index - 1] for index in sorted(list(set(component_no_list)))] except: - RDAgentLog().warning("Error when analyzing components.") + logger.warning("Error when analyzing components.") analyze_component_user_prompt = "Your response is not a valid component index list." return [] @@ -714,7 +714,7 @@ def __init__(self, init_component_list=None) -> None: Load knowledge, offer brief information of knowledge and common handle interfaces """ self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl") - RDAgentLog().info(f"Knowledge Graph loaded, size={self.graph.size()}") + logger.info(f"Knowledge Graph loaded, size={self.graph.size()}") if init_component_list: for component in init_component_list: diff --git a/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py b/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py index 90e00a9c..42b8a7c8 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py @@ -9,7 +9,7 @@ ) from rdagent.components.coder.factor_coder.utils import get_data_folder_intro from rdagent.core.conf import RD_AGENT_SETTINGS -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.prompts import Prompts from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend @@ -25,7 +25,7 @@ def RandomSelect(to_be_finished_task_index, implementation_factors_per_round): implementation_factors_per_round, ) - RDAgentLog().info(f"The random selection is: {to_be_finished_task_index}") + logger.info(f"The random selection is: {to_be_finished_task_index}") return to_be_finished_task_index diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index aa4fecac..82c193e7 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -16,7 +16,7 @@ RuntimeErrorException, ) from rdagent.core.experiment import Experiment, FBImplementation, Task -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import md5_hash @@ -74,7 +74,7 @@ def __init__( ) -> None: super().__init__(*args, **kwargs) self.executed_factor_value_dataframe = executed_factor_value_dataframe - self.logger = RDAgentLog() + self.logger = logger self.raise_exception = raise_exception @staticmethod diff --git a/rdagent/components/coder/model_coder/CoSTEER/evaluators.py b/rdagent/components/coder/model_coder/CoSTEER/evaluators.py index c3e5f468..9ae642d4 100644 --- a/rdagent/components/coder/model_coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/model_coder/CoSTEER/evaluators.py @@ -16,7 +16,7 @@ from rdagent.core.evaluation import Evaluator from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import Implementation, Task -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.prompts import Prompts from rdagent.core.utils import multiprocessing_wrapper from rdagent.oai.llm_utils import APIBackend @@ -328,6 +328,6 @@ def evaluate( None if single_feedback is None else single_feedback.final_decision for single_feedback in multi_implementation_feedback ] - RDAgentLog().info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}") + logger.info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}") return multi_implementation_feedback diff --git a/rdagent/components/coder/model_coder/CoSTEER/evolvable_subjects.py b/rdagent/components/coder/model_coder/CoSTEER/evolvable_subjects.py index 70e485b3..8ed10dd5 100644 --- a/rdagent/components/coder/model_coder/CoSTEER/evolvable_subjects.py +++ b/rdagent/components/coder/model_coder/CoSTEER/evolvable_subjects.py @@ -4,7 +4,7 @@ ModelTask, ) from rdagent.core.evolving_framework import EvolvableSubjects -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger class ModelEvolvingItem(ModelExperiment, EvolvableSubjects): @@ -22,7 +22,7 @@ def __init__( sub_gt_implementations, ) != len(self.sub_tasks): self.sub_gt_implementations = None - RDAgentLog().warning( + logger.warning( "The length of sub_gt_implementations is not equal to the length of sub_tasks, set sub_gt_implementations to None", ) else: diff --git a/rdagent/components/coder/model_coder/task_loader.py b/rdagent/components/coder/model_coder/task_loader.py index 7f1035f1..95cda9fb 100644 --- a/rdagent/components/coder/model_coder/task_loader.py +++ b/rdagent/components/coder/model_coder/task_loader.py @@ -9,13 +9,12 @@ load_and_process_pdfs_by_langchain, ) from rdagent.components.loader.task_loader import ModelTaskLoader -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.prompts import Prompts from rdagent.oai.llm_utils import APIBackend document_process_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") - def extract_model_from_doc(doc_content: str) -> dict: """ Extract model information from document content. @@ -62,7 +61,7 @@ def extract_model_from_doc(doc_content: str) -> dict: else: break - RDAgentLog().info(f"已经完成{len(model_dict)}个模型的提取") + logger.info(f"已经完成{len(model_dict)}个模型的提取") return model_dict diff --git a/rdagent/components/knowledge_management/vector_base.py b/rdagent/components/knowledge_management/vector_base.py index 9322a510..17ca514b 100644 --- a/rdagent/components/knowledge_management/vector_base.py +++ b/rdagent/components/knowledge_management/vector_base.py @@ -5,10 +5,9 @@ import pandas as pd from scipy.spatial.distance import cosine -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend - class KnowledgeMetaData: def __init__(self, content: str = "", label: str = None, embedding=None, identity=None): self.label = label @@ -127,7 +126,7 @@ def __init__(self, vector_df_path: Union[str, Path] = None): else: self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"]) - RDAgentLog().info(f"VectorBase loaded, shape={self.vector_df.shape}") + logger.info(f"VectorBase loaded, shape={self.vector_df.shape}") def shape(self): return self.vector_df.shape @@ -205,4 +204,4 @@ def load(self, vector_df_path, **kwargs): def save(self, vector_df_path, **kwargs): self.vector_df.to_pickle(vector_df_path) - RDAgentLog().info(f"Save vectorBase vector_df to: {vector_df_path}") + logger.info(f"Save vectorBase vector_df to: {vector_df_path}") diff --git a/rdagent/core/conf.py b/rdagent/core/conf.py index 120cb8f8..f709d575 100644 --- a/rdagent/core/conf.py +++ b/rdagent/core/conf.py @@ -15,7 +15,13 @@ class RDAgentSettings(BaseSettings): + # TODO: (xiao) I think LLMSetting may be a better name. # TODO: (xiao) I think most of the config should be in oai.config + # Log configs + # TODO: (xiao) think it can be a seperate config. + log_trace_path: str | None = None + log_llm_chat_content: bool = True + use_azure: bool = True use_azure_token_provider: bool = False managed_identity_client_id: str | None = None @@ -28,7 +34,6 @@ class RDAgentSettings(BaseSettings): prompt_cache_path: str = str(Path.cwd() / "prompt_cache.db") session_cache_folder_location: str = str(Path.cwd() / "session_cache_folder/") max_past_message_include: int = 10 - log_llm_chat_content: bool = True # Chat configs chat_openai_api_key: str = "" diff --git a/rdagent/core/log.py b/rdagent/core/log.py deleted file mode 100644 index 9c6e2b74..00000000 --- a/rdagent/core/log.py +++ /dev/null @@ -1,141 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Sequence - -from loguru import logger - -if TYPE_CHECKING: - from loguru import Logger - - -class LogColors: - """ - ANSI color codes for use in console output. - """ - - RED = "\033[91m" - GREEN = "\033[92m" - YELLOW = "\033[93m" - BLUE = "\033[94m" - MAGENTA = "\033[95m" - CYAN = "\033[96m" - WHITE = "\033[97m" - GRAY = "\033[90m" - BLACK = "\033[30m" - - BOLD = "\033[1m" - ITALIC = "\033[3m" - - END = "\033[0m" - - @classmethod - def get_all_colors(cls: type[LogColors]) -> list: - names = dir(cls) - names = [name for name in names if not name.startswith("__") and not callable(getattr(cls, name))] - return [getattr(cls, name) for name in names] - - def render(self, text: str, color: str = "", style: str = "") -> str: - """ - render text by input color and style. - It's not recommend that input text is already rendered. - """ - # This method is called too frequently, which is not good. - colors = self.get_all_colors() - # Perhaps color and font should be distinguished here. - if color and color in colors: - # Changes to accommodate ruff checks. - # Original code: - # raise ValueError(f"color should be in: {colors} but now is: {color}") - # Description of the problem: - # TRY003 Avoid specifying long messages outside the exception class - # EM102 Exception must not use an f-string literal, assign to variable first - # References: - # https://docs.astral.sh/ruff/rules/raise-vanilla-args/ - # https://docs.astral.sh/ruff/rules/f-string-in-exception/ - error_message = f"color should be in: {colors} but now is: {color}" - raise ValueError(error_message) - if style and style in colors: - # Changes to accommodate ruff checks. - # Original code: - # raise ValueError(f"style should be in: {colors} but now is: {style}") - # Description of the problem: - # TRY003 Avoid specifying long messages outside the exception class - # EM102 Exception must not use an f-string literal, assign to variable first - # References: - # https://docs.astral.sh/ruff/rules/raise-vanilla-args/ - # https://docs.astral.sh/ruff/rules/f-string-in-exception/ - error_message = f"style should be in: {colors} but now is: {style}" - raise ValueError(error_message) - - text = f"{color}{text}{self.END}" - - return f"{style}{text}{self.END}" - - -class RDAgentLog: - # logger.add(loguru_handler, level="INFO") # you can add use storage as a loguru handler - - def __init__(self) -> None: - self.logger: Logger = logger - - def info(self, *args: Sequence, plain: bool = False) -> None: - if plain: - return self.plain_info(*args) - for arg in args: - info = f"{LogColors.WHITE}{arg}{LogColors.END}" - self.logger.info(info) - return None - - def __getstate__(self) -> dict: - return {} - - def __setstate__(self, _: str) -> None: - self.logger = logger - - def plain_info(self, *args: Sequence) -> None: - for arg in args: - # Changes to accommodate ruff checks. - # Original code: - # self.logger.info( - # f""" - # {LogColors.YELLOW}{LogColors.BOLD} - # Info:{LogColors.END}{LogColors.WHITE}{arg}{LogColors.END} - # """, - # ) - # Description of the problem: - # G004 Logging statement uses f-string - # References: - # https://docs.astral.sh/ruff/rules/logging-f-string/ - info = f""" - {LogColors.YELLOW}{LogColors.BOLD} - Info:{LogColors.END}{LogColors.WHITE}{arg}{LogColors.END} - """ - self.logger.info(info) - - def warning(self, *args: Sequence) -> None: - for arg in args: - # Changes to accommodate ruff checks. - # Original code: - # self.logger.warning( - # f"{LogColors.BLUE}{LogColors.BOLD}Warning:{LogColors.END}{arg}", - # ) - # Description of the problem: - # G004 Logging statement uses f-string - # References: - # https://docs.astral.sh/ruff/rules/logging-f-string/ - info = f"{LogColors.BLUE}{LogColors.BOLD}Warning:{LogColors.END}{arg}" - self.logger.warning(info) - - def error(self, *args: Sequence) -> None: - for arg in args: - # Changes to accommodate ruff checks. - # Original code: - # self.logger.error( - # f"{LogColors.RED}{LogColors.BOLD}Error:{LogColors.END}{arg}", - # ) - # Description of the problem: - # G004 Logging statement uses f-string - # References: - # https://docs.astral.sh/ruff/rules/logging-f-string/ - info = f"{LogColors.RED}{LogColors.BOLD}Error:{LogColors.END}{arg}" - self.logger.error(info) diff --git a/rdagent/core/utils.py b/rdagent/core/utils.py index e72b17eb..efec9184 100644 --- a/rdagent/core/utils.py +++ b/rdagent/core/utils.py @@ -19,11 +19,15 @@ class RDAgentException(Exception): # noqa: N818 class SingletonMeta(type): - _instance_dict: ClassVar[dict] = {} + def __init__(cls, *args, **kwargs): + cls._instance_dict: dict = {} + # This must be the class variable instead of sharing one in all classes to avoid confliction like `A()`, `B()` + super().__init__(*args, **kwargs) def __call__(cls, *args: Any, **kwargs: Any) -> Any: # Since it's hard to align the difference call using args and kwargs, we strictly ask to use kwargs in Singleton if args: + # TODO: this restriction can be solved. exception_message = "Please only use kwargs in Singleton to avoid misunderstanding." raise RDAgentException(exception_message) kwargs_hash = hash(tuple(sorted(kwargs.items()))) diff --git a/rdagent/log/__init__.py b/rdagent/log/__init__.py new file mode 100644 index 00000000..ff6f158d --- /dev/null +++ b/rdagent/log/__init__.py @@ -0,0 +1,4 @@ +from .logger import RDAgentLog +from .utils import LogColors + +rdagent_logger: RDAgentLog = RDAgentLog() \ No newline at end of file diff --git a/rdagent/log/base.py b/rdagent/log/base.py new file mode 100644 index 00000000..d99c6a81 --- /dev/null +++ b/rdagent/log/base.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from abc import abstractmethod +from pathlib import Path + + +class Storage: + """ + Basic storage to support saving objects; + + # Usage: + + The storage has mainly two kind of users: + - The logging end: you can choose any of the following method to use the object + - We can use it directly with the native logging storage + - We can use it with other logging tools; For example, serve as a handler for loggers + - The view end: + - Mainly for the subclass of `logging.base.View` + - It should provide two kind of ways to provide content + - offline content provision. + - online content preovision. + """ + + @abstractmethod + def log(self, obj: object, name: str = "", **kwargs: dict) -> str | Path: + """ + + Parameters + ---------- + obj : object + The object for logging. + name : str + The name of the object. For example "a.b.c" + We may log a lot of objects to a same name + + Returns + ------- + str | Path + The storage identifier of the object. + """ + ... + + +class View: + """ + Motivation: + + Display the content in the storage + """ + # TODO: pleas fix me + @abstractmethod + def display(s: Storage, watch: bool=False): + """ + + Parameters + ---------- + s : Storage + + watch : bool + should we watch the new content and display them + """ + ... \ No newline at end of file diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py new file mode 100644 index 00000000..2b9b3ead --- /dev/null +++ b/rdagent/log/logger.py @@ -0,0 +1,145 @@ +import sys +import os + +from loguru import logger +from rdagent.core.utils import SingletonBaseClass +from rdagent.core.conf import RD_AGENT_SETTINGS +from pathlib import Path +from psutil import Process +from datetime import datetime, timezone +from functools import partial +from contextlib import contextmanager +from multiprocessing import Pipe +from multiprocessing.connection import Connection + +from .storage import FileStorage +from .utils import LogColors, get_caller_info + + +class RDAgentLog(SingletonBaseClass): + """ + The files are organized based on the tag & PID + Here is an example tag + + .. code-block:: + + a + - b + - c + - 123 + - common_logs.log + - 1322 + - common_logs.log + - 1233 + - .pkl + - d + - 1233-673 ... + - 1233-4563 ... + - 1233-365 ... + + """ + # TODO: Simplify it to introduce less concepts ( We may merge RDAgentLog, Storage &) + # Solution: Storage => PipeLog, View => PipeLogView, RDAgentLog is an instance of PipeLogger + # PipeLogger.info(...) , PipeLogger.get_resp() to get feedback from frontend. + # def f(): + # logger = PipeLog() + # logger.info("") + # feedback = logger.get_reps() + _tag: str = "" + + def __init__(self, log_trace_path: str | None = RD_AGENT_SETTINGS.log_trace_path) -> None: + + if log_trace_path is None: + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d_%H-%M-%S-%f") + log_trace_path: Path = Path.cwd() / "log" / timestamp + + self.log_trace_path = Path(log_trace_path) + self.log_trace_path.mkdir(parents=True, exist_ok=True) + + self.storage = FileStorage(log_trace_path) + + self.main_pid = os.getpid() + + @contextmanager + def tag(self, tag: str): + if tag.strip() == "": + raise ValueError("Tag cannot be empty.") + if self._tag != "": + tag = "." + tag + + # TODO: It may result in error in mutithreading or co-routine + self._tag = self._tag + tag + yield + self._tag = self._tag[:-len(tag)] + + def get_pids(self) -> str: + ''' + Returns a string of pids from the current process to the main process. + Split by '-'. + ''' + pid = os.getpid() + process = Process(pid) + pid_chain = f"{pid}" + while process.pid != self.main_pid: + parent_pid = process.ppid() + parent_process = Process(parent_pid) + pid_chain = f"{parent_pid}-{pid_chain}" + process = parent_process + return pid_chain + + def file_format(self, record, raw: bool=False): + record["message"] = LogColors.remove_ansi_codes(record["message"]) + if raw: + return "{message}" + return "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}\n" + + def log_object(self, obj: object, *, tag: str = "") -> None: + caller_info = get_caller_info() + tag = f"{self._tag}.{tag}.{self.get_pids()}".strip('.') + + logp = self.storage.log(obj, name=tag, save_type="pkl") + + file_handler_id = logger.add(self.log_trace_path / tag.replace('.','/') / "common_logs.log", format=self.file_format) + logger.patch(lambda r: r.update(caller_info)).info(f"Logging object in {logp.absolute()}") + logger.remove(file_handler_id) + + + def info(self, msg: str, *, tag: str = "", raw: bool=False) -> None: + # TODO: too much duplicated. due to we have no logger with stream context; + caller_info = get_caller_info() + if raw: + logger.remove() + logger.add(sys.stderr, format=lambda r: "{message}") + + tag = f"{self._tag}.{tag}.{self.get_pids()}".strip('.') + log_file_path = self.log_trace_path / tag.replace('.','/') / "common_logs.log" + if raw: + file_handler_id = logger.add(log_file_path, format=partial(self.file_format, raw=True)) + else: + file_handler_id = logger.add(log_file_path, format=self.file_format) + + logger.patch(lambda r: r.update(caller_info)).info(msg) + logger.remove(file_handler_id) + + if raw: + logger.remove() + logger.add(sys.stderr) + + def warning(self, msg: str, *, tag: str = "") -> None: + # TODO: reuse code + # _log(self, msg: str, *, tag: str = "", level=Literal["warning", "error", ..]) -> None: + # getattr(logger.patch(lambda r: r.update(caller_info)), level)(msg) + caller_info = get_caller_info() + + tag = f"{self._tag}.{tag}.{self.get_pids()}".strip('.') + file_handler_id = logger.add(self.log_trace_path / tag.replace('.','/') / "common_logs.log", format=self.file_format) + logger.patch(lambda r: r.update(caller_info)).warning(msg) + logger.remove(file_handler_id) + + def error(self, msg: str, *, tag: str = "") -> None: + caller_info = get_caller_info() + + tag = f"{self._tag}.{tag}.{self.get_pids()}".strip('.') + file_handler_id = logger.add(self.log_trace_path / tag.replace('.','/') / "common_logs.log", format=self.file_format) + logger.patch(lambda r: r.update(caller_info)).error(msg) + logger.remove(file_handler_id) diff --git a/rdagent/log/storage.py b/rdagent/log/storage.py new file mode 100644 index 00000000..50418a24 --- /dev/null +++ b/rdagent/log/storage.py @@ -0,0 +1,54 @@ +import json +import pickle +from typing import Literal + +from pathlib import Path +from datetime import datetime, timezone +from .base import Storage + +class FileStorage(Storage): + """ + The info are logginged to the file systems + + TODO: describe the storage format + """ + + def __init__(self, path: str = "./log/") -> None: + self.path = Path(path) + self.path.mkdir(parents=True, exist_ok=True) + + def log(self, + obj: object, + name: str = "", + save_type: Literal["json", "text", "pkl"] = "text", + timestamp: datetime | None = None, + ) -> Path: + # TODO: We can remove the timestamp after we implement PipeLog + if timestamp is None: + timestamp = datetime.now(timezone.utc) + else: + timestamp = timestamp.astimezone(timezone.utc) + + cur_p = self.path / name.replace(".", "/") + cur_p.mkdir(parents=True, exist_ok=True) + + path = cur_p / f"{timestamp.strftime('%Y-%m-%d_%H-%M-%S-%f')}.log" + + if save_type == "json": + path = path.with_suffix(".json") + with path.open("w") as f: + try: + json.dump(obj, f) + except TypeError: + json.dump(json.loads(str(obj)), f) + return path + elif save_type == "pkl": + path = path.with_suffix(".pkl") + with path.open("wb") as f: + pickle.dump(obj, f) + return path + elif save_type == "text": + obj = str(obj) + with path.open("w") as f: + f.write(obj) + return path diff --git a/rdagent/log/ui/__init__.py b/rdagent/log/ui/__init__.py new file mode 100644 index 00000000..6040d81b --- /dev/null +++ b/rdagent/log/ui/__init__.py @@ -0,0 +1,7 @@ +""" +UI is a kind of view for user. + +We are not sure how generality of the UI, we can't make decision among following options: +- in general folder like rdagent/log/ui +- It is for specific scenario rdagent/scenarios/ +""" diff --git a/rdagent/log/ui/web.py b/rdagent/log/ui/web.py new file mode 100644 index 00000000..ad5aada4 --- /dev/null +++ b/rdagent/log/ui/web.py @@ -0,0 +1,92 @@ +from rdagent.log.base import View, Storage +from pathlib import Path + +class ProcessView(View): + def __init__(self, trace_path: Path): + + + # Save logs to your desired data structure + # ... + pass + + + def display(s: Storage, watch: bool = False): + pass + + + +class WebView(View): + r""" + + We have tree structure for sequence + + session + | \ + ... defined by user ... + | \ + info1 -> info2 -> ... -> info3 -> ... overtime. + + + | | - dispatch according to uri(e.g. `a.b.c. ...`) + Frontend is composed of windows. + Each window can individually display the message flow. + + Some design principles: + session1.module(e.g. implement). + `s.log(a.b.1.c) s.log(a.b.2.c)` should not handed over to users. + + An display example: + + W1 write factor + W2 evaluate factor + W3 backtest + + W123 + R + RX + RXX + RX + + W4 + trace r1 r2 r3 r4 + + What to do next? + 1. Data structure + 2. Map path like `a.b.c` to frontend components + 3. Display logic + """ + def __init__(self, trace_path: Path): + pass + # Save logs to your desired data structure + # ... + + def display(s: Storage, watch: bool = False): + ui = STLUI() + for msg in s.iter_msg(): # iterate overtime + # NOTE: iter_msg will correctly seperate the information. + # TODO: msg may support streaming mode. + ui.dispatch(msg) + pass + + +# TODO: Implement the following classes +class STLWindow: + ... + + def consume_msg(self, msg): + ... # update it's view + + +class STLUI: + wd_l: list[STLWindow] + + def __init__(self): + self.build_ui() + + def build_ui(self): + # control the dispaly of windows + ... + + def dispatch(self, msg): + # map the message to a specific window + ... diff --git a/rdagent/log/utils.py b/rdagent/log/utils.py new file mode 100644 index 00000000..1459f888 --- /dev/null +++ b/rdagent/log/utils.py @@ -0,0 +1,70 @@ +import inspect +import re + + +class LogColors: + """ + ANSI color codes for use in console output. + """ + + RED = "\033[91m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + BLUE = "\033[94m" + MAGENTA = "\033[95m" + CYAN = "\033[96m" + WHITE = "\033[97m" + GRAY = "\033[90m" + BLACK = "\033[30m" + + BOLD = "\033[1m" + ITALIC = "\033[3m" + + END = "\033[0m" + + @classmethod + def get_all_colors(cls: type['LogColors']) -> list: + names = dir(cls) + names = [name for name in names if not name.startswith("__") and not callable(getattr(cls, name))] + return [getattr(cls, name) for name in names] + + def render(self, text: str, color: str = "", style: str = "") -> str: + """ + render text by input color and style. + It's not recommend that input text is already rendered. + """ + # This method is called too frequently, which is not good. + colors = self.get_all_colors() + # Perhaps color and font should be distinguished here. + if color and color in colors: + error_message = f"color should be in: {colors} but now is: {color}" + raise ValueError(error_message) + if style and style in colors: + error_message = f"style should be in: {colors} but now is: {style}" + raise ValueError(error_message) + + text = f"{color}{text}{self.END}" + + return f"{style}{text}{self.END}" + + @staticmethod + def remove_ansi_codes(s: str) -> str: + """ + It is for removing ansi ctrl characters in the string(e.g. colored text) + """ + ansi_escape = re.compile(r'\x1B\[[0-?]*[ -/]*[@-~]') + return ansi_escape.sub('', s) + + +def get_caller_info(): + # Get the current stack information + stack = inspect.stack() + # The second element is usually the caller's information + caller_info = stack[2] + frame = caller_info[0] + info = { + 'line': caller_info.lineno, + 'name': frame.f_globals['__name__'], # Get the module name from the frame's globals + 'function': frame.f_code.co_name, # Get the caller's function name + } + return info diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index 72ab564c..35fb0fb5 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -19,12 +19,11 @@ import tiktoken from rdagent.core.conf import RD_AGENT_SETTINGS -from rdagent.core.log import LogColors, RDAgentLog +from rdagent.log import LogColors, rdagent_logger as logger from rdagent.core.utils import SingletonBaseClass DEFAULT_QLIB_DOT_PATH = Path("./") - def md5_hash(input_string: str) -> str: hash_md5 = hashlib.md5(usedforsecurity=False) input_bytes = input_string.encode("utf-8") @@ -35,17 +34,17 @@ def md5_hash(input_string: str) -> str: try: from azure.identity import DefaultAzureCredential, get_bearer_token_provider except ImportError: - RDAgentLog().warning("azure.identity is not installed.") + logger.warning("azure.identity is not installed.") try: import openai except ImportError: - RDAgentLog().warning("openai is not installed.") + logger.warning("openai is not installed.") try: from llama import Llama except ImportError: - RDAgentLog().warning("llama is not installed.") + logger.warning("llama is not installed.") class ConvManager: @@ -88,6 +87,7 @@ def __init__(self, cache_location: str) -> None: super().__init__() self.cache_location = cache_location db_file_exist = Path(cache_location).exists() + # TODO: sqlite3 does not support multiprocessing. self.conn = sqlite3.connect(cache_location) self.c = self.conn.cursor() if not db_file_exist: @@ -150,11 +150,11 @@ def __init__(self) -> None: self.session_cache_location = Path(self.cfg.session_cache_folder_location) self.cache = {} if not self.session_cache_location.exists(): - RDAgentLog().warning(f"Directory {self.session_cache_location} does not exist.") + logger.warning(f"Directory {self.session_cache_location} does not exist.") self.session_cache_location.mkdir(parents=True, exist_ok=True) json_files = [f for f in self.session_cache_location.iterdir() if f.suffix == ".json"] if not json_files: - RDAgentLog().info(f"No JSON files found in {self.session_cache_location}.") + logger.info(f"No JSON files found in {self.session_cache_location}.") for file_path in json_files: conversation_id = file_path.stem with file_path.open("r") as f: @@ -203,12 +203,14 @@ def build_chat_completion(self, user_prompt: str, **kwargs: Any) -> str: user prompt should always be provided """ messages = self.build_chat_completion_message(user_prompt) + + with logger.tag(f"session_{self.conversation_id}"): + response = self.api_backend._try_create_chat_completion_or_embedding( # noqa: SLF001 + messages=messages, + chat_completion=True, + **kwargs, + ) - response = self.api_backend._try_create_chat_completion_or_embedding( # noqa: SLF001 - messages=messages, - chat_completion=True, - **kwargs, - ) messages.append( { "role": "assistant", @@ -478,8 +480,8 @@ def _try_create_chat_completion_or_embedding( if chat_completion: return self._create_chat_completion_auto_continue(**kwargs) except openai.BadRequestError as e: # noqa: PERF203 - RDAgentLog().warning(e) - RDAgentLog().warning(f"Retrying {i+1}th time...") + logger.warning(e) + logger.warning(f"Retrying {i+1}th time...") if "'messages' must contain the word 'json' in some form" in e.message: kwargs["add_json_in_prompt"] = True elif embedding and "maximum context length" in e.message: @@ -487,8 +489,8 @@ def _try_create_chat_completion_or_embedding( content[: len(content) // 2] for content in kwargs.get("input_content_list", []) ] except Exception as e: # noqa: BLE001 - RDAgentLog().warning(e) - RDAgentLog().warning(f"Retrying {i+1}th time...") + logger.warning(e) + logger.warning(f"Retrying {i+1}th time...") time.sleep(self.retry_wait_seconds) error_message = f"Failed to create chat completion after {max_retry} retries." raise RuntimeError(error_message) @@ -526,7 +528,7 @@ def _create_embedding_inner_function( self.cache.embedding_set(content_to_embedding_dict) return [content_to_embedding_dict[content] for content in input_content_list] - def _build_messages(self, messages: list[dict]) -> str: + def _build_log_messages(self, messages: list[dict]) -> str: log_messages = "" for m in messages: log_messages += ( @@ -537,17 +539,6 @@ def _build_messages(self, messages: list[dict]) -> str: ) return log_messages - def log_messages(self, messages: list[dict]) -> None: - if self.cfg.log_llm_chat_content: - RDAgentLog().info(self._build_messages(messages)) - - def log_response(self, response: str | None = None, *, stream: bool = False) -> None: - if self.cfg.log_llm_chat_content: - if stream: - RDAgentLog().info(f"\n{LogColors.CYAN}Response:{LogColors.END}") - else: - RDAgentLog().info(f"\n{LogColors.CYAN}Response:{response}{LogColors.END}") - def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 self, messages: list[dict], @@ -560,7 +551,9 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 json_mode: bool = False, add_json_in_prompt: bool = False, ) -> str: - self.log_messages(messages) + # TODO: we can add this function back to avoid so much `self.cfg.log_llm_chat_content` + if self.cfg.log_llm_chat_content: + logger.info(self._build_log_messages(messages), tag="llm_messages") # TODO: fail to use loguru adaptor due to stream response input_content_json = json.dumps(messages) input_content_json = ( @@ -569,6 +562,8 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 if self.use_chat_cache: cache_result = self.cache.chat_get(input_content_json) if cache_result is not None: + if self.cfg.log_llm_chat_content: + logger.info(f"{LogColors.CYAN}Response:{cache_result}{LogColors.END}", tag="llm_messages") return cache_result, None if temperature is None: @@ -588,7 +583,8 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 temperature=temperature, ) resp = response[0]["generation"]["content"] - self.log_response(resp) + if self.cfg.log_llm_chat_content: + logger.info(f"{LogColors.CYAN}Response:{resp}{LogColors.END}", tag="llm_messages") elif self.use_gcr_endpoint: body = str.encode( json.dumps( @@ -609,7 +605,8 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 req = urllib.request.Request(self.gcr_endpoint, body, self.headers) # noqa: S310 response = urllib.request.urlopen(req) # noqa: S310 resp = json.loads(response.read().decode())["output"] - self.log_response(resp) + if self.cfg.log_llm_chat_content: + logger.info(f"{LogColors.CYAN}Response:{resp}{LogColors.END}", tag="llm_messages") else: if self.use_azure: if json_mode: @@ -650,8 +647,11 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 presence_penalty=presence_penalty, ) if self.chat_stream: - self.log_response(stream=True) resp = "" + # TODO: with logger.config(stream=self.chat_stream): and add a `stream_start` flag to add timestamp for first message. + if self.cfg.log_llm_chat_content: + logger.info(f"{LogColors.CYAN}Response:{LogColors.END}", tag="llm_messages") + for chunk in response: content = ( chunk.choices[0].delta.content @@ -659,24 +659,28 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 else "" ) if self.cfg.log_llm_chat_content: - print(LogColors.CYAN + content, end="") + logger.info(LogColors.CYAN + content + LogColors.END, raw=True, tag="llm_messages") resp += content if len(chunk.choices) > 0 and chunk.choices[0].finish_reason is not None: finish_reason = chunk.choices[0].finish_reason + + if self.cfg.log_llm_chat_content: + logger.info("\n", raw=True, tag="llm_messages") + else: resp = response.choices[0].message.content finish_reason = response.choices[0].finish_reason - self.log_response(resp) + if self.cfg.log_llm_chat_content: + logger.info(f"{LogColors.CYAN}Response:{resp}{LogColors.END}", tag="llm_messages") if json_mode: json.loads(resp) if self.dump_chat_cache: self.cache.chat_set(input_content_json, resp) - # TODO: fail to use loguru adaptor due to stream response return resp, finish_reason def calculate_token_from_messages(self, messages: list[dict]) -> int: if self.use_llama2 or self.use_gcr_endpoint: - RDAgentLog().warning("num_tokens_from_messages() is not implemented for model llama2.") + logger.warning("num_tokens_from_messages() is not implemented for model llama2.") return 0 # TODO implement this function for llama2 if "gpt4" in self.chat_model or "gpt-4" in self.chat_model: diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py index 446b7026..01029ff0 100644 --- a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py +++ b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py @@ -18,7 +18,7 @@ ) from rdagent.components.loader.experiment_loader import FactorExperimentLoader from rdagent.core.conf import RD_AGENT_SETTINGS -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.prompts import Prompts from rdagent.oai.llm_utils import APIBackend, create_embedding_with_multiprocessing from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import ( @@ -27,7 +27,6 @@ document_process_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") - def classify_report_from_dict( report_dict: Mapping[str, str], vote_time: int = 1, @@ -69,7 +68,7 @@ def classify_report_from_dict( if isinstance(value, str): content = value else: - RDAgentLog().warning(f"输入格式不符合要求: {file_name}") + logger.warning(f"输入格式不符合要求: {file_name}") res_dict[file_name] = {"class": 0} continue @@ -101,7 +100,7 @@ def classify_report_from_dict( res = json.loads(res) vote_list.append(int(res["class"])) except json.JSONDecodeError: - RDAgentLog().warning(f"返回值无法解析: {file_name}") + logger.warning(f"返回值无法解析: {file_name}") res_dict[file_name] = {"class": 0} count_0 = vote_list.count(0) count_1 = vote_list.count(1) @@ -246,7 +245,7 @@ def extract_factors_from_report_dict( if int(value.get("class")) == 1: useful_report_dict[key] = report_dict[key] else: - RDAgentLog().warning(f"Invalid input format: {key}") + logger.warning(f"Invalid input format: {key}") final_report_factor_dict = {} # for file_name, content in useful_report_dict.items(): @@ -276,7 +275,7 @@ def extract_factors_from_report_dict( file_name = file_names[index] final_report_factor_dict.setdefault(file_name, {}) final_report_factor_dict[file_name] = result.get() - RDAgentLog().info(f"已经完成{len(final_report_factor_dict)}个报告的因子提取") + logger.info(f"已经完成{len(final_report_factor_dict)}个报告的因子提取") return final_report_factor_dict @@ -499,7 +498,7 @@ def __deduplicate_factor_dict(factor_dict: dict[str, dict[str, str]]) -> list[li kmeans_index_group = __kmeans_embeddings(embeddings=embeddings, k=k) if len(kmeans_index_group[0]) < RD_AGENT_SETTINGS.max_input_duplicate_factor_group: target_k = k - RDAgentLog().info(f"K-means group number: {k}") + logger.info(f"K-means group number: {k}") break factor_name_groups = [[factor_names[index] for index in index_group] for index_group in kmeans_index_group] diff --git a/rdagent/scenarios/qlib/task_generator/data.py b/rdagent/scenarios/qlib/task_generator/data.py index 2203b43d..4995bcf0 100644 --- a/rdagent/scenarios/qlib/task_generator/data.py +++ b/rdagent/scenarios/qlib/task_generator/data.py @@ -7,7 +7,7 @@ from rdagent.components.runner import CachedRunner from rdagent.components.runner.conf import RUNNER_SETTINGS -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.task_generator import TaskGenerator from rdagent.oai.llm_utils import md5_hash from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorExperiment @@ -15,7 +15,6 @@ DIRNAME = Path(__file__).absolute().resolve().parent DIRNAME_local = Path.cwd() -logger = RDAgentLog() # class QlibFactorExpWorkspace: diff --git a/rdagent/scenarios/qlib/task_generator/feedback.py b/rdagent/scenarios/qlib/task_generator/feedback.py index 8ce1e08e..339ab7f0 100644 --- a/rdagent/scenarios/qlib/task_generator/feedback.py +++ b/rdagent/scenarios/qlib/task_generator/feedback.py @@ -7,7 +7,7 @@ from jinja2 import Environment, StrictUndefined from rdagent.core.experiment import Experiment -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( Hypothesis, @@ -19,7 +19,6 @@ feedback_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml") DIRNAME = Path(__file__).absolute().resolve().parent -logger = RDAgentLog() class QlibFactorHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): diff --git a/rdagent/scenarios/qlib/task_generator/model.py b/rdagent/scenarios/qlib/task_generator/model.py index 47586924..88c7ac33 100644 --- a/rdagent/scenarios/qlib/task_generator/model.py +++ b/rdagent/scenarios/qlib/task_generator/model.py @@ -10,7 +10,7 @@ ) from rdagent.components.runner import CachedRunner from rdagent.components.runner.conf import RUNNER_SETTINGS -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger from rdagent.core.task_generator import TaskGenerator from rdagent.utils.env import QTDockerEnv @@ -72,7 +72,7 @@ def generate(self, exp: ModelExperiment) -> ModelExperiment: csv_path = exp.exp_ws.workspace_path / "qlib_res.csv" if not csv_path.exists(): - RDAgentLog().error(f"File {csv_path} does not exist.") + logger.error(f"File {csv_path} does not exist.") return None result = pd.read_csv(csv_path, index_col=0).iloc[:,0] diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index 8b1503b9..d90becf1 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -19,7 +19,7 @@ from pydantic import BaseModel from pydantic_settings import BaseSettings -from rdagent.core.log import RDAgentLog +from rdagent.log import rdagent_logger as logger ASpecificBaseModel = TypeVar("ASpecificBaseModel", bound=BaseModel) @@ -146,11 +146,11 @@ def prepare(self): """ client = docker.from_env() if self.conf.build_from_dockerfile and self.conf.dockerfile_folder_path.exists(): - RDAgentLog().info(f"Building the image from dockerfile: {self.conf.dockerfile_folder_path}") + logger.info(f"Building the image from dockerfile: {self.conf.dockerfile_folder_path}") image, logs = client.images.build( path=str(self.conf.dockerfile_folder_path), tag=self.conf.image, network_mode=self.conf.network ) - RDAgentLog().info(f"Finished building the image from dockerfile: {self.conf.dockerfile_folder_path}") + logger.info(f"Finished building the image from dockerfile: {self.conf.dockerfile_folder_path}") try: client.images.get(self.conf.image) except docker.errors.ImageNotFound: @@ -216,8 +216,8 @@ def prepare(self): super().prepare() qlib_data_path = next(iter(self.conf.extra_volumes.keys())) if not (Path(qlib_data_path) / "qlib_data" / "cn_data").exists(): - RDAgentLog().info("We are downloading!") + logger.info("We are downloading!") cmd = "python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn --interval 1d --delete_old False" self.run(entry=cmd) else: - RDAgentLog().info("Data already exists. Download skipped.") + logger.info("Data already exists. Download skipped.")