diff --git a/Makefile b/Makefile index 5fc4e80b..19ff05e0 100644 --- a/Makefile +++ b/Makefile @@ -100,7 +100,7 @@ toml-sort: $(PIPRUN) toml-sort --check pyproject.toml # Check lint with all linters. -lint: mypy ruff toml-sort +lint: black isort mypy ruff toml-sort # Run pre-commit with autofix against all files. pre-commit: diff --git a/constraints/3.8.txt b/constraints/3.8.txt index b1dc0ef5..7b9a455d 100644 --- a/constraints/3.8.txt +++ b/constraints/3.8.txt @@ -151,4 +151,4 @@ typing_extensions==4.9.0 tzdata==2023.4 urllib3==2.1.0 yarl==1.9.4 -zipp==3.17.0 +zipp==3.17.0 \ No newline at end of file diff --git a/rdagent/app/CI/run.py b/rdagent/app/CI/run.py index e73a18fb..d3671cc2 100644 --- a/rdagent/app/CI/run.py +++ b/rdagent/app/CI/run.py @@ -13,6 +13,16 @@ from typing import Any, Literal import tree_sitter_python +from rich import print +from rich.panel import Panel +from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn +from rich.prompt import Prompt +from rich.rule import Rule +from rich.syntax import Syntax +from rich.table import Table +from rich.text import Text +from tree_sitter import Language, Node, Parser + from rdagent.core.evolving_framework import ( Evaluator, EvoAgent, @@ -24,15 +34,6 @@ ) from rdagent.core.prompts import Prompts from rdagent.oai.llm_utils import APIBackend -from rich import print -from rich.panel import Panel -from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn -from rich.prompt import Prompt -from rich.rule import Rule -from rich.syntax import Syntax -from rich.table import Table -from rich.text import Text -from tree_sitter import Language, Node, Parser py_parser = Parser(Language(tree_sitter_python.language())) CI_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") @@ -355,7 +356,6 @@ def evaluate(self, evo: Repo, **kwargs: Any) -> CIFeedback: # noqa: ARG002 class MypyEvaluator(Evaluator): - def __init__(self, command: str | None = None) -> None: if command is None: 
self.command = "mypy . --pretty --no-error-summary --show-column-numbers" @@ -411,12 +411,10 @@ def evaluate(self, evo: Repo, **kwargs: Any) -> CIFeedback: # noqa: ARG002 class MultiEvaluator(Evaluator): - def __init__(self, *evaluators: Evaluator) -> None: self.evaluators = evaluators def evaluate(self, evo: Repo, **kwargs: Any) -> CIFeedback: - all_errors = defaultdict(list) for evaluator in self.evaluators: feedback: CIFeedback = evaluator.evaluate(evo, **kwargs) @@ -438,7 +436,6 @@ def evolve( # noqa: C901, PLR0912, PLR0915 knowledge_l: list[Knowledge] | None = None, # noqa: ARG002 **kwargs: Any, # noqa: ARG002 ) -> Repo: - @dataclass class CodeFixGroup: start_line: int diff --git a/rdagent/app/data mining/README.md b/rdagent/app/data mining/README.md new file mode 100644 index 00000000..45ca5c2a --- /dev/null +++ b/rdagent/app/data mining/README.md @@ -0,0 +1,5 @@ + + + +# TODO +If we have more efforts, include more scenario. \ No newline at end of file diff --git a/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py index 04efa2b6..f0fd5435 100644 --- a/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py +++ b/rdagent/app/factor_extraction_and_implementation/factor_extract_and_implement.py @@ -1,14 +1,20 @@ # %% from dotenv import load_dotenv -from rdagent.factor_implementation.CoSTEER import CoSTEERFG -from rdagent.factor_implementation.task_loader.pdf_loader import FactorImplementationTaskLoaderFromPDFfiles + +from rdagent.scenarios.qlib.factor_task_implementation import ( + COSTEERFG_QUANT_FACTOR_IMPLEMENTATION, +) +from rdagent.scenarios.qlib.factor_task_loader.pdf_loader import ( + FactorImplementationTaskLoaderFromPDFfiles, +) assert load_dotenv() def extract_factors_and_implement(report_file_path: str) -> None: factor_tasks = FactorImplementationTaskLoaderFromPDFfiles().load(report_file_path) - 
implementation_result = CoSTEERFG().generate(factor_tasks) + implementation_result = COSTEERFG_QUANT_FACTOR_IMPLEMENTATION().generate(factor_tasks) + # Qlib to run the implementation return implementation_result diff --git a/rdagent/app/model_implementation/eval.py b/rdagent/app/model_implementation/eval.py index d51bf15c..e9ef2301 100644 --- a/rdagent/app/model_implementation/eval.py +++ b/rdagent/app/model_implementation/eval.py @@ -17,7 +17,7 @@ # TODO: Align it with the benchmark framework after @wenjun's refine the evaluation part. # Currently, we just handcraft a workflow for fast evaluation. -mil = ModelImpLoader(DIRNAME.parent.parent / "model_implementation" / "benchmark" / "gt_code") +mil = ModelImpLoader(DIRNAME.parent.parent / "model_implementation" / "benchmark" / "gt_code") mie = ModelImpValEval() # Evaluation: diff --git a/rdagent/app/model_proposal/conf.py b/rdagent/app/model_proposal/conf.py new file mode 100644 index 00000000..f3455c61 --- /dev/null +++ b/rdagent/app/model_proposal/conf.py @@ -0,0 +1,12 @@ +from pydantic_settings import BaseSettings + + +class ModelPropSetting(BaseSettings): + """""" + + scen: str # a.b.c:XXXClass + # TODO: inital keywards should be included in the settings + ... 
+ + +MODEL_PROP_SETTING = ModelPropSetting() diff --git a/rdagent/app/model_proposal/run.py b/rdagent/app/model_proposal/run.py new file mode 100644 index 00000000..fad65bf0 --- /dev/null +++ b/rdagent/app/model_proposal/run.py @@ -0,0 +1,36 @@ +""" +TODO: Model Structure RD-Loop +TODO: move the following code to a new class: Model_RD_Agent +""" + +# import_from +from rdagent.app.model_proposal.conf import MODEL_PROP_SETTING +from rdagent.core.implementation import TaskGenerator +from rdagent.core.proposal import Belief2Task, BeliefSet, Imp2Feedback, Trace + +# load_from_cls_uri + + +scen = load_from_cls_uri(MODEL_PROP_SETTING.scen)() + +belief_gen = load_from_cls_uri(MODEL_PROP_SETTING.belief_gen)(scen) + +belief2task: Belief2Task = load_from_cls_uri(MODEL_PROP_SETTING.belief2task)() + +task_gen: TaskGenerator = load_from_cls_uri(MODEL_PROP_SETTING.task_gen)(scen) # for implementation + +imp2feedback: Imp2Feedback = load_from_cls_uri(MODEL_PROP_SETTING.imp2feedback)(scen) # for implementation + + +iter_n = MODEL_PROP_SETTING.iter_n + +trace = Trace() + +belief_set = BeliefSet() +for _ in range(iter_n): + belief = belief_gen.gen(trace) + task = belief2task.convert(belief) + imp = task_gen.gen(task) + imp.execute() + feedback = imp2feedback.summarize(imp) + trace.hist.append((belief, feedback)) diff --git a/rdagent/app/factor_implementation/eval_implement.py b/rdagent/app/quant_factor_benchmark/eval.py similarity index 70% rename from rdagent/app/factor_implementation/eval_implement.py rename to rdagent/app/quant_factor_benchmark/eval.py index 0ae577b8..0580e8cb 100644 --- a/rdagent/app/factor_implementation/eval_implement.py +++ b/rdagent/app/quant_factor_benchmark/eval.py @@ -1,7 +1,9 @@ -from rdagent.benchmark.conf import BenchmarkSettings +from rdagent.components.benchmark.conf import BenchmarkSettings +from rdagent.components.benchmark.eval_method import FactorImplementEval from rdagent.core.utils import import_class -from rdagent.benchmark.eval_method import 
FactorImplementEval -from rdagent.factor_implementation.task_loader.json_loader import FactorTestCaseLoaderFromJsonFile +from rdagent.scenarios.qlib.factor_task_loader.json_loader import ( + FactorTestCaseLoaderFromJsonFile, +) # 1.read the settings bs = BenchmarkSettings() diff --git a/rdagent/benchmark/conf.py b/rdagent/components/benchmark/conf.py similarity index 90% rename from rdagent/benchmark/conf.py rename to rdagent/components/benchmark/conf.py index bcbbc030..f854eec3 100644 --- a/rdagent/benchmark/conf.py +++ b/rdagent/components/benchmark/conf.py @@ -1,15 +1,16 @@ from dotenv import load_dotenv + load_dotenv(verbose=True, override=True) from dataclasses import field from pathlib import Path -from typing import Optional +from typing import Optional from pydantic_settings import BaseSettings DIRNAME = Path(__file__).absolute().resolve().parent -class BenchmarkSettings(BaseSettings): +class BenchmarkSettings(BaseSettings): ground_truth_dir: Path = DIRNAME / "ground_truth" bench_data_path: Path = DIRNAME / "example.json" @@ -22,4 +23,4 @@ class BenchmarkSettings(BaseSettings): default_factory=dict, ) # extra kwargs for the method to be tested except the task list - bench_result_path: Path = DIRNAME / "result" \ No newline at end of file + bench_result_path: Path = DIRNAME / "result" diff --git a/rdagent/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py similarity index 85% rename from rdagent/benchmark/eval_method.py rename to rdagent/components/benchmark/eval_method.py index 262d248b..8bc81012 100644 --- a/rdagent/benchmark/eval_method.py +++ b/rdagent/components/benchmark/eval_method.py @@ -1,27 +1,29 @@ +from collections import defaultdict from pathlib import Path from typing import List, Tuple, Union from tqdm import tqdm -from collections import defaultdict -from rdagent.factor_implementation.share_modules.factor_implementation_config import FACTOR_IMPLEMENT_SETTINGS -from rdagent.core.exception import ImplementRunException -from 
rdagent.core.task import ( - TaskImplementation, - TestCase, -) -from rdagent.factor_implementation.evolving.evaluators import ( + +from rdagent.components.task_implementation.factor_implementation.evolving.evaluators import ( FactorImplementationCorrelationEvaluator, + FactorImplementationEvaluator, FactorImplementationIndexEvaluator, FactorImplementationIndexFormatEvaluator, FactorImplementationMissingValuesEvaluator, FactorImplementationRowCountEvaluator, FactorImplementationSingleColumnEvaluator, FactorImplementationValuesEvaluator, - FactorImplementationEvaluator, ) +from rdagent.components.task_implementation.factor_implementation.evolving.factor import ( + FileBasedFactorImplementation, +) +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_config import ( + FACTOR_IMPLEMENT_SETTINGS, +) +from rdagent.core.exception import ImplementRunException from rdagent.core.implementation import TaskGenerator +from rdagent.core.task import TaskImplementation, TestCase from rdagent.core.utils import multiprocessing_wrapper -from rdagent.factor_implementation.evolving.factor import FileBasedFactorImplementation class BaseEval: @@ -109,14 +111,14 @@ def __init__( **kwargs, ): online_evaluator_l = [ - FactorImplementationSingleColumnEvaluator(), - FactorImplementationIndexFormatEvaluator(), - FactorImplementationRowCountEvaluator(), - FactorImplementationIndexEvaluator(), - FactorImplementationMissingValuesEvaluator(), - FactorImplementationValuesEvaluator(), - FactorImplementationCorrelationEvaluator(hard_check=False), - ] + FactorImplementationSingleColumnEvaluator(), + FactorImplementationIndexFormatEvaluator(), + FactorImplementationRowCountEvaluator(), + FactorImplementationIndexEvaluator(), + FactorImplementationMissingValuesEvaluator(), + FactorImplementationValuesEvaluator(), + FactorImplementationCorrelationEvaluator(hard_check=False), + ] super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs) 
self.test_round = test_round diff --git a/rdagent/benchmark/example.json b/rdagent/components/benchmark/example.json similarity index 100% rename from rdagent/benchmark/example.json rename to rdagent/components/benchmark/example.json diff --git a/rdagent/document_reader/document_reader.py b/rdagent/components/document_reader/document_reader.py similarity index 100% rename from rdagent/document_reader/document_reader.py rename to rdagent/components/document_reader/document_reader.py diff --git a/rdagent/components/idea_proposal/factor_proposal.py b/rdagent/components/idea_proposal/factor_proposal.py new file mode 100644 index 00000000..e69de29b diff --git a/rdagent/components/idea_proposal/model_proposal.py b/rdagent/components/idea_proposal/model_proposal.py new file mode 100644 index 00000000..e69de29b diff --git a/rdagent/knowledge_management/graph.py b/rdagent/components/knowledge_management/graph.py similarity index 99% rename from rdagent/knowledge_management/graph.py rename to rdagent/components/knowledge_management/graph.py index 59c7f366..c25f84e9 100644 --- a/rdagent/knowledge_management/graph.py +++ b/rdagent/components/knowledge_management/graph.py @@ -6,8 +6,13 @@ from pathlib import Path from typing import Any, NoReturn +from rdagent.components.knowledge_management.vector_base import ( + KnowledgeMetaData, + PDVectorBase, + VectorBase, + cosine, +) from rdagent.oai.llm_utils import APIBackend -from rdagent.knowledge_management.vector_base import KnowledgeMetaData, PDVectorBase, VectorBase, cosine Node = KnowledgeMetaData diff --git a/rdagent/knowledge_management/vector_base.py b/rdagent/components/knowledge_management/vector_base.py similarity index 100% rename from rdagent/knowledge_management/vector_base.py rename to rdagent/components/knowledge_management/vector_base.py index b377a740..9322a510 100644 --- a/rdagent/knowledge_management/vector_base.py +++ b/rdagent/components/knowledge_management/vector_base.py @@ -5,8 +5,8 @@ import pandas as pd 
from scipy.spatial.distance import cosine -from rdagent.oai.llm_utils import APIBackend from rdagent.core.log import RDAgentLog +from rdagent.oai.llm_utils import APIBackend class KnowledgeMetaData: diff --git a/rdagent/factor_implementation/CoSTEER.py b/rdagent/components/task_implementation/factor_implementation/CoSTEER.py similarity index 85% rename from rdagent/factor_implementation/CoSTEER.py rename to rdagent/components/task_implementation/factor_implementation/CoSTEER.py index 2519da6b..554193ba 100644 --- a/rdagent/factor_implementation/CoSTEER.py +++ b/rdagent/components/task_implementation/factor_implementation/CoSTEER.py @@ -1,23 +1,29 @@ import pickle from pathlib import Path from typing import List -from rdagent.core.implementation import TaskGenerator -from rdagent.core.task import TaskImplementation -from rdagent.factor_implementation.evolving.knowledge_management import FactorImplementationKnowledgeBaseV1 -from rdagent.factor_implementation.evolving.factor import FactorImplementTask, FactorEvovlingItem -from rdagent.factor_implementation.evolving.knowledge_management import ( + +from rdagent.components.task_implementation.factor_implementation.evolving.evaluators import ( + FactorImplementationEvaluatorV1, + FactorImplementationsMultiEvaluator, +) +from rdagent.components.task_implementation.factor_implementation.evolving.evolving_strategy import ( + FactorEvolvingStrategyWithGraph, +) +from rdagent.components.task_implementation.factor_implementation.evolving.factor import ( + FactorEvovlingItem, + FactorImplementTask, +) +from rdagent.components.task_implementation.factor_implementation.evolving.knowledge_management import ( FactorImplementationGraphKnowledgeBase, FactorImplementationGraphRAGStrategy, + FactorImplementationKnowledgeBaseV1, ) -from rdagent.factor_implementation.evolving.evolving_strategy import FactorEvolvingStrategyWithGraph -from rdagent.factor_implementation.evolving.evaluators import ( - FactorImplementationsMultiEvaluator, - 
FactorImplementationEvaluatorV1, -) -from rdagent.core.evolving_agent import RAGEvoAgent -from rdagent.factor_implementation.share_modules.factor_implementation_config import ( +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_config import ( FACTOR_IMPLEMENT_SETTINGS, ) +from rdagent.core.evolving_agent import RAGEvoAgent +from rdagent.core.implementation import TaskGenerator +from rdagent.core.task import TaskImplementation class CoSTEERFG(TaskGenerator): @@ -47,7 +53,6 @@ def __init__( self.evolving_version = 2 def load_or_init_knowledge_base(self, former_knowledge_base_path: Path = None, component_init_list: list = []): - if former_knowledge_base_path is not None and former_knowledge_base_path.exists(): factor_knowledge_base = pickle.load(open(former_knowledge_base_path, "rb")) if self.evolving_version == 1 and not isinstance( diff --git a/rdagent/factor_implementation/evolving/evaluators.py b/rdagent/components/task_implementation/factor_implementation/evolving/evaluators.py similarity index 98% rename from rdagent/factor_implementation/evolving/evaluators.py rename to rdagent/components/task_implementation/factor_implementation/evolving/evaluators.py index 11a3d330..7303de20 100644 --- a/rdagent/factor_implementation/evolving/evaluators.py +++ b/rdagent/components/task_implementation/factor_implementation/evolving/evaluators.py @@ -1,26 +1,27 @@ import json import re - from abc import abstractmethod -from typing import Tuple +from pathlib import Path +from typing import List, Tuple import pandas as pd from jinja2 import Template -from rdagent.oai.llm_utils import APIBackend -from rdagent.core.log import RDAgentLog -from rdagent.factor_implementation.evolving.evolving_strategy import FactorImplementTask, FactorEvovlingItem -from rdagent.core.task import ( - TaskImplementation, +from rdagent.components.task_implementation.factor_implementation.evolving.evolving_strategy import ( + FactorEvovlingItem, + 
FactorImplementTask, ) -from typing import List, Tuple -from rdagent.core.evolving_framework import QueriedKnowledge, Feedback +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_config import ( + FACTOR_IMPLEMENT_SETTINGS, +) +from rdagent.core.conf import RD_AGENT_SETTINGS from rdagent.core.evaluation import Evaluator +from rdagent.core.evolving_framework import Feedback, QueriedKnowledge +from rdagent.core.log import RDAgentLog from rdagent.core.prompts import Prompts -from rdagent.core.conf import RD_AGENT_SETTINGS -from rdagent.factor_implementation.share_modules.factor_implementation_config import FACTOR_IMPLEMENT_SETTINGS +from rdagent.core.task import TaskImplementation from rdagent.core.utils import multiprocessing_wrapper -from pathlib import Path +from rdagent.oai.llm_utils import APIBackend evaluate_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml") diff --git a/rdagent/factor_implementation/evolving/evolving_strategy.py b/rdagent/components/task_implementation/factor_implementation/evolving/evolving_strategy.py similarity index 95% rename from rdagent/factor_implementation/evolving/evolving_strategy.py rename to rdagent/components/task_implementation/factor_implementation/evolving/evolving_strategy.py index 71dde026..36a8ae9a 100644 --- a/rdagent/factor_implementation/evolving/evolving_strategy.py +++ b/rdagent/components/task_implementation/factor_implementation/evolving/evolving_strategy.py @@ -8,38 +8,30 @@ from jinja2 import Template -from rdagent.core.conf import RD_AGENT_SETTINGS -from rdagent.core.evolving_framework import EvolvingStrategy, QueriedKnowledge -from rdagent.oai.llm_utils import APIBackend -from rdagent.factor_implementation.share_modules.factor_implementation_config import ( +from rdagent.components.task_implementation.factor_implementation.evolving.factor import ( + FactorEvovlingItem, + FactorImplementTask, + FileBasedFactorImplementation, +) +from 
rdagent.components.task_implementation.factor_implementation.evolving.scheduler import ( + LLMSelect, + RandomSelect, +) +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_config import ( FACTOR_IMPLEMENT_SETTINGS, ) - -from rdagent.core.task import ( - TaskImplementation, +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_utils import ( + get_data_folder_intro, ) +from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.evolving_framework import EvolvingStrategy, QueriedKnowledge from rdagent.core.prompts import Prompts - -from pathlib import Path - -from rdagent.factor_implementation.evolving.scheduler import ( - RandomSelect, - LLMSelect, -) - -from rdagent.factor_implementation.share_modules.factor_implementation_utils import get_data_folder_intro -from rdagent.oai.llm_utils import APIBackend - +from rdagent.core.task import TaskImplementation from rdagent.core.utils import multiprocessing_wrapper - -from rdagent.factor_implementation.evolving.factor import ( - FactorImplementTask, - FactorEvovlingItem, - FileBasedFactorImplementation, -) +from rdagent.oai.llm_utils import APIBackend if TYPE_CHECKING: - from rdagent.factor_implementation.evolving.knowledge_management import ( + from rdagent.components.task_implementation.factor_implementation.evolving.knowledge_management import ( FactorImplementationQueriedKnowledge, FactorImplementationQueriedKnowledgeV1, ) @@ -220,7 +212,6 @@ def implement_one_factor( elif queried_knowledge is not None and target_factor_task_information in queried_knowledge.failed_task_info_set: return None else: - # 3. 
取出knowledge里面的经验数据(similar success、similar error、former_trace) queried_similar_component_knowledge = ( queried_knowledge.component_with_success_task[target_factor_task_information] @@ -262,7 +253,6 @@ def implement_one_factor( and len(queried_similar_error_knowledge_to_render) != 0 and len(queried_former_failed_knowledge_to_render) != 0 ): - error_summary_system_prompt = ( Template(implement_prompts["evolving_strategy_error_summary_v2_system"]) .render( diff --git a/rdagent/factor_implementation/evolving/factor.py b/rdagent/components/task_implementation/factor_implementation/evolving/factor.py similarity index 95% rename from rdagent/factor_implementation/evolving/factor.py rename to rdagent/components/task_implementation/factor_implementation/evolving/factor.py index f2cee55d..0f3cdbd5 100644 --- a/rdagent/factor_implementation/evolving/factor.py +++ b/rdagent/components/task_implementation/factor_implementation/evolving/factor.py @@ -1,32 +1,31 @@ from __future__ import annotations -from rdagent.factor_implementation.share_modules.factor_implementation_config import ( - FACTOR_IMPLEMENT_SETTINGS, -) - -from rdagent.core.task import ( - TaskImplementation, - BaseTask, - TestCase, -) -from rdagent.core.evolving_framework import EvolvableSubjects -from rdagent.core.log import RDAgentLog +import pickle +import subprocess +import uuid from pathlib import Path +from typing import Tuple, Union -from rdagent.oai.llm_utils import md5_hash +import pandas as pd +from filelock import FileLock +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_config import ( + FACTOR_IMPLEMENT_SETTINGS, +) +from rdagent.core.evolving_framework import EvolvableSubjects from rdagent.core.exception import ( CodeFormatException, NoOutputException, RuntimeErrorException, ) - -import pandas as pd -import uuid -import pickle -import subprocess -from typing import Tuple, Union -from filelock import FileLock +from rdagent.core.log import RDAgentLog 
+from rdagent.core.task import ( + BaseTask, + FBTaskImplementation, + TaskImplementation, + TestCase, +) +from rdagent.oai.llm_utils import md5_hash class FactorImplementTask(BaseTask): @@ -37,15 +36,12 @@ def __init__( factor_name, factor_description, factor_formulation, - factor_formulation_description: str = "", variables: dict = {}, resource: str = None, ) -> None: - # TODO: remove the useless factor_formulation_description self.factor_name = factor_name self.factor_description = factor_description self.factor_formulation = factor_formulation - self.factor_formulation_description = factor_formulation_description self.variables = variables self.factor_resources = resource @@ -53,7 +49,7 @@ def get_factor_information(self): return f"""factor_name: {self.factor_name} factor_description: {self.factor_description} factor_formulation: {self.factor_formulation} -factor_formulation_description: {self.factor_formulation_description}""" +variables: {str(self.variables)}""" @staticmethod def from_dict(dict): @@ -88,7 +84,7 @@ def __init__( self.corresponding_gt_implementations = corresponding_gt_implementations -class FileBasedFactorImplementation(TaskImplementation): +class FileBasedFactorImplementation(FBTaskImplementation): """ This class is used to implement a factor by writing the code to a file. Input data and output factor value are also written to files. 
@@ -131,6 +127,13 @@ def link_data_to_workspace(data_path: Path, workspace_path: Path): check=False, ) + def execute_desc(self): + raise NotImplementedError + + def prepare(self, *args, **kwargs): + # TODO move the prepare part code in execute into here + return super().prepare(*args, **kwargs) + def execute(self, store_result: bool = False) -> Tuple[str, pd.DataFrame]: """ execute the implementation and get the factor value by the following steps: diff --git a/rdagent/factor_implementation/evolving/knowledge_management.py b/rdagent/components/task_implementation/factor_implementation/evolving/knowledge_management.py similarity index 96% rename from rdagent/factor_implementation/evolving/knowledge_management.py rename to rdagent/components/task_implementation/factor_implementation/evolving/knowledge_management.py index d8437d4f..5d48d698 100644 --- a/rdagent/factor_implementation/evolving/knowledge_management.py +++ b/rdagent/components/task_implementation/factor_implementation/evolving/knowledge_management.py @@ -6,10 +6,23 @@ import re from itertools import combinations from pathlib import Path -from jinja2 import Template from typing import Union from jinja2 import Template + +from rdagent.components.knowledge_management.graph import ( + UndirectedGraph, + UndirectedNode, +) +from rdagent.components.task_implementation.factor_implementation.evolving.evaluators import ( + FactorImplementationSingleFeedback, +) +from rdagent.components.task_implementation.factor_implementation.evolving.evolving_strategy import ( + FactorImplementTask, +) +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_config import ( + FACTOR_IMPLEMENT_SETTINGS, +) from rdagent.core.evolving_framework import ( EvolvableSubjects, EvoStep, @@ -20,17 +33,10 @@ ) from rdagent.core.log import RDAgentLog from rdagent.core.prompts import Prompts -from rdagent.factor_implementation.evolving.evaluators import FactorImplementationSingleFeedback -from 
rdagent.core.task import ( - TaskImplementation, -) -from rdagent.factor_implementation.evolving.evolving_strategy import FactorImplementTask -from rdagent.core.prompts import Prompts -from rdagent.knowledge_management.graph import UndirectedGraph, UndirectedNode -from rdagent.oai.llm_utils import APIBackend, calculate_embedding_distance_between_str_list - -from rdagent.factor_implementation.share_modules.factor_implementation_config import ( - FACTOR_IMPLEMENT_SETTINGS, +from rdagent.core.task import TaskImplementation +from rdagent.oai.llm_utils import ( + APIBackend, + calculate_embedding_distance_between_str_list, ) @@ -147,9 +153,9 @@ def query( for target_factor_task in evo.target_factor_tasks: target_factor_task_information = target_factor_task.get_factor_information() if target_factor_task_information in self.knowledgebase.success_task_info_set: - queried_knowledge.success_task_to_knowledge_dict[target_factor_task_information] = ( - self.knowledgebase.implementation_trace[target_factor_task_information][-1] - ) + queried_knowledge.success_task_to_knowledge_dict[ + target_factor_task_information + ] = self.knowledgebase.implementation_trace[target_factor_task_information][-1] elif ( len( self.knowledgebase.implementation_trace.setdefault( @@ -161,12 +167,14 @@ def query( ): queried_knowledge.failed_task_info_set.add(target_factor_task_information) else: - queried_knowledge.working_task_to_former_failed_knowledge_dict[target_factor_task_information] = ( - self.knowledgebase.implementation_trace.setdefault( - target_factor_task_information, - [], - )[-v1_query_former_trace_limit:] - ) + queried_knowledge.working_task_to_former_failed_knowledge_dict[ + target_factor_task_information + ] = self.knowledgebase.implementation_trace.setdefault( + target_factor_task_information, + [], + )[ + -v1_query_former_trace_limit: + ] knowledge_base_success_task_list = list( self.knowledgebase.success_task_info_set, @@ -187,9 +195,9 @@ def query( )[-1] for index in 
similar_indexes ] - queried_knowledge.working_task_to_similar_successful_knowledge_dict[target_factor_task_information] = ( - similar_successful_knowledge - ) + queried_knowledge.working_task_to_similar_successful_knowledge_dict[ + target_factor_task_information + ] = similar_successful_knowledge return queried_knowledge @@ -310,6 +318,8 @@ def analyze_component( target_factor_task_information, ) -> list[UndirectedNode]: # Hardcode: certain component nodes all_component_nodes = self.knowledgebase.graph.get_all_nodes_by_label_list(["component"]) + if not len(all_component_nodes): + return [] all_component_content = "" for _, component_node in enumerate(all_component_nodes): all_component_content += f"{component_node.content}, \n" @@ -417,9 +427,9 @@ def former_trace_query( else: current_index += 1 - factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information] = ( - former_trace_knowledge[-v2_query_former_trace_limit:] - ) + factor_implementation_queried_graph_knowledge.former_traces[ + target_factor_task_information + ] = former_trace_knowledge[-v2_query_former_trace_limit:] else: factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information] = [] diff --git a/rdagent/factor_implementation/evolving/scheduler.py b/rdagent/components/task_implementation/factor_implementation/evolving/scheduler.py similarity index 89% rename from rdagent/factor_implementation/evolving/scheduler.py rename to rdagent/components/task_implementation/factor_implementation/evolving/scheduler.py index 5c3263f6..d27707dc 100644 --- a/rdagent/factor_implementation/evolving/scheduler.py +++ b/rdagent/components/task_implementation/factor_implementation/evolving/scheduler.py @@ -1,13 +1,19 @@ -from rdagent.oai.llm_utils import APIBackend -from jinja2 import Template import json -from rdagent.factor_implementation.share_modules.factor_implementation_utils import get_data_folder_intro -from rdagent.factor_implementation.evolving.factor 
import FactorEvovlingItem -from rdagent.core.prompts import Prompts -from rdagent.core.log import RDAgentLog -from rdagent.core.conf import RD_AGENT_SETTINGS from pathlib import Path +from jinja2 import Template + +from rdagent.components.task_implementation.factor_implementation.evolving.factor import ( + FactorEvovlingItem, +) +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_utils import ( + get_data_folder_intro, +) +from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.log import RDAgentLog +from rdagent.core.prompts import Prompts +from rdagent.oai.llm_utils import APIBackend + scheduler_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml") diff --git a/rdagent/factor_implementation/prompts.yaml b/rdagent/components/task_implementation/factor_implementation/prompts.yaml similarity index 100% rename from rdagent/factor_implementation/prompts.yaml rename to rdagent/components/task_implementation/factor_implementation/prompts.yaml diff --git a/rdagent/factor_implementation/share_modules/factor_implementation_config.py b/rdagent/components/task_implementation/factor_implementation/share_modules/factor_implementation_config.py similarity index 99% rename from rdagent/factor_implementation/share_modules/factor_implementation_config.py rename to rdagent/components/task_implementation/factor_implementation/share_modules/factor_implementation_config.py index 16607143..236275a3 100644 --- a/rdagent/factor_implementation/share_modules/factor_implementation_config.py +++ b/rdagent/components/task_implementation/factor_implementation/share_modules/factor_implementation_config.py @@ -1,9 +1,8 @@ from pathlib import Path +from typing import Literal, Union from pydantic_settings import BaseSettings -from typing import Literal, Union - SELECT_METHOD = Literal["random", "scheduler"] diff --git a/rdagent/factor_implementation/share_modules/factor_implementation_utils.py 
b/rdagent/components/task_implementation/factor_implementation/share_modules/factor_implementation_utils.py similarity index 85% rename from rdagent/factor_implementation/share_modules/factor_implementation_utils.py rename to rdagent/components/task_implementation/factor_implementation/share_modules/factor_implementation_utils.py index 8056cad7..b6620ecc 100644 --- a/rdagent/factor_implementation/share_modules/factor_implementation_utils.py +++ b/rdagent/components/task_implementation/factor_implementation/share_modules/factor_implementation_utils.py @@ -4,9 +4,13 @@ # render it with jinja from jinja2 import Template -from rdagent.factor_implementation.share_modules.factor_implementation_config import FACTOR_IMPLEMENT_SETTINGS -from rdagent.factor_implementation.evolving.factor import FactorImplementTask +from rdagent.components.task_implementation.factor_implementation.evolving.factor import ( + FactorImplementTask, +) +from rdagent.components.task_implementation.factor_implementation.share_modules.factor_implementation_config import ( + FACTOR_IMPLEMENT_SETTINGS, +) TPL = """ {{file_name}} diff --git a/rdagent/model_implementation/benchmark/eval.py b/rdagent/components/task_implementation/model_implementation/benchmark/eval.py similarity index 97% rename from rdagent/model_implementation/benchmark/eval.py rename to rdagent/components/task_implementation/model_implementation/benchmark/eval.py index 7d9289be..983159f9 100644 --- a/rdagent/model_implementation/benchmark/eval.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/eval.py @@ -27,7 +27,7 @@ class ModelImpValEval: Comparing the correlation of following sequences - modelA[init1](input1).hidden_out1, modelA[init1](input2).hidden_out1, ... - modelB[init1](input1).hidden_out1, modelB[init1](input2).hidden_out1, ... - + For each hidden output, we can calculate a correlation. The average correlation will be the metrics. 
""" @@ -59,12 +59,13 @@ def evaluate(self, gt: ModelTaskImpl, gen: ModelTaskImpl): # pearson correlation of each hidden output def norm(x): return (x - x.mean(axis=0)) / x.std(axis=0) + dim_corr = (norm(res_batch) * norm(gt_res_batch)).mean(axis=0) # the correlation of each hidden output # aggregate all the correlation avr_corr = dim_corr.mean() - # FIXME: + # FIXME: # It is too high(e.g. 0.944) . - # Check if it is not a good evaluation!! + # Check if it is not a good evaluation!! # Maybe all the same initial params will results in extreamly high correlation without regard to the model structure. return avr_corr diff --git a/rdagent/model_implementation/benchmark/gt_code/A-DGN.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/A-DGN.py similarity index 99% rename from rdagent/model_implementation/benchmark/gt_code/A-DGN.py rename to rdagent/components/task_implementation/model_implementation/benchmark/gt_code/A-DGN.py index 4a84af36..5c65bf12 100644 --- a/rdagent/model_implementation/benchmark/gt_code/A-DGN.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/A-DGN.py @@ -4,7 +4,6 @@ import torch from torch import Tensor from torch.nn import Parameter - from torch_geometric.nn.conv import GCNConv, MessagePassing from torch_geometric.nn.inits import zeros from torch_geometric.nn.resolver import activation_resolver diff --git a/rdagent/model_implementation/benchmark/gt_code/dirgnn.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py similarity index 92% rename from rdagent/model_implementation/benchmark/gt_code/dirgnn.py rename to rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py index b4430f9c..f5fa78b1 100644 --- a/rdagent/model_implementation/benchmark/gt_code/dirgnn.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py @@ -2,7 +2,6 @@ import torch from torch import Tensor - from 
torch_geometric.nn.conv import MessagePassing @@ -23,6 +22,7 @@ class DirGNNConv(torch.nn.Module): transformed root node features to the output. (default: :obj:`True`) """ + def __init__( self, conv: MessagePassing, @@ -37,10 +37,10 @@ def __init__( self.conv_in = copy.deepcopy(conv) self.conv_out = copy.deepcopy(conv) - if hasattr(conv, 'add_self_loops'): + if hasattr(conv, "add_self_loops"): self.conv_in.add_self_loops = False self.conv_out.add_self_loops = False - if hasattr(conv, 'root_weight'): + if hasattr(conv, "root_weight"): self.conv_in.root_weight = False self.conv_out.root_weight = False @@ -71,8 +71,9 @@ def forward(self, x: Tensor, edge_index: Tensor) -> Tensor: return out def __repr__(self) -> str: - return f'{self.__class__.__name__}({self.conv_in}, alpha={self.alpha})' - + return f"{self.__class__.__name__}({self.conv_in}, alpha={self.alpha})" + + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") @@ -82,4 +83,4 @@ def __repr__(self) -> str: output = model(node_features, edge_index) # Save output to a file - torch.save(output, "gt_output.pt") \ No newline at end of file + torch.save(output, "gt_output.pt") diff --git a/rdagent/model_implementation/benchmark/gt_code/gpsconv.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py similarity index 88% rename from rdagent/model_implementation/benchmark/gt_code/gpsconv.py rename to rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py index c4821cbc..e6ee0193 100644 --- a/rdagent/model_implementation/benchmark/gt_code/gpsconv.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py @@ -5,14 +5,10 @@ import torch.nn.functional as F from torch import Tensor from torch.nn import Dropout, Linear, Sequential - from torch_geometric.nn.attention import PerformerAttention from torch_geometric.nn.conv import MessagePassing from 
torch_geometric.nn.inits import reset -from torch_geometric.nn.resolver import ( - activation_resolver, - normalization_resolver, -) +from torch_geometric.nn.resolver import activation_resolver, normalization_resolver from torch_geometric.typing import Adj from torch_geometric.utils import to_dense_batch @@ -59,17 +55,18 @@ class GPSConv(torch.nn.Module): attn_kwargs (Dict[str, Any], optional): Arguments passed to the attention layer. (default: :obj:`None`) """ + def __init__( self, channels: int, conv: Optional[MessagePassing], heads: int = 1, dropout: float = 0.0, - act: str = 'relu', + act: str = "relu", act_kwargs: Optional[Dict[str, Any]] = None, - norm: Optional[str] = 'batch_norm', + norm: Optional[str] = "batch_norm", norm_kwargs: Optional[Dict[str, Any]] = None, - attn_type: str = 'multihead', + attn_type: str = "multihead", attn_kwargs: Optional[Dict[str, Any]] = None, ): super().__init__() @@ -81,14 +78,14 @@ def __init__( self.attn_type = attn_type attn_kwargs = attn_kwargs or {} - if attn_type == 'multihead': + if attn_type == "multihead": self.attn = torch.nn.MultiheadAttention( channels, heads, batch_first=True, **attn_kwargs, ) - elif attn_type == 'performer': + elif attn_type == "performer": self.attn = PerformerAttention( channels=channels, heads=heads, @@ -96,7 +93,7 @@ def __init__( ) else: # TODO: Support BigBird - raise ValueError(f'{attn_type} is not supported') + raise ValueError(f"{attn_type} is not supported") self.mlp = Sequential( Linear(channels, channels * 2), @@ -114,7 +111,7 @@ def __init__( self.norm_with_batch = False if self.norm1 is not None: signature = inspect.signature(self.norm1.forward) - self.norm_with_batch = 'batch' in signature.parameters + self.norm_with_batch = "batch" in signature.parameters def reset_parameters(self): r"""Resets all learnable parameters of the module.""" @@ -153,8 +150,7 @@ def forward( h, mask = to_dense_batch(x, batch) if isinstance(self.attn, torch.nn.MultiheadAttention): - h, _ = self.attn(h, h, 
h, key_padding_mask=~mask, - need_weights=False) + h, _ = self.attn(h, h, h, key_padding_mask=~mask, need_weights=False) elif isinstance(self.attn, PerformerAttention): h = self.attn(h, mask=mask) @@ -180,17 +176,20 @@ def forward( return out def __repr__(self) -> str: - return (f'{self.__class__.__name__}({self.channels}, ' - f'conv={self.conv}, heads={self.heads}, ' - f'attn_type={self.attn_type})') + return ( + f"{self.__class__.__name__}({self.channels}, " + f"conv={self.conv}, heads={self.heads}, " + f"attn_type={self.attn_type})" + ) + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") # Model instantiation and forward pass - model = GPSConv(channels=node_features.size(-1),conv=MessagePassing()) + model = GPSConv(channels=node_features.size(-1), conv=MessagePassing()) output = model(node_features, edge_index) # Save output to a file - torch.save(output, "gt_output.pt") \ No newline at end of file + torch.save(output, "gt_output.pt") diff --git a/rdagent/model_implementation/benchmark/gt_code/linkx.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py similarity index 87% rename from rdagent/model_implementation/benchmark/gt_code/linkx.py rename to rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py index e543adb1..eb3ddfc6 100644 --- a/rdagent/model_implementation/benchmark/gt_code/linkx.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py @@ -3,7 +3,6 @@ import torch from torch import Tensor from torch.nn import BatchNorm1d, Parameter - from torch_geometric.nn import inits from torch_geometric.nn.conv import MessagePassing from torch_geometric.nn.models import MLP @@ -13,7 +12,7 @@ class SparseLinear(MessagePassing): def __init__(self, in_channels: int, out_channels: int, bias: bool = True): - super().__init__(aggr='add') + super().__init__(aggr="add") self.in_channels = 
in_channels self.out_channels = out_channels @@ -21,13 +20,12 @@ def __init__(self, in_channels: int, out_channels: int, bias: bool = True): if bias: self.bias = Parameter(torch.empty(out_channels)) else: - self.register_parameter('bias', None) + self.register_parameter("bias", None) self.reset_parameters() def reset_parameters(self): - inits.kaiming_uniform(self.weight, fan=self.in_channels, - a=math.sqrt(5)) + inits.kaiming_uniform(self.weight, fan=self.in_channels, a=math.sqrt(5)) inits.uniform(self.in_channels, self.bias) def forward( @@ -36,8 +34,7 @@ def forward( edge_weight: OptTensor = None, ) -> Tensor: # propagate_type: (weight: Tensor, edge_weight: OptTensor) - out = self.propagate(edge_index, weight=self.weight, - edge_weight=edge_weight) + out = self.propagate(edge_index, weight=self.weight, edge_weight=edge_weight) if self.bias is not None: out = out + self.bias @@ -87,6 +84,7 @@ class LINKX(torch.nn.Module): dropout (float, optional): Dropout probability of each hidden embedding. 
(default: :obj:`0.0`) """ + def __init__( self, num_nodes: int, @@ -110,13 +108,13 @@ def __init__( if self.num_edge_layers > 1: self.edge_norm = BatchNorm1d(hidden_channels) channels = [hidden_channels] * num_edge_layers - self.edge_mlp = MLP(channels, dropout=0., act_first=True) + self.edge_mlp = MLP(channels, dropout=0.0, act_first=True) else: self.edge_norm = None self.edge_mlp = None channels = [in_channels] + [hidden_channels] * num_node_layers - self.node_mlp = MLP(channels, dropout=0., act_first=True) + self.node_mlp = MLP(channels, dropout=0.0, act_first=True) self.cat_lin1 = torch.nn.Linear(hidden_channels, hidden_channels) self.cat_lin2 = torch.nn.Linear(hidden_channels, hidden_channels) @@ -162,17 +160,26 @@ def forward( return self.final_mlp(out.relu_()) def __repr__(self) -> str: - return (f'{self.__class__.__name__}(num_nodes={self.num_nodes}, ' - f'in_channels={self.in_channels}, ' - f'out_channels={self.out_channels})') + return ( + f"{self.__class__.__name__}(num_nodes={self.num_nodes}, " + f"in_channels={self.in_channels}, " + f"out_channels={self.out_channels})" + ) + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") # Model instantiation and forward pass - model = LINKX(num_nodes=node_features.size(0), in_channels=node_features.size(1), hidden_channels=node_features.size(1), out_channels=node_features.size(1), num_layers=1) + model = LINKX( + num_nodes=node_features.size(0), + in_channels=node_features.size(1), + hidden_channels=node_features.size(1), + out_channels=node_features.size(1), + num_layers=1, + ) output = model(node_features, edge_index) # Save output to a file - torch.save(output, "gt_output.pt") \ No newline at end of file + torch.save(output, "gt_output.pt") diff --git a/rdagent/model_implementation/benchmark/gt_code/pmlp.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py similarity index 87% rename from 
rdagent/model_implementation/benchmark/gt_code/pmlp.py rename to rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py index 0129dfb6..ada1b7b0 100644 --- a/rdagent/model_implementation/benchmark/gt_code/pmlp.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py @@ -3,7 +3,6 @@ import torch import torch.nn.functional as F from torch import Tensor - from torch_geometric.nn import SimpleConv from torch_geometric.nn.dense.linear import Linear @@ -27,13 +26,14 @@ class PMLP(torch.nn.Module): bias (bool, optional): If set to :obj:`False`, the module will not learn additive biases. (default: :obj:`True`) """ + def __init__( self, in_channels: int, hidden_channels: int, out_channels: int, num_layers: int, - dropout: float = 0., + dropout: float = 0.0, norm: bool = True, bias: bool = True, ): @@ -61,7 +61,7 @@ def __init__( track_running_stats=False, ) - self.conv = SimpleConv(aggr='mean', combine_root='self_loop') + self.conv = SimpleConv(aggr="mean", combine_root="self_loop") self.reset_parameters() @@ -79,8 +79,7 @@ def forward( ) -> torch.Tensor: """""" # noqa: D419 if not self.training and edge_index is None: - raise ValueError(f"'edge_index' needs to be present during " - f"inference in '{self.__class__.__name__}'") + raise ValueError(f"'edge_index' needs to be present during " f"inference in '{self.__class__.__name__}'") for i in range(self.num_layers): x = x @ self.lins[i].weight.t() @@ -97,16 +96,21 @@ def forward( return x def __repr__(self) -> str: - return (f'{self.__class__.__name__}({self.in_channels}, ' - f'{self.out_channels}, num_layers={self.num_layers})') + return f"{self.__class__.__name__}({self.in_channels}, " f"{self.out_channels}, num_layers={self.num_layers})" + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") # Model instantiation and forward pass - model = PMLP(in_channels=node_features.size(-1), 
hidden_channels=node_features.size(-1), node_features.size(-1), num_layers=1) + model = PMLP( + in_channels=node_features.size(-1), + hidden_channels=node_features.size(-1), + out_channels=node_features.size(-1), + num_layers=1, + ) output = model(node_features, edge_index) # Save output to a file - torch.save(output, "gt_output.pt") \ No newline at end of file + torch.save(output, "gt_output.pt") diff --git a/rdagent/model_implementation/benchmark/gt_code/visnet.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py similarity index 92% rename from rdagent/model_implementation/benchmark/gt_code/visnet.py rename to rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py index e960cb7b..3118d641 100644 --- a/rdagent/model_implementation/benchmark/gt_code/visnet.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py @@ -5,7 +5,6 @@ from torch import Tensor from torch.autograd import grad from torch.nn import Embedding, LayerNorm, Linear, Parameter - from torch_geometric.nn import MessagePassing, radius_graph from torch_geometric.utils import scatter @@ -25,6 +24,7 @@ class CosineCutoff(torch.nn.Module): cutoff (float): A scalar that determines the point at which the cutoff is applied. """ + def __init__(self, cutoff: float) -> None: super().__init__() self.cutoff = cutoff @@ -60,6 +60,7 @@ class ExpNormalSmearing(torch.nn.Module): trainable (bool, optional): If set to :obj:`False`, the means and betas of the RBFs will not be trained. 
(default: :obj:`True`) """ + def __init__( self, cutoff: float = 5.0, @@ -76,18 +77,17 @@ def __init__( means, betas = self._initial_params() if trainable: - self.register_parameter('means', Parameter(means)) - self.register_parameter('betas', Parameter(betas)) + self.register_parameter("means", Parameter(means)) + self.register_parameter("betas", Parameter(betas)) else: - self.register_buffer('means', means) - self.register_buffer('betas', betas) + self.register_buffer("means", means) + self.register_buffer("betas", betas) def _initial_params(self) -> Tuple[Tensor, Tensor]: r"""Initializes the means and betas for the radial basis functions.""" start_value = torch.exp(torch.tensor(-self.cutoff)) means = torch.linspace(start_value, 1, self.num_rbf) - betas = torch.tensor([(2 / self.num_rbf * (1 - start_value))**-2] * - self.num_rbf) + betas = torch.tensor([(2 / self.num_rbf * (1 - start_value)) ** -2] * self.num_rbf) return means, betas def reset_parameters(self): @@ -103,8 +103,7 @@ def forward(self, dist: Tensor) -> Tensor: dist (torch.Tensor): A tensor of distances. """ dist = dist.unsqueeze(-1) - smeared_dist = self.cutoff_fn(dist) * (-self.betas * ( - (self.alpha * (-dist)).exp() - self.means)**2).exp() + smeared_dist = self.cutoff_fn(dist) * (-self.betas * ((self.alpha * (-dist)).exp() - self.means) ** 2).exp() return smeared_dist @@ -121,6 +120,7 @@ class Sphere(torch.nn.Module): lmax (int, optional): The maximum degree of the spherical harmonics. 
(default: :obj:`2`) """ + def __init__(self, lmax: int = 2) -> None: super().__init__() self.lmax = lmax @@ -168,16 +168,19 @@ def _spherical_harmonics( sh_2_4 = math.sqrt(3.0) / 2.0 * (z.pow(2) - x.pow(2)) if lmax == 2: - return torch.stack([ - sh_1_0, - sh_1_1, - sh_1_2, - sh_2_0, - sh_2_1, - sh_2_2, - sh_2_3, - sh_2_4, - ], dim=-1) + return torch.stack( + [ + sh_1_0, + sh_1_1, + sh_1_2, + sh_2_0, + sh_2_1, + sh_2_2, + sh_2_3, + sh_2_4, + ], + dim=-1, + ) raise ValueError(f"'lmax' needs to be 1 or 2 (got {lmax})") @@ -196,11 +199,12 @@ class VecLayerNorm(torch.nn.Module): norm_type (str, optional): The type of normalization to apply, one of :obj:`"max_min"` or :obj:`None`. (default: :obj:`"max_min"`) """ + def __init__( self, hidden_channels: int, trainable: bool, - norm_type: Optional[str] = 'max_min', + norm_type: Optional[str] = "max_min", ) -> None: super().__init__() @@ -210,9 +214,9 @@ def __init__( weight = torch.ones(self.hidden_channels) if trainable: - self.register_parameter('weight', Parameter(weight)) + self.register_parameter("weight", Parameter(weight)) else: - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) self.reset_parameters() @@ -258,19 +262,18 @@ def forward(self, vec: Tensor) -> Tensor: vec (torch.Tensor): The input tensor. 
""" if vec.size(1) == 3: - if self.norm_type == 'max_min': + if self.norm_type == "max_min": vec = self.max_min_norm(vec) return vec * self.weight.unsqueeze(0).unsqueeze(0) elif vec.size(1) == 8: vec1, vec2 = torch.split(vec, [3, 5], dim=1) - if self.norm_type == 'max_min': + if self.norm_type == "max_min": vec1 = self.max_min_norm(vec1) vec2 = self.max_min_norm(vec2) vec = torch.cat([vec1, vec2], dim=1) return vec * self.weight.unsqueeze(0).unsqueeze(0) - raise ValueError(f"'{self.__class__.__name__}' only support 3 or 8 " - f"channels (got {vec.size(1)})") + raise ValueError(f"'{self.__class__.__name__}' only support 3 or 8 " f"channels (got {vec.size(1)})") class Distance(torch.nn.Module): @@ -289,6 +292,7 @@ class Distance(torch.nn.Module): add_self_loops (bool, optional): If set to :obj:`False`, will not include self-loops. (default: :obj:`True`) """ + def __init__( self, cutoff: float, @@ -350,6 +354,7 @@ class NeighborEmbedding(MessagePassing): max_z (int, optional): The maximum atomic numbers. (default: :obj:`100`) """ + def __init__( self, hidden_channels: int, @@ -357,7 +362,7 @@ def __init__( cutoff: float, max_z: int = 100, ) -> None: - super().__init__(aggr='add') + super().__init__(aggr="add") self.embedding = Embedding(max_z, hidden_channels) self.distance_proj = Linear(num_rbf, hidden_channels) self.combine = Linear(hidden_channels * 2, hidden_channels) @@ -422,6 +427,7 @@ class EdgeEmbedding(torch.nn.Module): hidden_channels (int): The number of hidden channels in the node embeddings. """ + def __init__(self, num_rbf: int, hidden_channels: int) -> None: super().__init__() self.edge_proj = Linear(num_rbf, hidden_channels) @@ -472,6 +478,7 @@ class ViS_MP(MessagePassing): last_layer (bool, optional): Whether this is the last layer in the model. 
(default: :obj:`False`) """ + def __init__( self, num_heads: int, @@ -481,13 +488,14 @@ def __init__( trainable_vecnorm: bool, last_layer: bool = False, ) -> None: - super().__init__(aggr='add', node_dim=0) + super().__init__(aggr="add", node_dim=0) if hidden_channels % num_heads != 0: raise ValueError( f"The number of hidden channels (got {hidden_channels}) must " f"be evenly divisible by the number of attention heads " - f"(got {num_heads})") + f"(got {num_heads})" + ) self.num_heads = num_heads self.hidden_channels = hidden_channels @@ -599,42 +607,35 @@ def forward( dv = self.act(self.dv_proj(f_ij)) dv = dv.reshape(-1, self.num_heads, self.head_dim) - vec1, vec2, vec3 = torch.split(self.vec_proj(vec), - self.hidden_channels, dim=-1) + vec1, vec2, vec3 = torch.split(self.vec_proj(vec), self.hidden_channels, dim=-1) vec_dot = (vec1 * vec2).sum(dim=1) - x, vec_out = self.propagate(edge_index, q=q, k=k, v=v, dk=dk, dv=dv, - vec=vec, r_ij=r_ij, d_ij=d_ij) + x, vec_out = self.propagate(edge_index, q=q, k=k, v=v, dk=dk, dv=dv, vec=vec, r_ij=r_ij, d_ij=d_ij) o1, o2, o3 = torch.split(self.o_proj(x), self.hidden_channels, dim=1) dx = vec_dot * o2 + o3 dvec = vec3 * o1.unsqueeze(1) + vec_out if not self.last_layer: - df_ij = self.edge_updater(edge_index, vec=vec, d_ij=d_ij, - f_ij=f_ij) + df_ij = self.edge_updater(edge_index, vec=vec, d_ij=d_ij, f_ij=f_ij) return dx, dvec, df_ij else: return dx, dvec, None - def message(self, q_i: Tensor, k_j: Tensor, v_j: Tensor, vec_j: Tensor, - dk: Tensor, dv: Tensor, r_ij: Tensor, - d_ij: Tensor) -> Tuple[Tensor, Tensor]: - + def message( + self, q_i: Tensor, k_j: Tensor, v_j: Tensor, vec_j: Tensor, dk: Tensor, dv: Tensor, r_ij: Tensor, d_ij: Tensor + ) -> Tuple[Tensor, Tensor]: attn = (q_i * k_j * dk).sum(dim=-1) attn = self.attn_activation(attn) * self.cutoff(r_ij).unsqueeze(1) v_j = v_j * dv v_j = (v_j * attn.unsqueeze(2)).view(-1, self.hidden_channels) - s1, s2 = torch.split(self.act(self.s_proj(v_j)), self.hidden_channels, - 
dim=1) + s1, s2 = torch.split(self.act(self.s_proj(v_j)), self.hidden_channels, dim=1) vec_j = vec_j * s1.unsqueeze(1) + s2.unsqueeze(1) * d_ij.unsqueeze(2) return v_j, vec_j - def edge_update(self, vec_i: Tensor, vec_j: Tensor, d_ij: Tensor, - f_ij: Tensor) -> Tensor: - + def edge_update(self, vec_i: Tensor, vec_j: Tensor, d_ij: Tensor, f_ij: Tensor) -> Tensor: w1 = self.vector_rejection(self.w_trg_proj(vec_i), d_ij) w2 = self.vector_rejection(self.w_src_proj(vec_j), -d_ij) w_dot = (w1 * w2).sum(dim=1) @@ -673,6 +674,7 @@ class ViS_MP_Vertex(ViS_MP): last_layer (bool, optional): Whether this is the last layer in the model. (default: :obj:`False`) """ + def __init__( self, num_heads: int, @@ -682,8 +684,7 @@ def __init__( trainable_vecnorm: bool, last_layer: bool = False, ) -> None: - super().__init__(num_heads, hidden_channels, cutoff, vecnorm_type, - trainable_vecnorm, last_layer) + super().__init__(num_heads, hidden_channels, cutoff, vecnorm_type, trainable_vecnorm, last_layer) if not self.last_layer: self.f_proj = Linear(hidden_channels, hidden_channels * 2) @@ -697,14 +698,12 @@ def reset_parameters(self): super().reset_parameters() if not self.last_layer: - if hasattr(self, 't_src_proj'): + if hasattr(self, "t_src_proj"): torch.nn.init.xavier_uniform_(self.t_src_proj.weight) - if hasattr(self, 't_trg_proj'): + if hasattr(self, "t_trg_proj"): torch.nn.init.xavier_uniform_(self.t_trg_proj.weight) - def edge_update(self, vec_i: Tensor, vec_j: Tensor, d_ij: Tensor, - f_ij: Tensor) -> Tensor: - + def edge_update(self, vec_i: Tensor, vec_j: Tensor, d_ij: Tensor, f_ij: Tensor) -> Tensor: w1 = self.vector_rejection(self.w_trg_proj(vec_i), d_ij) w2 = self.vector_rejection(self.w_src_proj(vec_j), -d_ij) w_dot = (w1 * w2).sum(dim=1) @@ -713,8 +712,7 @@ def edge_update(self, vec_i: Tensor, vec_j: Tensor, d_ij: Tensor, t2 = self.vector_rejection(self.t_src_proj(vec_i), -d_ij) t_dot = (t1 * t2).sum(dim=1) - f1, f2 = torch.split(self.act(self.f_proj(f_ij)), 
self.hidden_channels, - dim=-1) + f1, f2 = torch.split(self.act(self.f_proj(f_ij)), self.hidden_channels, dim=-1) return f1 * w_dot + f2 * t_dot @@ -750,6 +748,7 @@ class ViSNetBlock(torch.nn.Module): vertex (bool, optional): Whether to use vertex geometric features. (default: :obj:`False`) """ + def __init__( self, lmax: int = 1, @@ -782,10 +781,8 @@ def __init__( self.embedding = Embedding(max_z, hidden_channels) self.distance = Distance(cutoff, max_num_neighbors=max_num_neighbors) self.sphere = Sphere(lmax=lmax) - self.distance_expansion = ExpNormalSmearing(cutoff, num_rbf, - trainable_rbf) - self.neighbor_embedding = NeighborEmbedding(hidden_channels, num_rbf, - cutoff, max_z) + self.distance_expansion = ExpNormalSmearing(cutoff, num_rbf, trainable_rbf) + self.neighbor_embedding = NeighborEmbedding(hidden_channels, num_rbf, cutoff, max_z) self.edge_embedding = EdgeEmbedding(num_rbf, hidden_channels) self.vis_mp_layers = torch.nn.ModuleList() @@ -800,8 +797,7 @@ def __init__( for _ in range(num_layers - 1): layer = vis_mp_class(last_layer=False, **vis_mp_kwargs) self.vis_mp_layers.append(layer) - self.vis_mp_layers.append( - vis_mp_class(last_layer=True, **vis_mp_kwargs)) + self.vis_mp_layers.append(vis_mp_class(last_layer=True, **vis_mp_kwargs)) self.out_norm = LayerNorm(hidden_channels) self.vec_out_norm = VecLayerNorm( @@ -845,23 +841,19 @@ def forward( edge_index, edge_weight, edge_vec = self.distance(pos, batch) edge_attr = self.distance_expansion(edge_weight) mask = edge_index[0] != edge_index[1] - edge_vec[mask] = edge_vec[mask] / torch.norm(edge_vec[mask], - dim=1).unsqueeze(1) + edge_vec[mask] = edge_vec[mask] / torch.norm(edge_vec[mask], dim=1).unsqueeze(1) edge_vec = self.sphere(edge_vec) x = self.neighbor_embedding(z, x, edge_index, edge_weight, edge_attr) - vec = torch.zeros(x.size(0), ((self.lmax + 1)**2) - 1, x.size(1), - dtype=x.dtype, device=x.device) + vec = torch.zeros(x.size(0), ((self.lmax + 1) ** 2) - 1, x.size(1), dtype=x.dtype, 
device=x.device) edge_attr = self.edge_embedding(edge_index, edge_attr, x) for attn in self.vis_mp_layers[:-1]: - dx, dvec, dedge_attr = attn(x, vec, edge_index, edge_weight, - edge_attr, edge_vec) + dx, dvec, dedge_attr = attn(x, vec, edge_index, edge_weight, edge_attr, edge_vec) x = x + dx vec = vec + dvec edge_attr = edge_attr + dedge_attr - dx, dvec, _ = self.vis_mp_layers[-1](x, vec, edge_index, edge_weight, - edge_attr, edge_vec) + dx, dvec, _ = self.vis_mp_layers[-1](x, vec, edge_index, edge_weight, edge_attr, edge_vec) x = x + dx vec = vec + dvec @@ -888,6 +880,7 @@ class GatedEquivariantBlock(torch.nn.Module): activation function to the output node features. (default: obj:`False`) """ + def __init__( self, hidden_channels: int, @@ -952,21 +945,24 @@ class EquivariantScalar(torch.nn.Module): hidden_channels (int): The number of hidden channels in the node embeddings. """ + def __init__(self, hidden_channels: int) -> None: super().__init__() - self.output_network = torch.nn.ModuleList([ - GatedEquivariantBlock( - hidden_channels, - hidden_channels // 2, - scalar_activation=True, - ), - GatedEquivariantBlock( - hidden_channels // 2, - 1, - scalar_activation=False, - ), - ]) + self.output_network = torch.nn.ModuleList( + [ + GatedEquivariantBlock( + hidden_channels, + hidden_channels // 2, + scalar_activation=True, + ), + GatedEquivariantBlock( + hidden_channels // 2, + 1, + scalar_activation=False, + ), + ] + ) self.reset_parameters() @@ -1000,6 +996,7 @@ class Atomref(torch.nn.Module): max_z (int, optional): The maximum atomic numbers. 
(default: :obj:`100`) """ + def __init__( self, atomref: Optional[Tensor] = None, @@ -1015,7 +1012,7 @@ def __init__( if atomref.ndim == 1: atomref = atomref.view(-1, 1) - self.register_buffer('initial_atomref', atomref) + self.register_buffer("initial_atomref", atomref) self.atomref = Embedding(len(atomref), 1) self.reset_parameters() @@ -1076,6 +1073,7 @@ class ViSNet(torch.nn.Module): derivative (bool, optional): Whether to compute the derivative of the output with respect to the positions. (default: :obj:`False`) """ + def __init__( self, lmax: int = 1, @@ -1118,8 +1116,8 @@ def __init__( self.reduce_op = reduce_op self.derivative = derivative - self.register_buffer('mean', torch.tensor(mean)) - self.register_buffer('std', torch.tensor(std)) + self.register_buffer("mean", torch.tensor(mean)) + self.register_buffer("std", torch.tensor(std)) self.reset_parameters() @@ -1172,12 +1170,12 @@ def forward( retain_graph=True, )[0] if dy is None: - raise RuntimeError( - "Autograd returned None for the force prediction.") + raise RuntimeError("Autograd returned None for the force prediction.") return y, -dy return y, None + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") @@ -1187,4 +1185,4 @@ def forward( output = model(node_features, edge_index) # Save output to a file - torch.save(output, "gt_output.pt") \ No newline at end of file + torch.save(output, "gt_output.pt") diff --git a/rdagent/model_implementation/conf.py b/rdagent/components/task_implementation/model_implementation/conf.py similarity index 81% rename from rdagent/model_implementation/conf.py rename to rdagent/components/task_implementation/model_implementation/conf.py index 061f5b9d..af504aa6 100644 --- a/rdagent/model_implementation/conf.py +++ b/rdagent/components/task_implementation/model_implementation/conf.py @@ -1,10 +1,13 @@ from pathlib import Path + from pydantic_settings import BaseSettings + class ModelImplSettings(BaseSettings): 
workspace_path: Path = Path("./git_ignore_folder/model_imp_workspace/") # Added type annotation for work_space - + class Config: - env_prefix = 'MODEL_IMPL_' # Use MODEL_IMPL_ as prefix for environment variables + env_prefix = "MODEL_IMPL_" # Use MODEL_IMPL_ as prefix for environment variables + MODEL_IMPL_SETTINGS = ModelImplSettings() diff --git a/rdagent/model_implementation/evaluator.py b/rdagent/components/task_implementation/model_implementation/evaluator.py similarity index 87% rename from rdagent/model_implementation/evaluator.py rename to rdagent/components/task_implementation/model_implementation/evaluator.py index 678d87a3..9a1830fa 100644 --- a/rdagent/model_implementation/evaluator.py +++ b/rdagent/components/task_implementation/model_implementation/evaluator.py @@ -1,5 +1,5 @@ -import torch import numpy as np +import torch def shape_evaluator(target, prediction): @@ -41,12 +41,8 @@ def value_evaluator(target, prediction): ) ] # Reshape both tensors to the determined shape - target = target.reshape( - *tar_shape, *(1,) * (max(len(tar_shape), len(pre_shape)) - len(tar_shape)) - ) - prediction = prediction.reshape( - *pre_shape, *(1,) * (max(len(tar_shape), len(pre_shape)) - len(pre_shape)) - ) + target = target.reshape(*tar_shape, *(1,) * (max(len(tar_shape), len(pre_shape)) - len(tar_shape))) + prediction = prediction.reshape(*pre_shape, *(1,) * (max(len(tar_shape), len(pre_shape)) - len(pre_shape))) target_padded = reshape_tensor(target, dims) prediction_padded = reshape_tensor(prediction, dims) diff --git a/rdagent/model_implementation/gt_code.py b/rdagent/components/task_implementation/model_implementation/gt_code.py similarity index 100% rename from rdagent/model_implementation/gt_code.py rename to rdagent/components/task_implementation/model_implementation/gt_code.py index b8329fbf..988273a3 100644 --- a/rdagent/model_implementation/gt_code.py +++ b/rdagent/components/task_implementation/model_implementation/gt_code.py @@ -2,13 +2,13 @@ This is 
just an exmaple. It will be replaced wtih a list of ground truth tasks. """ + import math from typing import Any, Callable, Dict, Optional, Union import torch from torch import Tensor from torch.nn import Parameter - from torch_geometric.nn.conv import GCNConv, MessagePassing from torch_geometric.nn.inits import zeros from torch_geometric.nn.resolver import activation_resolver diff --git a/rdagent/model_implementation/main.py b/rdagent/components/task_implementation/model_implementation/main.py similarity index 98% rename from rdagent/model_implementation/main.py rename to rdagent/components/task_implementation/model_implementation/main.py index 760deef6..88466ba0 100644 --- a/rdagent/model_implementation/main.py +++ b/rdagent/components/task_implementation/model_implementation/main.py @@ -2,13 +2,15 @@ This file will be removed in the future and replaced by - rdagent/app/model_implementation/eval.py """ -from dotenv import load_dotenv -from rdagent.oai.llm_utils import APIBackend + +import os # randomly generate a input graph, node_feature and edge_index # 1000 nodes, 128 dim node feature, 2000 edges import torch -import os +from dotenv import load_dotenv + +from rdagent.oai.llm_utils import APIBackend assert load_dotenv() formula_info = { @@ -34,9 +36,7 @@ formula_info["variables"], ) -resp = APIBackend(use_chat_cache=False).build_messages_and_create_chat_completion( - user_prompt, system_prompt -) +resp = APIBackend(use_chat_cache=False).build_messages_and_create_chat_completion(user_prompt, system_prompt) print(resp) @@ -48,7 +48,6 @@ average_shape_eval = [] average_value_eval = [] for test_mode in ["zeros", "ones", "randn"]: - if test_mode == "zeros": node_feature = torch.zeros(1000, 128) elif test_mode == "ones": diff --git a/rdagent/model_implementation/one_shot/__init__.py b/rdagent/components/task_implementation/model_implementation/one_shot/__init__.py similarity index 86% rename from rdagent/model_implementation/one_shot/__init__.py rename to 
rdagent/components/task_implementation/model_implementation/one_shot/__init__.py index 2e02fdb9..87f3d9e4 100644 --- a/rdagent/model_implementation/one_shot/__init__.py +++ b/rdagent/components/task_implementation/model_implementation/one_shot/__init__.py @@ -1,18 +1,21 @@ import re +from pathlib import Path from typing import Sequence -from rdagent.oai.llm_utils import APIBackend from jinja2 import Template + +from rdagent.components.task_implementation.model_implementation.task import ( + ModelImplTask, + ModelTaskImpl, +) from rdagent.core.implementation import TaskGenerator from rdagent.core.prompts import Prompts -from rdagent.model_implementation.task import ModelImplTask, ModelTaskImpl +from rdagent.oai.llm_utils import APIBackend -from pathlib import Path DIRNAME = Path(__file__).absolute().resolve().parent class ModelTaskGen(TaskGenerator): - def generate(self, task_l: Sequence[ModelImplTask]) -> Sequence[ModelTaskImpl]: mti_l = [] for t in task_l: @@ -28,13 +31,11 @@ def generate(self, task_l: Sequence[ModelImplTask]) -> Sequence[ModelTaskImpl]: description=t.description, formulation=t.formulation, variables=t.variables, - execute_desc=mti.execute_desc() + execute_desc=mti.execute_desc(), ) system_prompt = sys_prompt_tpl.render() - resp = APIBackend().build_messages_and_create_chat_completion( - user_prompt, system_prompt - ) + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt) # Extract the code part from the response match = re.search(r".*```[Pp]ython\n(.*)\n```.*", resp, re.DOTALL) diff --git a/rdagent/model_implementation/one_shot/prompt.yaml b/rdagent/components/task_implementation/model_implementation/one_shot/prompt.yaml similarity index 100% rename from rdagent/model_implementation/one_shot/prompt.yaml rename to rdagent/components/task_implementation/model_implementation/one_shot/prompt.yaml diff --git a/rdagent/model_implementation/task.py b/rdagent/components/task_implementation/model_implementation/task.py 
similarity index 93% rename from rdagent/model_implementation/task.py rename to rdagent/components/task_implementation/model_implementation/task.py index 0a69aee4..0cb81070 100644 --- a/rdagent/model_implementation/task.py +++ b/rdagent/components/task_implementation/model_implementation/task.py @@ -1,10 +1,20 @@ -import torch -from pathlib import Path import uuid +from pathlib import Path from typing import Dict, Optional, Sequence + +import torch + +from rdagent.components.task_implementation.model_implementation.conf import ( + MODEL_IMPL_SETTINGS, +) from rdagent.core.exception import CodeFormatException -from rdagent.core.task import BaseTask, FBTaskImplementation, ImpLoader, TaskImplementation, TaskLoader -from rdagent.model_implementation.conf import MODEL_IMPL_SETTINGS +from rdagent.core.task import ( + BaseTask, + FBTaskImplementation, + ImpLoader, + TaskImplementation, + TaskLoader, +) from rdagent.utils import get_module_by_module_path @@ -15,7 +25,9 @@ class ModelImplTask(BaseTask): formulation: str variables: Dict[str, str] # map the variable name to the variable description - def __init__(self, name: str, description: str, formulation: str, variables: Dict[str, str], key: Optional[str] = None) -> None: + def __init__( + self, name: str, description: str, formulation: str, variables: Dict[str, str], key: Optional[str] = None + ) -> None: """ Parameters @@ -76,7 +88,7 @@ def load(self, *argT, **kwargs) -> Sequence[ModelImplTask]: return [ModelImplTask(**formula_info)] -class ModelTaskImpl(FBTaskImplementation): +class ModelTaskImpl(TaskImplementation): """ It is a Pytorch model implementation task; All the things are placed in a folder. 
@@ -88,13 +100,14 @@ class ModelTaskImpl(FBTaskImplementation): - the `model.py` that contains a variable named `model_cls` which indicates the implemented model structure - `model_cls` is a instance of `torch.nn.Module`; - + We'll import the model in the implementation in file `model.py` after setting the cwd into the directory - from model import model_cls - initialize the model by initializing it `model_cls(input_dim=INPUT_DIM)` - And then verify the modle. """ + def __init__(self, target_task: BaseTask) -> None: super().__init__(target_task) self.path = None @@ -114,7 +127,7 @@ def execute(self, data=None, config: dict = {}): model_cls = mod.model_cls except AttributeError: raise CodeFormatException("The model_cls is not implemented in the model.py") - # model_init = + # model_init = assert isinstance(data, tuple) node_feature, _ = data @@ -147,6 +160,7 @@ class XXXLayer(torch.nn.Module): - And then verify the model by comparing the output tensors by feeding specific input tensor. """ + class ModelImpLoader(ImpLoader[ModelImplTask, ModelTaskImpl]): def __init__(self, path: Path) -> None: self.path = Path(path) diff --git a/rdagent/components/task_loader/__init__.py b/rdagent/components/task_loader/__init__.py new file mode 100644 index 00000000..6646680b --- /dev/null +++ b/rdagent/components/task_loader/__init__.py @@ -0,0 +1,9 @@ +from rdagent.core.task import TaskLoader + + +class FactorTaskLoader(TaskLoader): + pass + + +class ModelTaskLoader(TaskLoader): + pass diff --git a/rdagent/core/evaluation.py b/rdagent/core/evaluation.py index de9a46ce..2b20e8ab 100644 --- a/rdagent/core/evaluation.py +++ b/rdagent/core/evaluation.py @@ -1,8 +1,7 @@ from abc import ABC, abstractmethod -from rdagent.core.task import ( - TaskImplementation, - BaseTask, -) + +from rdagent.core.task import BaseTask, TaskImplementation + class Evaluator(ABC): @abstractmethod diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index 30a8f729..855d7e40 100644 --- 
a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -1,9 +1,11 @@ -from rdagent.core.evaluation import Evaluator -from rdagent.core.evolving_framework import Feedback, EvolvableSubjects, EvoStep -from tqdm import tqdm from abc import ABC, abstractmethod from typing import Any +from tqdm import tqdm + +from rdagent.core.evaluation import Evaluator +from rdagent.core.evolving_framework import EvolvableSubjects, EvoStep, Feedback + class EvoAgent(ABC): def __init__(self, max_loop, evolving_strategy) -> None: @@ -30,7 +32,6 @@ def multistep_evolve( with_feedback: bool = True, knowledge_self_gen: bool = False, ) -> EvolvableSubjects: - for _ in tqdm(range(self.max_loop), "Implementing factors"): # 1. knowledge self-evolving if knowledge_self_gen and self.rag is not None: diff --git a/rdagent/core/evolving_framework.py b/rdagent/core/evolving_framework.py index 06486a65..7b586a4b 100644 --- a/rdagent/core/evolving_framework.py +++ b/rdagent/core/evolving_framework.py @@ -33,7 +33,8 @@ def clone(self) -> EvolvableSubjects: return copy.deepcopy(self) -class QlibEvolvableSubjects(EvolvableSubjects): ... +class QlibEvolvableSubjects(EvolvableSubjects): + ... 
@dataclass diff --git a/rdagent/core/implementation.py b/rdagent/core/implementation.py index ada0da65..be08dc42 100644 --- a/rdagent/core/implementation.py +++ b/rdagent/core/implementation.py @@ -1,10 +1,8 @@ from abc import ABC, abstractmethod from typing import List, Sequence -from rdagent.core.task import ( - BaseTask, - TaskImplementation, -) +from rdagent.core.task import BaseTask, TaskImplementation + class TaskGenerator(ABC): @abstractmethod @@ -28,5 +26,3 @@ def collect_feedback(self, feedback_obj_l: List[object]): feedback_obj_l : List[object] """ - - diff --git a/rdagent/core/prompts.py b/rdagent/core/prompts.py index 32ab9046..19f9f63b 100644 --- a/rdagent/core/prompts.py +++ b/rdagent/core/prompts.py @@ -2,6 +2,7 @@ from typing import Dict import yaml + from rdagent.core.utils import SingletonBaseClass diff --git a/rdagent/core/proposal/__init__.py b/rdagent/core/proposal/__init__.py new file mode 100644 index 00000000..43125809 --- /dev/null +++ b/rdagent/core/proposal/__init__.py @@ -0,0 +1,95 @@ +""" + +""" + +from typing import Tuple + +from rdagent.core.task import BaseTask, TaskLoader + +# class data_ana: XXX + + +class Belief: + """ + TODO: We may have better name for it. + + Name Candidates: + - Hypothesis + """ + + # source: data_ana | model_nan = None + + +# Origin(path of repo/data/feedback) => view/summarization => generated Belief + + +class Scenario: + def get_repo_path(self): + """codebase""" + + def get_data(self): + """ "data info""" + + def get_env(self): + """env description""" + + +class Trace: + scen: Scenario + hist: list[Tuple[Belief, Feedback]] + + +class BeliefGen: + def __init__(self, scen: Scenario): + self.scen = scen + + def gen(self, trace: Trace) -> Belief: + # def gen(self, scenario_desc: str, ) -> Belief: + """ + Motivation of the variable `scenario_desc`: + - Mocking a data-scientist is observing the scenario. 
 + + scenario_desc may include: + - data observation: + - Original or derivative + - Task information: + """ + + +class BeliefSet: + """ + # drop, append + + belief_imp: list[float] | None # importance of each belief + failed_belief or success belief + """ + + belief_l: list[Belief] + feedbacks: Dict[Tuple[Belief, Scenario], BeliefFeedback] + + +class Belief2Task(TaskLoader): + """ + [Abstract description => concrete description] => Code implement + """ + + def convert(self, bs: BeliefSet) -> BaseTask: + """Connect the idea proposal to implementation""" + ... + + +class BeliefFeedback: + ... + + +# Boolean, Reason, Confidence, etc. + + +class Imp2Feedback: + """ "Generated(summarize) feedback from **Executed** Implementation""" + + def summarize(self, ti: TaskImplementation) -> BeliefFeedback: + """ + The `ti` should be executed and the results should be included. + For example: `mlflow` of Qlib will be included. + """ diff --git a/rdagent/core/task.py b/rdagent/core/task.py index bb2d758d..95e8f4a7 100644 --- a/rdagent/core/task.py +++ b/rdagent/core/task.py @@ -1,7 +1,9 @@ from abc import ABC, abstractmethod from pathlib import Path from typing import Generic, Optional, Sequence, Tuple, TypeVar + import pandas as pd + """ This file contains the all the data class for rdagent task. """ @@ -13,11 +15,11 @@ class BaseTask(ABC): # I think the task version applies to the base class.
pass + ASpecificTask = TypeVar("ASpecificTask", bound=BaseTask) class TaskImplementation(ABC, Generic[ASpecificTask]): - def __init__(self, target_task: ASpecificTask) -> None: self.target_task = target_task @@ -46,7 +48,6 @@ def execute_desc(self): class ImpLoader(ABC, Generic[ASpecificTask, ASpecificTaskImp]): - @abstractmethod def load(self, task: ASpecificTask) -> ASpecificTaskImp: raise NotImplementedError("load method is not implemented.") @@ -55,7 +56,7 @@ def load(self, task: ASpecificTask) -> ASpecificTaskImp: class FBTaskImplementation(TaskImplementation): """ File-based task implementation - + The implemented task will be a folder which contains related elements. - Data - Code Implementation @@ -73,6 +74,7 @@ def run_pipline(self, **files: str): self.execute() """ + # TODO: # FileBasedFactorImplementation should inherient from it. # Why not directly reuse FileBasedFactorImplementation. @@ -115,7 +117,6 @@ def get_files(self) -> list[Path]: class TestCase: - def __init__( self, target_task: list[BaseTask] = [], @@ -126,7 +127,6 @@ def __init__( class TaskLoader: - @abstractmethod def load(self, *args, **kwargs) -> Sequence[BaseTask]: raise NotImplementedError("load method is not implemented.") diff --git a/rdagent/knowledge_management/knowledgebase.py b/rdagent/knowledge_management/knowledgebase.py deleted file mode 100644 index cffdb8da..00000000 --- a/rdagent/knowledge_management/knowledgebase.py +++ /dev/null @@ -1,723 +0,0 @@ -from __future__ import annotations - -import re -import random -import json -import copy - -from jinja2 import Template - -from itertools import combinations -from pathlib import Path -from typing import Union - -from rdagent.core.evolving_framework import ( - KnowledgeBase, -) -from rdagent.core.evolving_framework import EvoStep, EvolvableSubjects, RAGStrategy, Knowledge, QueriedKnowledge -from rdagent.factor_implementation.evolving.knowledge_management import ( - FactorImplementationKnowledge, - 
FactorImplementationQueriedGraphKnowledge, -) -from rdagent.factor_implementation.share_modules.factor_implementation_config import FACTOR_IMPLEMENT_SETTINGS - -from rdagent.knowledge_management.graph import UndirectedGraph, UndirectedNode -from rdagent.core.prompts import Prompts -from rdagent.core.log import RDAgentLog -from rdagent.oai.llm_utils import APIBackend, calculate_embedding_distance_between_str_list - - -class FactorImplementationGraphKnowledgeBase(KnowledgeBase): - def __init__(self, init_component_list=None) -> None: - """ - Load knowledge, offer brief information of knowledge and common handle interfaces - """ - self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl") - RDAgentLog().info(f"Knowledge Graph loaded, size={self.graph.size()}") - - if init_component_list: - for component in init_component_list: - exist_node = self.graph.get_node_by_content(content=component) - node = exist_node if exist_node else UndirectedNode(content=component, label="component") - self.graph.add_nodes(node=node, neighbors=[]) - - # A dict containing all working trace until they fail or succeed - self.working_trace_knowledge = {} - - # A dict containing error analysis each step aligned with working trace - self.working_trace_error_analysis = {} - - # Add already success task - self.success_task_to_knowledge_dict = {} - - # key:node_id(for task trace and success implement), value:knowledge instance(aka 'FactorImplementationKnowledge') - self.node_to_implementation_knowledge_dict = {} - - # store the task description to component nodes - self.task_to_component_nodes = {} - - def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]: - return self.graph.get_all_nodes_by_label(label) - - def update_success_task( - self, - success_task_info: str, - ): # Transfer the success tasks' working trace to knowledge storage & graph - success_task_trace = self.working_trace_knowledge[success_task_info] - success_task_error_analysis_record = ( - 
self.working_trace_error_analysis[success_task_info] - if success_task_info in self.working_trace_error_analysis - else [] - ) - task_des_node = UndirectedNode(content=success_task_info, label="task_description") - self.graph.add_nodes( - node=task_des_node, - neighbors=self.task_to_component_nodes[success_task_info], - ) # 1st version, we assume that all component nodes are given - for index, trace_unit in enumerate(success_task_trace): # every unit: single_knowledge - neighbor_nodes = [task_des_node] - if index != len(success_task_trace) - 1: - trace_node = UndirectedNode( - content=trace_unit.get_implementation_and_feedback_str(), - label="task_trace", - ) - self.node_to_implementation_knowledge_dict[trace_node.id] = trace_unit - for node_index, error_node in enumerate(success_task_error_analysis_record[index]): - if type(error_node).__name__ == "str": - queried_node = self.graph.get_node_by_content(content=error_node) - if queried_node is None: - new_error_node = UndirectedNode(content=error_node, label="error") - self.graph.add_node(node=new_error_node) - success_task_error_analysis_record[index][node_index] = new_error_node - else: - success_task_error_analysis_record[index][node_index] = queried_node - neighbor_nodes.extend(success_task_error_analysis_record[index]) - self.graph.add_nodes(node=trace_node, neighbors=neighbor_nodes) - else: - success_node = UndirectedNode( - content=trace_unit.get_implementation_and_feedback_str(), - label="task_success_implement", - ) - self.graph.add_nodes(node=success_node, neighbors=neighbor_nodes) - self.node_to_implementation_knowledge_dict[success_node.id] = trace_unit - - def query(self): - pass - - def graph_get_node_by_content(self, content: str) -> UndirectedNode: - return self.graph.get_node_by_content(content=content) - - def graph_query_by_content( - self, - content: Union[str, list[str]], - topk_k: int = 5, - step: int = 1, - constraint_labels: list[str] = None, - constraint_node: UndirectedNode = None, - 
similarity_threshold: float = 0.0, - constraint_distance: float = 0, - block: bool = False, - ) -> list[UndirectedNode]: - """ - search graph by content similarity and connection relationship, return empty list if nodes' chain without node - near to constraint_node - - Parameters - ---------- - constraint_distance - content - topk_k: the upper number of output for each query, if the number of fit nodes is less than topk_k, return all fit nodes's content - step - constraint_labels - constraint_node - similarity_threshold - block: despite the start node, the search can only flow through the constraint_label type nodes - - Returns - ------- - - """ - - return self.graph.query_by_content( - content=content, - topk_k=topk_k, - step=step, - constraint_labels=constraint_labels, - constraint_node=constraint_node, - similarity_threshold=similarity_threshold, - constraint_distance=constraint_distance, - block=block, - ) - - def graph_query_by_node( - self, - node: UndirectedNode, - step: int = 1, - constraint_labels: list[str] = None, - constraint_node: UndirectedNode = None, - constraint_distance: float = 0, - block: bool = False, - ) -> list[UndirectedNode]: - """ - search graph by connection, return empty list if nodes' chain without node near to constraint_node - Parameters - ---------- - node : start node - step : the max steps will be searched - constraint_labels : the labels of output nodes - constraint_node : the node that the output nodes must connect to - constraint_distance : the max distance between output nodes and constraint_node - block: despite the start node, the search can only flow through the constraint_label type nodes - - Returns - ------- - A list of nodes - - """ - nodes = self.graph.query_by_node( - node=node, - step=step, - constraint_labels=constraint_labels, - constraint_node=constraint_node, - constraint_distance=constraint_distance, - block=block, - ) - return nodes - - def graph_query_by_intersection( - self, - nodes: list[UndirectedNode], - 
steps: int = 1, - constraint_labels: list[str] = None, - output_intersection_origin: bool = False, - ) -> list[UndirectedNode] | list[list[list[UndirectedNode], UndirectedNode]]: - """ - search graph by node intersection, node intersected by a higher frequency has a prior order in the list - Parameters - ---------- - nodes : node list - step : the max steps will be searched - constraint_labels : the labels of output nodes - output_intersection_origin: output the list that contains the node which form this intersection node - - Returns - ------- - A list of nodes - - """ - node_count = len(nodes) - assert node_count >= 2, "nodes length must >=2" - intersection_node_list = [] - if output_intersection_origin: - origin_list = [] - for k in range(node_count, 1, -1): - possible_combinations = combinations(nodes, k) - for possible_combination in possible_combinations: - node_list = list(possible_combination) - intersection_node_list.extend( - self.graph.get_nodes_intersection(node_list, steps=steps, constraint_labels=constraint_labels) - ) - if output_intersection_origin: - for _ in range(len(intersection_node_list)): - origin_list.append(node_list) - intersection_node_list_sort_by_freq = [] - for index, node in enumerate(intersection_node_list): - if node not in intersection_node_list_sort_by_freq: - if output_intersection_origin: - intersection_node_list_sort_by_freq.append([origin_list[index], node]) - else: - intersection_node_list_sort_by_freq.append(node) - - return intersection_node_list_sort_by_freq - - -class FactorImplementationGraphRAGStrategy(RAGStrategy): - def __init__(self, knowledgebase: FactorImplementationGraphKnowledgeBase) -> None: - super().__init__(knowledgebase) - self.current_generated_trace_count = 0 - self.prompt = Prompts(file_path=Path(__file__).parent / "prompts.yaml") - - def generate_knowledge( - self, - evolving_trace: list[EvoStep], - *, - return_knowledge: bool = False, - ) -> Knowledge | None: - if len(evolving_trace) == 
self.current_generated_trace_count: - return None - - else: - for trace_index in range(self.current_generated_trace_count, len(evolving_trace)): - evo_step = evolving_trace[trace_index] - implementations = evo_step.evolvable_subjects - feedback = evo_step.feedback - for task_index in range(len(implementations.target_factor_tasks)): - single_feedback = feedback[task_index] - target_task = implementations.target_factor_tasks[task_index] - target_task_information = target_task.get_factor_information() - implementation = implementations.corresponding_implementations[task_index] - single_feedback = feedback[task_index] - if single_feedback is None: - continue - single_knowledge = FactorImplementationKnowledge( - target_task=target_task, - implementation=implementation, - feedback=single_feedback, - ) - if ( - target_task_information not in self.knowledgebase.success_task_to_knowledge_dict - and implementation is not None - ): - self.knowledgebase.working_trace_knowledge.setdefault(target_task_information, []).append( - single_knowledge, - ) # save to working trace - if single_feedback.final_decision == True: - self.knowledgebase.success_task_to_knowledge_dict.setdefault( - target_task_information, - single_knowledge, - ) - # Do summary for the last step and update the knowledge graph - self.knowledgebase.update_success_task( - target_task_information, - ) - else: - # generate error node and store into knowledge base - error_analysis_result = [] - if not single_feedback.value_generated_flag: - error_analysis_result = self.analyze_error( - single_feedback.execution_feedback, - feedback_type="execution", - ) - else: - error_analysis_result = self.analyze_error( - single_feedback.factor_value_feedback, - feedback_type="value", - ) - self.knowledgebase.working_trace_error_analysis.setdefault( - target_task_information, - [], - ).append( - error_analysis_result, - ) # save to working trace error record, for graph update - - self.current_generated_trace_count = 
len(evolving_trace) - return None - - def query(self, evo: EvolvableSubjects, evolving_trace: list[EvoStep]) -> QueriedKnowledge | None: - conf_knowledge_sampler = FACTOR_IMPLEMENT_SETTINGS.v2_knowledge_sampler - factor_implementation_queried_graph_knowledge = FactorImplementationQueriedGraphKnowledge( - success_task_to_knowledge_dict=self.knowledgebase.success_task_to_knowledge_dict, - ) - - factor_implementation_queried_graph_knowledge = self.former_trace_query( - evo, - factor_implementation_queried_graph_knowledge, - FACTOR_IMPLEMENT_SETTINGS.v2_query_former_trace_limit, - ) - factor_implementation_queried_graph_knowledge = self.component_query( - evo, - factor_implementation_queried_graph_knowledge, - FACTOR_IMPLEMENT_SETTINGS.v2_query_component_limit, - knowledge_sampler=conf_knowledge_sampler, - ) - factor_implementation_queried_graph_knowledge = self.error_query( - evo, - factor_implementation_queried_graph_knowledge, - FACTOR_IMPLEMENT_SETTINGS.v2_query_error_limit, - knowledge_sampler=conf_knowledge_sampler, - ) - return factor_implementation_queried_graph_knowledge - - def analyze_component( - self, - target_factor_task_information, - ) -> list[UndirectedNode]: # Hardcode: certain component nodes - all_component_nodes = self.knowledgebase.graph.get_all_nodes_by_label_list(["component"]) - all_component_content = "" - for _, component_node in enumerate(all_component_nodes): - all_component_content += f"{component_node.content}, \n" - analyze_component_system_prompt = Template(self.prompt["analyze_component_prompt_v1_system"]).render( - all_component_content=all_component_content, - ) - - analyze_component_user_prompt = target_factor_task_information - try: - component_no_list = json.loads( - APIBackend().build_messages_and_create_chat_completion( - system_prompt=analyze_component_system_prompt, - user_prompt=analyze_component_user_prompt, - json_mode=True, - ), - )["component_no_list"] - return [all_component_nodes[index - 1] for index in 
sorted(list(set(component_no_list)))] - except: - RDAgentLog().warning("Error when analyzing components.") - analyze_component_user_prompt = "Your response is not a valid component index list." - - return [] - - def analyze_error( - self, - single_feedback, - feedback_type="execution", - ) -> list[ - UndirectedNode | str - ]: # Hardcode: Raised errors, existed error nodes + not existed error nodes(here, they are strs) - if feedback_type == "execution": - match = re.search( - r'File "(?P.+)", line (?P\d+), in (?P.+)\n\s+(?P.+)\n(?P\w+): (?P.+)', - single_feedback, - ) - if match: - error_details = match.groupdict() - # last_traceback = f'File "{error_details["file"]}", line {error_details["line"]}, in {error_details["function"]}\n {error_details["error_line"]}' - error_type = error_details["error_type"] - error_line = error_details["error_line"] - error_contents = [f"ErrorType: {error_type}" + "\n" + f"Error line: {error_line}"] - else: - error_contents = ["Undefined Error"] - elif feedback_type == "value": # value check error - value_check_types = r"The source dataframe and the ground truth dataframe have different rows count.|The source dataframe and the ground truth dataframe have different index.|Some values differ by more than the tolerance of 1e-6.|No sufficient correlation found when shifting up|Something wrong happens when naming the multi indices of the dataframe." 
- error_contents = re.findall(value_check_types, single_feedback) - else: - error_contents = ["Undefined Error"] - - all_error_nodes = self.knowledgebase.graph.get_all_nodes_by_label_list(["error"]) - if not len(all_error_nodes): - return error_contents - else: - error_list = [] - for error_content in error_contents: - for error_node in all_error_nodes: - if error_content == error_node.content: - error_list.append(error_node) - else: - error_list.append(error_content) - if error_list[-1] in error_list[:-1]: - error_list.pop() - - return error_list - - def former_trace_query( - self, - evo: EvolvableSubjects, - factor_implementation_queried_graph_knowledge: FactorImplementationQueriedGraphKnowledge, - v2_query_former_trace_limit: int = 5, - ) -> Union[QueriedKnowledge, set]: - """ - Query the former trace knowledge of the working trace, and find all the failed task information which tried more than fail_task_trial_limit times - """ - fail_task_trial_limit = FACTOR_IMPLEMENT_SETTINGS.fail_task_trial_limit - - for target_factor_task in evo.target_factor_tasks: - target_factor_task_information = target_factor_task.get_factor_information() - if ( - target_factor_task_information not in self.knowledgebase.success_task_to_knowledge_dict - and target_factor_task_information in self.knowledgebase.working_trace_knowledge - and len(self.knowledgebase.working_trace_knowledge[target_factor_task_information]) - >= fail_task_trial_limit - ): - factor_implementation_queried_graph_knowledge.failed_task_info_set.add(target_factor_task_information) - - if ( - target_factor_task_information not in self.knowledgebase.success_task_to_knowledge_dict - and target_factor_task_information - not in factor_implementation_queried_graph_knowledge.failed_task_info_set - and target_factor_task_information in self.knowledgebase.working_trace_knowledge - ): - former_trace_knowledge = copy.copy( - self.knowledgebase.working_trace_knowledge[target_factor_task_information], - ) - # in former trace 
query we will delete the right trace in the following order:[..., value_generated_flag is True, value_generated_flag is False, ...] - # because we think this order means a deterioration of the trial (like a wrong gradient descent) - current_index = 1 - while current_index < len(former_trace_knowledge): - if ( - not former_trace_knowledge[current_index].feedback.value_generated_flag - and former_trace_knowledge[current_index - 1].feedback.value_generated_flag - ): - former_trace_knowledge.pop(current_index) - else: - current_index += 1 - - factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information] = ( - former_trace_knowledge[-v2_query_former_trace_limit:] - ) - else: - factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information] = [] - - return factor_implementation_queried_graph_knowledge - - def component_query( - self, - evo: EvolvableSubjects, - factor_implementation_queried_graph_knowledge: FactorImplementationQueriedGraphKnowledge, - v2_query_component_limit: int = 5, - knowledge_sampler: float = 1.0, - ) -> QueriedKnowledge | None: - # queried_component_knowledge = FactorImplementationQueriedGraphComponentKnowledge() - for target_factor_task in evo.target_factor_tasks: - target_factor_task_information = target_factor_task.get_factor_information() - if ( - target_factor_task_information in self.knowledgebase.success_task_to_knowledge_dict - or target_factor_task_information in factor_implementation_queried_graph_knowledge.failed_task_info_set - ): - factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ] = [] - else: - if target_factor_task_information not in self.knowledgebase.task_to_component_nodes: - self.knowledgebase.task_to_component_nodes[target_factor_task_information] = self.analyze_component( - target_factor_task_information, - ) - - component_analysis_result = 
self.knowledgebase.task_to_component_nodes[target_factor_task_information] - - if len(component_analysis_result) > 1: - task_des_node_list = self.knowledgebase.graph_query_by_intersection( - component_analysis_result, - constraint_labels=["task_description"], - ) - single_component_constraint = (v2_query_component_limit // len(component_analysis_result)) + 1 - else: - task_des_node_list = [] - single_component_constraint = v2_query_component_limit - factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ] = [] - for component_node in component_analysis_result: - # Reverse iterate, a trade-off with intersection search - count = 0 - for task_des_node in self.knowledgebase.graph_query_by_node( - node=component_node, - step=1, - constraint_labels=["task_description"], - block=True, - )[::-1]: - if task_des_node not in task_des_node_list: - task_des_node_list.append(task_des_node) - count += 1 - if count >= single_component_constraint: - break - - for node in task_des_node_list: - for searched_node in self.knowledgebase.graph_query_by_node( - node=node, - step=50, - constraint_labels=[ - "task_success_implement", - ], - block=True, - ): - if searched_node.label == "task_success_implement": - target_knowledge = self.knowledgebase.node_to_implementation_knowledge_dict[ - searched_node.id - ] - if ( - target_knowledge - not in factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ] - ): - factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ].append(target_knowledge) - - # finally add embedding related knowledge - knowledge_base_success_task_list = list(self.knowledgebase.success_task_to_knowledge_dict) - - similarity = calculate_embedding_distance_between_str_list( - [target_factor_task_information], - knowledge_base_success_task_list, - )[0] - similar_indexes = sorted( - range(len(similarity)), - 
key=lambda i: similarity[i], - reverse=True, - ) - embedding_similar_successful_knowledge = [ - self.knowledgebase.success_task_to_knowledge_dict[knowledge_base_success_task_list[index]] - for index in similar_indexes - ] - for knowledge in embedding_similar_successful_knowledge: - if ( - knowledge - not in factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ] - ): - factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ].append(knowledge) - - if knowledge_sampler > 0: - factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ] = [ - knowledge - for knowledge in factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ] - if random.uniform(0, 1) <= knowledge_sampler - ] - - # Make sure no less than half of the knowledge are from GT - queried_knowledge_list = factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ] - queried_from_gt_knowledge_list = [ - knowledge - for knowledge in queried_knowledge_list - if knowledge.feedback is not None and knowledge.feedback.final_decision_based_on_gt == True - ] - queried_without_gt_knowledge_list = [ - knowledge - for knowledge in queried_knowledge_list - if knowledge.feedback is not None and knowledge.feedback.final_decision_based_on_gt == False - ] - queried_from_gt_knowledge_count = max( - min(v2_query_component_limit // 2, len(queried_from_gt_knowledge_list)), - v2_query_component_limit - len(queried_without_gt_knowledge_list), - ) - factor_implementation_queried_graph_knowledge.component_with_success_task[ - target_factor_task_information - ] = ( - queried_from_gt_knowledge_list[:queried_from_gt_knowledge_count] - + queried_without_gt_knowledge_list[: v2_query_component_limit - queried_from_gt_knowledge_count] - ) - - return 
factor_implementation_queried_graph_knowledge - - def error_query( - self, - evo: EvolvableSubjects, - factor_implementation_queried_graph_knowledge: FactorImplementationQueriedGraphKnowledge, - v2_query_error_limit: int = 5, - knowledge_sampler: float = 1.0, - ) -> QueriedKnowledge | None: - # queried_error_knowledge = FactorImplementationQueriedGraphErrorKnowledge() - for task_index, target_factor_task in enumerate(evo.target_factor_tasks): - target_factor_task_information = target_factor_task.get_factor_information() - factor_implementation_queried_graph_knowledge.error_with_success_task[target_factor_task_information] = {} - if ( - target_factor_task_information in self.knowledgebase.success_task_to_knowledge_dict - or target_factor_task_information in factor_implementation_queried_graph_knowledge.failed_task_info_set - ): - factor_implementation_queried_graph_knowledge.error_with_success_task[ - target_factor_task_information - ] = [] - else: - factor_implementation_queried_graph_knowledge.error_with_success_task[ - target_factor_task_information - ] = [] - if ( - target_factor_task_information in self.knowledgebase.working_trace_error_analysis - and len(self.knowledgebase.working_trace_error_analysis[target_factor_task_information]) > 0 - and len(factor_implementation_queried_graph_knowledge.former_traces[target_factor_task_information]) - > 0 - ): - queried_last_trace = factor_implementation_queried_graph_knowledge.former_traces[ - target_factor_task_information - ][-1] - target_index = self.knowledgebase.working_trace_knowledge[target_factor_task_information].index( - queried_last_trace, - ) - last_knowledge_error_analysis_result = self.knowledgebase.working_trace_error_analysis[ - target_factor_task_information - ][target_index] - else: - last_knowledge_error_analysis_result = [] - - error_nodes = [] - for error_node in last_knowledge_error_analysis_result: - if not isinstance(error_node, UndirectedNode): - error_node = 
self.knowledgebase.graph_get_node_by_content(content=error_node) - if error_node is None: - continue - error_nodes.append(error_node) - - if len(error_nodes) > 1: - task_trace_node_list = self.knowledgebase.graph_query_by_intersection( - error_nodes, - constraint_labels=["task_trace"], - output_intersection_origin=True, - ) - single_error_constraint = (v2_query_error_limit // len(error_nodes)) + 1 - else: - task_trace_node_list = [] - single_error_constraint = v2_query_error_limit - for error_node in error_nodes: - # Reverse iterate, a trade-off with intersection search - count = 0 - for task_trace_node in self.knowledgebase.graph_query_by_node( - node=error_node, - step=1, - constraint_labels=["task_trace"], - block=True, - )[::-1]: - if task_trace_node not in task_trace_node_list: - task_trace_node_list.append([[error_node], task_trace_node]) - count += 1 - if count >= single_error_constraint: - break - - # for error_node in last_knowledge_error_analysis_result: - # if not isinstance(error_node, UndirectedNode): - # error_node = self.knowledgebase.graph_get_node_by_content(content=error_node) - # if error_node is None: - # continue - # for searched_node in self.knowledgebase.graph_query_by_node( - # node=error_node, - # step=1, - # constraint_labels=["task_trace"], - # block=True, - # ): - # if searched_node not in [node[0] for node in task_trace_node_list]: - # task_trace_node_list.append((searched_node, error_node.content)) - - same_error_success_knowledge_pair_list = [] - same_error_success_node_set = set() - for error_node_list, trace_node in task_trace_node_list: - for searched_trace_success_node in self.knowledgebase.graph_query_by_node( - node=trace_node, - step=50, - constraint_labels=[ - "task_trace", - "task_success_implement", - "task_description", - ], - block=True, - ): - if ( - searched_trace_success_node not in same_error_success_node_set - and searched_trace_success_node.label == "task_success_implement" - ): - 
same_error_success_node_set.add(searched_trace_success_node) - - trace_knowledge = self.knowledgebase.node_to_implementation_knowledge_dict[trace_node.id] - success_knowledge = self.knowledgebase.node_to_implementation_knowledge_dict[ - searched_trace_success_node.id - ] - error_content = "" - for index, error_node in enumerate(error_node_list): - error_content += f"{index+1}. {error_node.content}; " - same_error_success_knowledge_pair_list.append( - ( - error_content, - (trace_knowledge, success_knowledge), - ), - ) - - if knowledge_sampler > 0: - same_error_success_knowledge_pair_list = [ - knowledge - for knowledge in same_error_success_knowledge_pair_list - if random.uniform(0, 1) <= knowledge_sampler - ] - - same_error_success_knowledge_pair_list = same_error_success_knowledge_pair_list[:v2_query_error_limit] - factor_implementation_queried_graph_knowledge.error_with_success_task[ - target_factor_task_information - ] = same_error_success_knowledge_pair_list - - return factor_implementation_queried_graph_knowledge diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index 2d1c8c01..7be21525 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -19,7 +19,7 @@ import tiktoken from rdagent.core.conf import RD_AGENT_SETTINGS -from rdagent.core.log import RDAgentLog, LogColors +from rdagent.core.log import LogColors, RDAgentLog from rdagent.core.utils import SingletonBaseClass DEFAULT_QLIB_DOT_PATH = Path("./") diff --git a/rdagent/scenarios/qlib/factor_task_implementation/__init__.py b/rdagent/scenarios/qlib/factor_task_implementation/__init__.py new file mode 100644 index 00000000..b0ac1c47 --- /dev/null +++ b/rdagent/scenarios/qlib/factor_task_implementation/__init__.py @@ -0,0 +1,5 @@ +from rdagent.components.task_implementation.factor_implementation.CoSTEER import ( + CoSTEERFG, +) + +COSTEERFG_QUANT_FACTOR_IMPLEMENTATION = CoSTEERFG # TODO: This is a placeholder. 
We need to split the scenario part of the task implementation into this folder diff --git a/rdagent/factor_implementation/task_loader/json_loader.py b/rdagent/scenarios/qlib/factor_task_loader/json_loader.py similarity index 72% rename from rdagent/factor_implementation/task_loader/json_loader.py rename to rdagent/scenarios/qlib/factor_task_loader/json_loader.py index d626d98a..292922be 100644 --- a/rdagent/factor_implementation/task_loader/json_loader.py +++ b/rdagent/scenarios/qlib/factor_task_loader/json_loader.py @@ -1,11 +1,15 @@ import json from pathlib import Path -from rdagent.core.task import TaskLoader -from rdagent.factor_implementation.evolving.factor import FactorImplementTask, FileBasedFactorImplementation -from rdagent.core.task import TestCase +from rdagent.components.task_implementation.factor_implementation.evolving.factor import ( + FactorImplementTask, + FileBasedFactorImplementation, +) +from rdagent.components.task_loader import FactorTaskLoader +from rdagent.core.task import TaskLoader, TestCase -class FactorImplementationTaskLoaderFromDict(TaskLoader): + +class FactorImplementationTaskLoaderFromDict(FactorTaskLoader): def load(self, factor_dict: dict) -> list: """Load data from a dict.""" task_l = [] @@ -20,21 +24,22 @@ def load(self, factor_dict: dict) -> list: return task_l -class FactorImplementationTaskLoaderFromJsonFile(TaskLoader): +class FactorImplementationTaskLoaderFromJsonFile(FactorTaskLoader): def load(self, json_file_path: Path) -> list: - with open(json_file_path, 'r') as file: + with open(json_file_path, "r") as file: factor_dict = json.load(file) return FactorImplementationTaskLoaderFromDict().load(factor_dict) -class FactorImplementationTaskLoaderFromJsonString(TaskLoader): +class FactorImplementationTaskLoaderFromJsonString(FactorTaskLoader): def load(self, json_string: str) -> list: factor_dict = json.loads(json_string) return FactorImplementationTaskLoaderFromDict().load(factor_dict) + class 
FactorTestCaseLoaderFromJsonFile(TaskLoader): def load(self, json_file_path: Path) -> list: - with open(json_file_path, 'r') as file: + with open(json_file_path, "r") as file: factor_dict = json.load(file) TestData = TestCase() for factor_name, factor_data in factor_dict.items(): @@ -49,4 +54,4 @@ def load(self, json_file_path: Path) -> list: TestData.target_task.append(task) TestData.ground_truth.append(gt) - return TestData \ No newline at end of file + return TestData diff --git a/rdagent/factor_implementation/task_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_task_loader/pdf_loader.py similarity index 98% rename from rdagent/factor_implementation/task_loader/pdf_loader.py rename to rdagent/scenarios/qlib/factor_task_loader/pdf_loader.py index aa964f8c..6ee4ab75 100644 --- a/rdagent/factor_implementation/task_loader/pdf_loader.py +++ b/rdagent/scenarios/qlib/factor_task_loader/pdf_loader.py @@ -10,16 +10,21 @@ import pandas as pd import tiktoken from jinja2 import Template +from sklearn.cluster import KMeans +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.preprocessing import normalize + +from rdagent.components.document_reader.document_reader import ( + load_and_process_pdfs_by_langchain, +) +from rdagent.components.task_loader import FactorTaskLoader from rdagent.core.conf import RD_AGENT_SETTINGS from rdagent.core.log import RDAgentLog from rdagent.core.prompts import Prompts -from rdagent.core.task import TaskLoader -from rdagent.document_reader.document_reader import load_and_process_pdfs_by_langchain -from rdagent.factor_implementation.task_loader.json_loader import FactorImplementationTaskLoaderFromDict from rdagent.oai.llm_utils import APIBackend, create_embedding_with_multiprocessing -from sklearn.cluster import KMeans -from sklearn.metrics.pairwise import cosine_similarity -from sklearn.preprocessing import normalize +from rdagent.scenarios.qlib.factor_task_loader.json_loader import ( + 
FactorImplementationTaskLoaderFromDict, +) document_process_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") @@ -571,7 +576,7 @@ def deduplicate_factors_by_llm( # noqa: C901, PLR0912 return llm_deduplicated_factor_dict, final_duplication_names_list -class FactorImplementationTaskLoaderFromPDFfiles(TaskLoader): +class FactorImplementationTaskLoaderFromPDFfiles(FactorTaskLoader): def load(self, file_or_folder_path: Path) -> dict: docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path)) diff --git a/rdagent/factor_implementation/task_loader/prompts.yaml b/rdagent/scenarios/qlib/factor_task_loader/prompts.yaml similarity index 100% rename from rdagent/factor_implementation/task_loader/prompts.yaml rename to rdagent/scenarios/qlib/factor_task_loader/prompts.yaml diff --git a/rdagent/scenarios/qlib/model_task_implementation/__init__.py b/rdagent/scenarios/qlib/model_task_implementation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rdagent/scenarios/qlib/task_implementation/data.py b/rdagent/scenarios/qlib/task_implementation/data.py new file mode 100644 index 00000000..96752733 --- /dev/null +++ b/rdagent/scenarios/qlib/task_implementation/data.py @@ -0,0 +1,14 @@ +from rdagent.core.task import FBTaskImplementation + + +class QlibDataTaskImplementation(FBTaskImplementation): + """ + Docker run + Everything in a folder + - config.yaml + - price-volume data dumper + - `data.py` + Adaptor to Factor implementation + - results in `mlflow` + + - TODO: implement a qlib handler + """ diff --git a/rdagent/scenarios/qlib/task_implementation/feedback.py b/rdagent/scenarios/qlib/task_implementation/feedback.py new file mode 100644 index 00000000..f2e8ce91 --- /dev/null +++ b/rdagent/scenarios/qlib/task_implementation/feedback.py @@ -0,0 +1,2 @@ +# TODO: +# Implement to feedback. 
diff --git a/rdagent/scenarios/qlib/task_implementation/model.py b/rdagent/scenarios/qlib/task_implementation/model.py new file mode 100644 index 00000000..db0bb4b0 --- /dev/null +++ b/rdagent/scenarios/qlib/task_implementation/model.py @@ -0,0 +1,15 @@ +from rdagent.core.task import FBTaskImplementation + + +class QlibModelTaskImplementation(FBTaskImplementation): + """ + Docker run + Everything in a folder + - config.yaml + - Pytorch `model.py` + - results in `mlflow` + + https://github.com/microsoft/qlib/blob/main/qlib/contrib/model/pytorch_nn.py + - pt_model_uri: hard-code `model.py:Net` in the config + - let LLM modify model.py + """ diff --git a/rdagent/utils/__init__.py b/rdagent/utils/__init__.py index 58e8a456..98365d7b 100644 --- a/rdagent/utils/__init__.py +++ b/rdagent/utils/__init__.py @@ -2,6 +2,7 @@ This is some common utils functions. it is not binding to the scenarios or framework (So it is not placed in rdagent.core.utils) """ + # TODO: merge the common utils in `rdagent.core.utils` into this folder # TODO: split the utils in this module into different modules in the future. diff --git a/requirements.txt b/requirements.txt index c4661959..d9b5a1ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,32 +2,13 @@ pydantic-settings typer[all] -loguru -black -isort -mypy -ruff - -numpy -pandas cython -langchain scipy -tiktoken python-Levenshtein -scikit-learn - -# PDF related -pypdf -azure-core -azure-ai-formrecognizer - -# factor implementations -tables - -# azure identity related -azure.identity # CI Fix Tool tree-sitter-python tree-sitter + +# Jupyter related +jupyter diff --git a/requirements/package.txt b/requirements/package.txt index 8840f3c5..092e94dc 100644 --- a/requirements/package.txt +++ b/requirements/package.txt @@ -1,4 +1,5 @@ # Requirements for package. 
+loguru build setuptools-scm twine @@ -7,11 +8,24 @@ fuzzywuzzy openai ruamel-yaml torch +torch_geometric tabulate # Convert pandas dataframe to markdown table to make it more readable to LLM tables # we use hd5 as default data format. So we have to install pytables +numpy # we use numpy as default data format. So we have to install numpy +pandas # we use pandas as default data format. So we have to install pandas feedparser matplotlib -pandas +langchain +tiktoken +scikit-learn + +# azure identity related +azure.identity + +# PDF related +pypdf +azure-core +azure-ai-formrecognizer # TODO: dependencies for implementing factors. # I think it is for running insteading of implementing. The dependency should be in diff --git a/test/oai/test_completion.py b/test/oai/test_completion.py index 79297d0b..129cc3f5 100644 --- a/test/oai/test_completion.py +++ b/test/oai/test_completion.py @@ -10,7 +10,8 @@ def test_chat_completion(self) -> None: system_prompt = "You are a helpful assistant." user_prompt = "What is your name?" response = APIBackend().build_messages_and_create_chat_completion( - system_prompt=system_prompt, user_prompt=user_prompt, + system_prompt=system_prompt, + user_prompt=user_prompt, ) assert response is not None assert isinstance(response, str) @@ -19,7 +20,9 @@ def test_chat_completion_json_mode(self) -> None: system_prompt = "You are a helpful assistant. answer in Json format." user_prompt = "What is your name?" 
response = APIBackend().build_messages_and_create_chat_completion( - system_prompt=system_prompt, user_prompt=user_prompt, json_mode=True, + system_prompt=system_prompt, + user_prompt=user_prompt, + json_mode=True, ) assert response is not None assert isinstance(response, str) diff --git a/test/oai/test_embedding_and_similarity.py b/test/oai/test_embedding_and_similarity.py index a577c4a5..8e06997c 100644 --- a/test/oai/test_embedding_and_similarity.py +++ b/test/oai/test_embedding_and_similarity.py @@ -1,6 +1,9 @@ import unittest -from rdagent.oai.llm_utils import APIBackend, calculate_embedding_distance_between_str_list +from rdagent.oai.llm_utils import ( + APIBackend, + calculate_embedding_distance_between_str_list, +) class TestEmbedding(unittest.TestCase): @@ -17,5 +20,6 @@ def test_embedding_similarity(self) -> None: min_similarity_threshold = 0.8 assert similarity >= min_similarity_threshold + if __name__ == "__main__": unittest.main()