# %%
"""Command-line entry point: extract model tasks from PDF reports and implement them."""

from dotenv import load_dotenv
import fire

from rdagent.components.task_implementation.model_implementation.one_shot import ModelTaskGen
from rdagent.components.task_implementation.model_implementation.task_extraction import (
    ModelImplementationTaskLoaderFromPDFfiles,
)

# NOTE(review): `load_dotenv` is imported but never called in this module, so a
# local `.env` file is not loaded from here. Confirm whether a `load_dotenv()`
# call is intended before the loaders run.


def extract_models_and_implement(report_file_path: str = "../test_doc"):
    """Extract model tasks from PDF report(s) and generate their implementations.

    Parameters
    ----------
    report_file_path : str
        Path handed unchanged to ``ModelImplementationTaskLoaderFromPDFfiles.load``.

    Returns
    -------
    The value returned by ``ModelTaskGen().generate`` for the extracted tasks.
    The previous ``-> None`` annotation contradicted the ``return`` statement;
    the concrete return type is not visible from this module, so the function
    is left unannotated rather than annotated with a guess.
    """
    model_tasks = ModelImplementationTaskLoaderFromPDFfiles().load(report_file_path)
    return ModelTaskGen().generate(model_tasks)


if __name__ == "__main__":
    # python-fire exposes the function's parameters as command-line arguments.
    fire.Fire(extract_models_and_implement)
python rdagent/app/model_implementation/eval.py +``` TODO: -- Is evaluation reasonable -``` +- Create reasonable benchmark + - with uniform input + - manually create task +- Create reasonable evaluation metrics ## Evolving diff --git a/rdagent/app/model_implementation/eval.py b/rdagent/app/model_implementation/eval.py index e9ef23011..1460e6184 100644 --- a/rdagent/app/model_implementation/eval.py +++ b/rdagent/app/model_implementation/eval.py @@ -2,14 +2,18 @@ DIRNAME = Path(__file__).absolute().resolve().parent -from rdagent.model_implementation.benchmark.eval import ModelImpValEval -from rdagent.model_implementation.one_shot import ModelTaskGen -from rdagent.model_implementation.task import ModelImpLoader, ModelTaskLoderJson +from rdagent.components.task_implementation.model_implementation.benchmark.eval import ModelImpValEval +from rdagent.components.task_implementation.model_implementation.one_shot import ModelTaskGen +from rdagent.components.task_implementation.model_implementation.task import ModelImpLoader, ModelTaskLoderJson -mtl = ModelTaskLoderJson("TODO: A Path to json") + +bench_folder = DIRNAME.parent.parent / "components" / "task_implementation" / "model_implementation" / "benchmark" +mtl = ModelTaskLoderJson(str(bench_folder / "model_dict.json")) task_l = mtl.load() +task_l = [t for t in task_l if t.key == "A-DGN"] # FIXME: other models does not work well + mtg = ModelTaskGen() impl_l = mtg.generate(task_l) @@ -17,12 +21,13 @@ # TODO: Align it with the benchmark framework after @wenjun's refine the evaluation part. # Currently, we just handcraft a workflow for fast evaluation. 
-mil = ModelImpLoader(DIRNAME.parent.parent / "model_implementation" / "benchmark" / "gt_code") +mil = ModelImpLoader(bench_folder / "gt_code") mie = ModelImpValEval() # Evaluation: eval_l = [] for impl in impl_l: + print(impl.target_task) gt_impl = mil.load(impl.target_task) eval_l.append(mie.evaluate(gt_impl, impl)) diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/eval.py b/rdagent/components/task_implementation/model_implementation/benchmark/eval.py index 983159f97..4c1fb1b09 100644 --- a/rdagent/components/task_implementation/model_implementation/benchmark/eval.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/eval.py @@ -1,6 +1,6 @@ # TODO: inherent from the benchmark base class import torch -from rdagent.model_implementation.task import ModelTaskImpl +from rdagent.components.task_implementation.model_implementation.task import ModelTaskImpl def get_data_conf(init_val): diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py index f5fa78b14..d22cb8987 100644 --- a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py @@ -74,6 +74,9 @@ def __repr__(self) -> str: return f"{self.__class__.__name__}({self.conv_in}, alpha={self.alpha})" +model_cls = DirGNNConv + + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py index e6ee0193d..305b30e30 100644 --- a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py +++ 
b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py @@ -183,6 +183,9 @@ def __repr__(self) -> str: ) +model_cls = GPSConv + + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py index eb3ddfc69..fa9a53c30 100644 --- a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py @@ -166,6 +166,7 @@ def __repr__(self) -> str: f"out_channels={self.out_channels})" ) +model_cls = LINKX if __name__ == "__main__": node_features = torch.load("node_features.pt") diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py index ada1b7b01..de8face9f 100644 --- a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py @@ -99,17 +99,14 @@ def __repr__(self) -> str: return f"{self.__class__.__name__}({self.in_channels}, " f"{self.out_channels}, num_layers={self.num_layers})" +model_cls = PMLP + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") # Model instantiation and forward pass - model = PMLP( - in_channels=node_features.size(-1), - hidden_channels=node_features.size(-1), - out_channels=node_features.size(-1), - num_layers=1, - ) + model = PMLP(in_channels=node_features.size(-1), hidden_channels=node_features.size(-1), out_channels=node_features.size(-1), num_layers=1) output = model(node_features, edge_index) # Save output to a file diff --git 
a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py index 3118d6412..67cff2089 100644 --- a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py +++ b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py @@ -1176,6 +1176,9 @@ def forward( return y, None +model_cls = ViSNet + + if __name__ == "__main__": node_features = torch.load("node_features.pt") edge_index = torch.load("edge_index.pt") diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/model_dict.json b/rdagent/components/task_implementation/model_implementation/benchmark/model_dict.json new file mode 100644 index 000000000..a30b8adbd --- /dev/null +++ b/rdagent/components/task_implementation/model_implementation/benchmark/model_dict.json @@ -0,0 +1,74 @@ +{ + "PMLP": { + "description": "`PMLP` is identical to a standard MLP during training, but then adopts a GNN architecture (add message passing) during testing.", + "formulation": "\\hat{y}_u = \\psi(\\text{MP}(\\{h^{(l-1)}_v\\}_{v \\in N_u \\cup \\{u\\}}))", + "variables": { + "\\hat{y}_u": "The predicted output for node u", + "\\psi": "A function representing the feed-forward process, consisting of a linear feature transformation followed by a non-linear activation", + "\\text{MP}": "Message Passing operation that aggregates neighbored information", + "h^{(l-1)}_v": "The feature representation of node v at layer (l-1)", + "N_u": "The set of neighbored nodes centered at node u" + }, + "key": "pmlp" + }, + "LINKX": { + "description": "A scalable model for node classification that separately embeds adjacency and node features, combines them with MLPs, and applies simple transformations.", + "formulation": "Y = MLP_f(\\sigma(W[h_A; h_X] + h_A + h_X))", + "variables": { + "Y": "The output predictions", + "\\sigma": "Non-linear activation 
function", + "W": "Learned weight matrix", + "h_A": "Embedding of the adjacency matrix", + "h_X": "Embedding of the node features", + "MLP_f": "Final multilayer perceptron for prediction" + }, + "key": "linkx" + }, + "GPSConv": { + "description": "A scalable and powerful graph transformer with linear complexity, capable of handling large graphs with state-of-the-art results across diverse benchmarks.", + "formulation": "X^{(l+1)} = \\text{MPNN}^{(l)}(X^{(l)}, A) + \\text{GlobalAttn}^{(l)}(X^{(l)})", + "variables": { + "X^{(l)}": "The node features at layer l", + "A": "The adjacency matrix of the graph", + "X^{(l+1)}": "The updated node features at layer l+1", + "MPNN^{(l)}": "The message-passing neural network function at layer l", + "GlobalAttn^{(l)}": "The global attention function at layer l" + }, + "key": "gpsconv" + }, + "ViSNet": { + "description": "ViSNet is an equivariant geometry-enhanced graph neural network designed for efficient molecular modeling. It utilizes a Vector-Scalar interactive message passing mechanism to extract and utilize geometric features with low computational costs, achieving state-of-the-art performance on multiple molecular dynamics benchmarks.", + "formulation": "\\text{ViSNet}(G) = \\sum_{u \\in G} f(\\mathbf{h}_u, \\mathbf{e}_u, \\mathbf{v}_u)", + "variables": { + "\\mathbf{h}_u": "Node embedding for atom u", + "\\mathbf{e}_u": "Edge embedding associated with atom u", + "\\mathbf{v}_u": "Direction unit vector for atom u" + }, + "key": "visnet" + }, + "Dir-GNN": { + "description": "A framework for deep learning on directed graphs that extends MPNNs to incorporate edge directionality.", + "formulation": "x^{(k)}_i = COM^{(k)}\\left(x^{(k-1)}_i, m^{(k)}_{i,\\leftarrow}, m^{(k)}_{i,\\rightarrow}\\right)", + "variables": { + "x^{(k)}_i": "The feature representation of node i at layer k", + "m^{(k)}_{i,\\leftarrow}": "The aggregated incoming messages to node i at layer k", + "m^{(k)}_{i,\\rightarrow}": "The aggregated
outgoing messages from node i at layer k" + }, + "key": "dirgnn" + }, + "A-DGN": { + "description": "A framework for stable and non-dissipative DGN design, conceived through the lens of ordinary differential equations (ODEs). It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.", + "formulation": "\\frac{\\partial x_u(t)}{\\partial t} = \\sigma(W^T x_u(t) + \\Phi(X(t), N_u) + b)", + "variables": { + "x_u(t)": "The state of node u at time t", + "\\frac{\\partial x_u(t)}{\\partial t}": "The rate of change of the state of node u at time t", + "\\sigma": "A monotonically non-decreasing activation function", + "W": "A weight matrix", + "b": "A bias vector", + "\\Phi(X(t), N_u)": "The aggregation function for the states of the nodes in the neighborhood of u", + "X(t)": "The node feature matrix of the whole graph at time t", + "N_u": "The set of neighboring nodes of u" + }, + "key": "A-DGN" + } +} diff --git a/rdagent/components/task_implementation/model_implementation/prompts.yaml b/rdagent/components/task_implementation/model_implementation/prompts.yaml new file mode 100644 index 000000000..7cc63385e --- /dev/null +++ b/rdagent/components/task_implementation/model_implementation/prompts.yaml @@ -0,0 +1,8 @@ +extract_model_formulation_system: |- + offer description of the proposed model in this paper, write a latex formula with variable of the model. the format should be like " "Model Name": { + "description": "", + "formulation": "", + "variables": { + "\\hat{y}_u": "The predicted output for node u", + }" + such format content should be begin with ```json and end with ``` and the content should be in json format. 
\ No newline at end of file diff --git a/rdagent/components/task_implementation/model_implementation/task.py b/rdagent/components/task_implementation/model_implementation/task.py index 0cb810705..ef6a0d51e 100644 --- a/rdagent/components/task_implementation/model_implementation/task.py +++ b/rdagent/components/task_implementation/model_implementation/task.py @@ -16,6 +16,7 @@ TaskLoader, ) from rdagent.utils import get_module_by_module_path +import json class ModelImplTask(BaseTask): @@ -42,53 +43,104 @@ def __init__( self.formulation = formulation self.variables = variables self.key = key + def get_information(self): + return f"""name: {self.name} +description: {self.description} +formulation: {self.formulation} +variables: {self.variables} +key: {self.key} +""" + + @staticmethod + def from_dict(dict): + return ModelImplTask(**dict) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" class ModelTaskLoderJson(TaskLoader): + # def __init__(self, json_uri: str, select_model: Optional[str] = None) -> None: + # super().__init__() + # self.json_uri = json_uri + # self.select_model = 'A-DGN' + + # def load(self, *argT, **kwargs) -> Sequence[ModelImplTask]: + # # json is supposed to be in the format of {model_name: dict{model_data}} + # model_dict = json.load(open(self.json_uri, "r")) + # if self.select_model is not None: + # assert self.select_model in model_dict + # model_name = self.select_model + # model_data = model_dict[self.select_model] + # else: + # model_name, model_data = list(model_dict.items())[0] + + # model_impl_task = ModelImplTask( + # name=model_name, + # description=model_data["description"], + # formulation=model_data["formulation"], + # variables=model_data["variables"], + # key=model_name + # ) + + # return [model_impl_task] + def __init__(self, json_uri: str) -> None: super().__init__() - # TODO: the json should be loaded from URI. 
self.json_uri = json_uri def load(self, *argT, **kwargs) -> Sequence[ModelImplTask]: - # TODO: we should load the tasks from json; + # json is supposed to be in the format of {model_name: dict{model_data}} + model_dict = json.load(open(self.json_uri, "r")) - # this version does not align with the right answer + # FIXME: the model in the json file is not right due to extraction error + # We should fix them case by case in the future + # # formula_info = { # "name": "Anti-Symmetric Deep Graph Network (A-DGN)", # "description": "A framework for stable and non-dissipative DGN design. It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.", - # "formulation": "x_u^{(l)} = x_u^{(l-1)} + \\epsilon \\sigma \\left( W^T x_u^{(l-1)} + \\Phi(X^{(l-1)}, N_u) + b \\right)", + # "formulation": r"\mathbf{x}^{\prime}_i = \mathbf{x}_i + \epsilon \cdot \sigma \left( (\mathbf{W}-\mathbf{W}^T-\gamma \mathbf{I}) \mathbf{x}_i + \Phi(\mathbf{X}, \mathcal{N}_i) + \mathbf{b}\right),", # "variables": { - # "x_u^{(l)}": "The state of node u at layer l", - # "\\epsilon": "The step size in the Euler discretization", - # "\\sigma": "A monotonically non-decreasing activation function", - # "W": "An anti-symmetric weight matrix", - # "X^{(l-1)}": "The node feature matrix at layer l-1", - # "N_u": "The set of neighbors of node u", - # "b": "A bias vector", + # r"\mathbf{x}_i": "The state of node i at previous layer", + # r"\epsilon": "The step size in the Euler discretization", + # r"\sigma": "A monotonically non-decreasing activation function", + # r"\Phi": "A graph convolutional operator", + # r"W": "An anti-symmetric weight matrix", + # r"\mathbf{x}^{\prime}_i": "The node feature matrix at layer l-1", + # r"\mathcal{N}_i": "The set of neighbors of node u", + # r"\mathbf{b}": "A bias vector", # }, # "key": "A-DGN", # } - formula_info = { - "name": "Anti-Symmetric Deep Graph Network (A-DGN)", - "description": "A framework for 
stable and non-dissipative DGN design. It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.", - "formulation": r"\mathbf{x}^{\prime}_i = \mathbf{x}_i + \epsilon \cdot \sigma \left( (\mathbf{W}-\mathbf{W}^T-\gamma \mathbf{I}) \mathbf{x}_i + \Phi(\mathbf{X}, \mathcal{N}_i) + \mathbf{b}\right),", - "variables": { - r"\mathbf{x}_i": "The state of node i at previous layer", - r"\epsilon": "The step size in the Euler discretization", - r"\sigma": "A monotonically non-decreasing activation function", - r"\Phi": "A graph convolutional operator", - r"W": "An anti-symmetric weight matrix", - r"\mathbf{x}^{\prime}_i": "The node feature matrix at layer l-1", - r"\mathcal{N}_i": "The set of neighbors of node u", - r"\mathbf{b}": "A bias vector", - }, - "key": "A-DGN", - } - return [ModelImplTask(**formula_info)] - - -class ModelTaskImpl(TaskImplementation): + model_impl_task_list = [] + for model_name, model_data in model_dict.items(): + model_impl_task = ModelImplTask( + name=model_name, + description=model_data["description"], + formulation=model_data["formulation"], + variables=model_data["variables"], + key=model_data["key"] + ) + model_impl_task_list.append(model_impl_task) + return model_impl_task_list + +class ModelImplementationTaskLoaderFromDict(TaskLoader): + def load(self, model_dict: dict) -> list: + """Load data from a dict.""" + task_l = [] + for model_name, model_data in model_dict.items(): + task = ModelImplTask( + name=model_name, + description=model_data["description"], + formulation=model_data["formulation"], + variables=model_data["variables"], + key=model_name + ) + task_l.append(task) + return task_l + + +class ModelTaskImpl(FBTaskImplementation): """ It is a Pytorch model implementation task; All the things are placed in a folder. 
diff --git a/rdagent/components/task_implementation/model_implementation/task_extraction.py b/rdagent/components/task_implementation/model_implementation/task_extraction.py new file mode 100644 index 000000000..bffcd1e6a --- /dev/null +++ b/rdagent/components/task_implementation/model_implementation/task_extraction.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +import json +import multiprocessing as mp +import re +from pathlib import Path +from typing import Mapping + +import numpy as np +import pandas as pd +import tiktoken +from jinja2 import Template +from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.log import RDAgentLog +from rdagent.core.prompts import Prompts +from rdagent.core.task import TaskLoader +from rdagent.components.document_reader.document_reader import load_and_process_pdfs_by_langchain +from rdagent.components.task_implementation.model_implementation.task import ModelImplementationTaskLoaderFromDict +from rdagent.oai.llm_utils import APIBackend, create_embedding_with_multiprocessing + + +document_process_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + +def extract_model_from_doc(doc_content: str) -> dict: + """ + Extract model information from document content. + + Parameters + ---------- + doc_content : str + Document content. + + Returns + ------- + dict + {factor_name: dict{description, formulation, variables}} + """ + session = APIBackend().build_chat_session( + session_system_prompt=document_process_prompts["extract_model_formulation_system"], + ) + current_user_prompt = doc_content + + # Extract model information from document content. + model_dict = {} + + for _ in range(10): + # try to extract model information from the document content, retry at most 10 times. 
+ extract_result_resp = session.build_chat_completion( + user_prompt=current_user_prompt, + json_mode=False, + ) + re_search_res = re.search(r"```json(.*)```", extract_result_resp, re.S) + ret_json_str = re_search_res.group(1) if re_search_res is not None else "" + try: + ret_dict = json.loads(ret_json_str) + parse_success = bool(isinstance(ret_dict, dict)) + except json.JSONDecodeError: + parse_success = False + if ret_json_str is None or not parse_success: + current_user_prompt = "Your response didn't follow the instruction might be wrong json format. Try again." + else: + for name, formulation_and_description in ret_dict.items(): + if name not in model_dict: + model_dict[name] = formulation_and_description + if len(model_dict) == 0: + current_user_prompt = "No model extracted. Please try again." + else: + break + + + RDAgentLog().info(f"已经完成{len(model_dict)}个模型的提取") + + return model_dict + +def merge_file_to_model_dict_to_model_dict( + file_to_model_dict: dict[str, dict], +) -> dict: + model_dict = {} + for file_name in file_to_model_dict: + for model_name in file_to_model_dict[file_name]: + model_dict.setdefault(model_name, []) + model_dict[model_name].append(file_to_model_dict[file_name][model_name]) + + model_dict_simple_deduplication = {} + for model_name in model_dict: + if len(model_dict[model_name]) > 1: + model_dict_simple_deduplication[model_name] = max( + model_dict[model_name], + key=lambda x: len(x["formulation"]), + ) + else: + model_dict_simple_deduplication[model_name] = model_dict[model_name][0] + return model_dict_simple_deduplication + + +def extract_model_from_docs(docs_dict): + model_dict = {} + for doc_name, doc_content in docs_dict.items(): + model_dict[doc_name] = extract_model_from_doc(doc_content) + return model_dict + + +class ModelImplementationTaskLoaderFromPDFfiles(TaskLoader): + def load(self, file_or_folder_path: Path) -> dict: + docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path)) # dict{file_path:content} + 
model_dict = extract_model_from_docs(docs_dict) # dict{file_name: dict{model_name: dict{description, formulation, variables}}} + model_dict = merge_file_to_model_dict_to_model_dict(model_dict) # dict {model_name: dict{description, formulation, variables}} + return ModelImplementationTaskLoaderFromDict().load(model_dict) + +def main(path="../test_doc"): + doc_dict = load_and_process_pdfs_by_langchain(Path(path)) + print(doc_dict.keys()) # if you run code like "python -u", the print content will be truncated + + +import fire +if __name__ == "__main__": + fire.Fire(main) diff --git a/requirements.txt b/requirements.txt index d9b5a1ca4..3d30809fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,19 @@ typer[all] cython scipy python-Levenshtein +scikit-learn +filelock + +# PDF related +pypdf +azure-core +azure-ai-formrecognizer + +# factor implementations +tables + +# azure identity related +azure.identity # CI Fix Tool tree-sitter-python @@ -12,3 +25,5 @@ tree-sitter # Jupyter related jupyter + +python-dotenv