Pdf2 model task (#33)

* add needed dependency * add extract_model_and_implement pipeline * add merge_file_to_model_dict_to_model_dict * implement `rdagent\app\model_implementation\eval.py` * Running benchmark * refine import --------- Co-authored-by: Young <[email protected]>
microsoft · Jun 30, 2024 · 40b8250 · 40b8250
1 parent 5ff0b66
commit 40b8250
Show file tree

Hide file tree

Showing 14 changed files with 345 additions and 46 deletions.
diff --git a/rdagent/app/model_extraction_and_implementation/model_extraction_and_implementation.py b/rdagent/app/model_extraction_and_implementation/model_extraction_and_implementation.py
@@ -0,0 +1,16 @@
+# %%
+from dotenv import load_dotenv
+from rdagent.components.task_implementation.model_implementation.one_shot import ModelTaskGen
+from rdagent.components.task_implementation.model_implementation.task_extraction import ModelImplementationTaskLoaderFromPDFfiles
+
+
+
+def extract_models_and_implement(report_file_path: str = "../test_doc"):
+    """Extract model tasks from PDF reports and generate an implementation for each.
+
+    Parameters
+    ----------
+    report_file_path : str
+        Location passed to ``ModelImplementationTaskLoaderFromPDFfiles().load``.
+
+    Returns
+    -------
+    The value produced by ``ModelTaskGen().generate`` for the extracted tasks.
+    (The previous ``-> None`` annotation contradicted this return.)
+    """
+    model_tasks = ModelImplementationTaskLoaderFromPDFfiles().load(report_file_path)
+    return ModelTaskGen().generate(model_tasks)
+
+
+import fire
+if __name__ == "__main__":
+    fire.Fire(extract_models_and_implement)
diff --git a/rdagent/app/model_implementation/README.md b/rdagent/app/model_implementation/README.md
@@ -17,8 +17,9 @@ pip3 install torch_geometric
 ## Task Extraction
 From paper to task.
 ```bash
-python rdagent/app/model_implementation/task_extraction.py
+# python rdagent/app/model_implementation/task_extraction.py
 # It may be based on rdagent/document_reader/document_reader.py
+python rdagent/components/task_implementation/model_implementation/task_extraction.py ./PaperImpBench/raw_paper/
 ```
 
 ## Complete workflow
@@ -30,10 +31,14 @@ From paper to implementation
 
 ## Paper benchmark
 ```bash
+# TODO: it does not work well now.
 python rdagent/app/model_implementation/eval.py
+```
 
 TODO:
-- Is evaluation reasonable
-```
+- Create reasonable benchmark
+  - with uniform input
+  - manually create task
+- Create reasonable evaluation metrics
 
 ## Evolving
diff --git a/rdagent/app/model_implementation/eval.py b/rdagent/app/model_implementation/eval.py
@@ -2,27 +2,32 @@
 
 DIRNAME = Path(__file__).absolute().resolve().parent
 
-from rdagent.model_implementation.benchmark.eval import ModelImpValEval
-from rdagent.model_implementation.one_shot import ModelTaskGen
-from rdagent.model_implementation.task import ModelImpLoader, ModelTaskLoderJson
+from rdagent.components.task_implementation.model_implementation.benchmark.eval import ModelImpValEval
+from rdagent.components.task_implementation.model_implementation.one_shot import ModelTaskGen
+from rdagent.components.task_implementation.model_implementation.task import  ModelImpLoader, ModelTaskLoderJson
 
-mtl = ModelTaskLoderJson("TODO: A Path to json")
+
+bench_folder = DIRNAME.parent.parent / "components" / "task_implementation" / "model_implementation" / "benchmark"
+mtl = ModelTaskLoderJson(str(bench_folder / "model_dict.json"))
 
 task_l = mtl.load()
 
+task_l = [t for t in task_l if t.key == "A-DGN"]  # FIXME: other models do not work well
+
 mtg = ModelTaskGen()
 
 impl_l = mtg.generate(task_l)
 
 # TODO: Align it with the benchmark framework after @wenjun refines the evaluation part.
 # Currently, we just handcraft a workflow for fast evaluation.
 
-mil = ModelImpLoader(DIRNAME.parent.parent / "model_implementation" / "benchmark" / "gt_code")
+mil = ModelImpLoader(bench_folder / "gt_code")
 
 mie = ModelImpValEval()
 # Evaluation:
 eval_l = []
 for impl in impl_l:
+    print(impl.target_task)
     gt_impl = mil.load(impl.target_task)
     eval_l.append(mie.evaluate(gt_impl, impl))
 

diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/eval.py b/rdagent/components/task_implementation/model_implementation/benchmark/eval.py
@@ -1,6 +1,6 @@
 # TODO: inherit from the benchmark base class
 import torch
-from rdagent.model_implementation.task import ModelTaskImpl
+from rdagent.components.task_implementation.model_implementation.task import ModelTaskImpl
 
 
 def get_data_conf(init_val):

diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/dirgnn.py
@@ -74,6 +74,9 @@ def __repr__(self) -> str:
         return f"{self.__class__.__name__}({self.conv_in}, alpha={self.alpha})"
 
 
+model_cls = DirGNNConv
+
+
 if __name__ == "__main__":
     node_features = torch.load("node_features.pt")
     edge_index = torch.load("edge_index.pt")

diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/gpsconv.py
@@ -183,6 +183,9 @@ def __repr__(self) -> str:
         )
 
 
+model_cls = GPSConv
+
+
 if __name__ == "__main__":
     node_features = torch.load("node_features.pt")
     edge_index = torch.load("edge_index.pt")

diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/linkx.py
@@ -166,6 +166,7 @@ def __repr__(self) -> str:
             f"out_channels={self.out_channels})"
         )
 
+model_cls = LINKX
 
 if __name__ == "__main__":
     node_features = torch.load("node_features.pt")

diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/pmlp.py
@@ -99,17 +99,14 @@ def __repr__(self) -> str:
         return f"{self.__class__.__name__}({self.in_channels}, " f"{self.out_channels}, num_layers={self.num_layers})"
 
 
+model_cls = PMLP
+
 if __name__ == "__main__":
     node_features = torch.load("node_features.pt")
     edge_index = torch.load("edge_index.pt")
 
     # Model instantiation and forward pass
-    model = PMLP(
-        in_channels=node_features.size(-1),
-        hidden_channels=node_features.size(-1),
-        out_channels=node_features.size(-1),
-        num_layers=1,
-    )
+    model = PMLP(in_channels=node_features.size(-1), hidden_channels=node_features.size(-1), out_channels=node_features.size(-1), num_layers=1)
     output = model(node_features, edge_index)
 
     # Save output to a file

diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py b/rdagent/components/task_implementation/model_implementation/benchmark/gt_code/visnet.py
@@ -1176,6 +1176,9 @@ def forward(
         return y, None
 
 
+model_cls = ViSNet
+
+
 if __name__ == "__main__":
     node_features = torch.load("node_features.pt")
     edge_index = torch.load("edge_index.pt")

diff --git a/rdagent/components/task_implementation/model_implementation/benchmark/model_dict.json b/rdagent/components/task_implementation/model_implementation/benchmark/model_dict.json
@@ -0,0 +1,74 @@
+{
+    "PMLP": {
+        "description": "`PMLP` is identical to a standard MLP during training, but then adopts a GNN architecture (add message passing) during testing.",
+        "formulation": "\\hat{y}_u = \\psi(\\text{MP}(\\{h^{(l-1)}_v\\}_{v \\in N_u \\cup \\{u\\}}))",
+        "variables": {
+            "\\hat{y}_u": "The predicted output for node u",
+            "\\psi": "A function representing the feed-forward process, consisting of a linear feature transformation followed by a non-linear activation",
+            "\\text{MP}": "Message Passing operation that aggregates information from neighboring nodes",
+            "h^{(l-1)}_v": "The feature representation of node v at layer (l-1)",
+            "N_u": "The set of neighboring nodes centered at node u"
+        },
+        "key": "pmlp"
+    },
+    "LINKX": {
+        "description": "A scalable model for node classification that separately embeds adjacency and node features, combines them with MLPs, and applies simple transformations.",
+        "formulation": "Y = MLP_f(\\sigma(W[h_A; h_X] + h_A + h_X))",
+        "variables": {
+            "Y": "The output predictions",
+            "\\sigma": "Non-linear activation function",
+            "W": "Learned weight matrix",
+            "h_A": "Embedding of the adjacency matrix",
+            "h_X": "Embedding of the node features",
+            "MLP_f": "Final multilayer perceptron for prediction"
+        },
+        "key": "linkx"
+    },
+    "GPSConv": {
+        "description": "A scalable and powerful graph transformer with linear complexity, capable of handling large graphs with state-of-the-art results across diverse benchmarks.",
+        "formulation": "X^{(l+1)} = \\text{MPNN}^{(l)}(X^{(l)}, A) + \\text{GlobalAttn}^{(l)}(X^{(l)})",
+        "variables": {
+            "X^{(l)}": "The node features at layer l",
+            "A": "The adjacency matrix of the graph",
+            "X^{(l+1)}": "The updated node features at layer l+1",
+            "MPNN^{(l)}": "The message-passing neural network function at layer l",
+            "GlobalAttn^{(l)}": "The global attention function at layer l"
+        },
+        "key": "gpsconv"
+    },
+    "ViSNet": {
+        "description": "ViSNet is an equivariant geometry-enhanced graph neural network designed for efficient molecular modeling. It utilizes a Vector-Scalar interactive message passing mechanism to extract and utilize geometric features with low computational costs, achieving state-of-the-art performance on multiple molecular dynamics benchmarks.",
+        "formulation": "\\text{ViSNet}(G) = \\sum_{u \\in G} f(\\mathbf{h}_u, \\mathbf{e}_u, \\mathbf{v}_u)",
+        "variables": {
+            "\\mathbf{h}_u": "Node embedding for atom u",
+            "\\mathbf{e}_u": "Edge embedding associated with atom u",
+            "\\mathbf{v}_u": "Direction unit vector for atom u"
+        },
+        "key": "visnet"
+    },
+    "Dir-GNN": {
+        "description": "A framework for deep learning on directed graphs that extends MPNNs to incorporate edge directionality.",
+        "formulation": "x^{(k)}_i = COM^{(k)}\\left(x^{(k-1)}_i, m^{(k)}_{i,\\leftarrow}, m^{(k)}_{i,\\rightarrow}\\right)",
+        "variables": {
+            "x^{(k)}_i": "The feature representation of node i at layer k",
+            "m^{(k)}_{i,\\leftarrow}": "The aggregated incoming messages to node i at layer k",
+            "m^{(k)}_{i,\\rightarrow}": "The aggregated outgoing messages from node i at layer k"
+        },
+        "key": "dirgnn"
+    },
+    "A-DGN": {
+        "description": "A framework for stable and non-dissipative DGN design, conceived through the lens of ordinary differential equations (ODEs). It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.",
+        "formulation": "\\frac{\\partial x_u(t)}{\\partial t} = \\sigma(W^T x_u(t) + \\Phi(X(t), N_u) + b)",
+        "variables": {
+            "x_u(t)": "The state of node u at time t",
+            "\\frac{\\partial x_u(t)}{\\partial t}": "The rate of change of the state of node u at time t",
+            "\\sigma": "A monotonically non-decreasing activation function",
+            "W": "A weight matrix",
+            "b": "A bias vector",
+            "\\Phi(X(t), N_u)": "The aggregation function for the states of the nodes in the neighborhood of u",
+            "X(t)": "The node feature matrix of the whole graph at time t",
+            "N_u": "The set of neighboring nodes of u"
+        },
+        "key": "A-DGN"
+    }
+}
diff --git a/rdagent/components/task_implementation/model_implementation/prompts.yaml b/rdagent/components/task_implementation/model_implementation/prompts.yaml
@@ -0,0 +1,8 @@
+extract_model_formulation_system: |-
+    Offer a description of the model proposed in this paper and write a LaTeX formula of the model together with its variables. The format should be like "        "Model Name": {
+                "description": "",
+                "formulation": "",
+                "variables": {
+                    "\\hat{y}_u": "The predicted output for node u"
+                }
+            }"
+    Content in this format should begin with ```json and end with ```, and the content should be valid JSON.
diff --git a/rdagent/components/task_implementation/model_implementation/task.py b/rdagent/components/task_implementation/model_implementation/task.py
@@ -16,6 +16,7 @@
     TaskLoader,
 )
 from rdagent.utils import get_module_by_module_path
+import json
 
 
 class ModelImplTask(BaseTask):
@@ -42,53 +43,104 @@ def __init__(
         self.formulation = formulation
         self.variables = variables
         self.key = key
+    def get_information(self):
+        """Render name, description, formulation, variables and key as ``label: value`` lines."""
+        labelled_fields = (
+            ("name", self.name),
+            ("description", self.description),
+            ("formulation", self.formulation),
+            ("variables", self.variables),
+            ("key", self.key),
+        )
+        # Every line, including the last one, is terminated by a newline.
+        return "".join(f"{label}: {value}\n" for label, value in labelled_fields)
+
+    @classmethod
+    def from_dict(cls, dict):
+        """Alternate constructor: build a task from a mapping of ``__init__`` keyword arguments.
+
+        Implemented as a classmethod so that calling it on a subclass yields an
+        instance of that subclass instead of always a ``ModelImplTask``.
+        The parameter keeps its original name ``dict`` (shadowing the builtin
+        only inside this method) so existing keyword callers keep working.
+        """
+        return cls(**dict)
+
+    def __repr__(self) -> str:
+        """Short debug form: the concrete class name and the task name inside angle brackets."""
+        cls_name = self.__class__.__name__
+        return "<{} {}>".format(cls_name, self.name)
 
 
 class ModelTaskLoderJson(TaskLoader):
+    # def __init__(self, json_uri: str, select_model: Optional[str] = None) -> None:
+    #     super().__init__()
+    #     self.json_uri = json_uri
+    #     self.select_model = 'A-DGN'
+
+    # def load(self, *argT, **kwargs) -> Sequence[ModelImplTask]:
+    #     # json is supposed to be in the format of {model_name: dict{model_data}}
+    #     model_dict = json.load(open(self.json_uri, "r"))
+    #     if self.select_model is not None:
+    #         assert self.select_model in model_dict
+    #         model_name = self.select_model
+    #         model_data = model_dict[self.select_model]
+    #     else:
+    #         model_name, model_data = list(model_dict.items())[0]
+
+    #     model_impl_task = ModelImplTask(
+    #         name=model_name,
+    #         description=model_data["description"],
+    #         formulation=model_data["formulation"],
+    #         variables=model_data["variables"],
+    #         key=model_name
+    #     )
+
+    #     return [model_impl_task]
+
     def __init__(self, json_uri: str) -> None:
+        """Store ``json_uri``; the file itself is only opened when ``load`` is called."""
         super().__init__()
-        # TODO: the json should be loaded from URI.
         self.json_uri = json_uri
 
     def load(self, *argT, **kwargs) -> Sequence[ModelImplTask]:
+        """Read the JSON file at ``self.json_uri`` and build one task per entry.
+
+        The file must map each model name to a dict providing the
+        ``description``, ``formulation``, ``variables`` and ``key`` fields.
+        ``*argT`` and ``**kwargs`` are accepted but not used.
+
+        Returns one ``ModelImplTask`` per top-level entry of the file.
+        Raises ``KeyError`` if an entry lacks one of the required fields.
+        """
-        # TODO: we should load the tasks from json;
+        # json is supposed to be in the format of {model_name: dict{model_data}}
+        # Open through a context manager so the file handle is always closed.
+        with open(self.json_uri, "r") as json_file:
+            model_dict = json.load(json_file)
 
-        # this version does not align with the right answer
+        # FIXME: the model in the json file is not right due to extraction error
+        #       We should fix them case by case in the future
+        #
         # formula_info = {
         #     "name": "Anti-Symmetric Deep Graph Network (A-DGN)",
         #     "description": "A framework for stable and non-dissipative DGN design. It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.",
-        #     "formulation": "x_u^{(l)} = x_u^{(l-1)} + \\epsilon \\sigma \\left( W^T x_u^{(l-1)} + \\Phi(X^{(l-1)}, N_u) + b \\right)",
+        #     "formulation": r"\mathbf{x}^{\prime}_i = \mathbf{x}_i + \epsilon \cdot \sigma \left( (\mathbf{W}-\mathbf{W}^T-\gamma \mathbf{I}) \mathbf{x}_i + \Phi(\mathbf{X}, \mathcal{N}_i) + \mathbf{b}\right),",
         #     "variables": {
-        #         "x_u^{(l)}": "The state of node u at layer l",
-        #         "\\epsilon": "The step size in the Euler discretization",
-        #         "\\sigma": "A monotonically non-decreasing activation function",
-        #         "W": "An anti-symmetric weight matrix",
-        #         "X^{(l-1)}": "The node feature matrix at layer l-1",
-        #         "N_u": "The set of neighbors of node u",
-        #         "b": "A bias vector",
+        #         r"\mathbf{x}_i": "The state of node i at previous layer",
+        #         r"\epsilon": "The step size in the Euler discretization",
+        #         r"\sigma": "A monotonically non-decreasing activation function",
+        #         r"\Phi": "A graph convolutional operator",
+        #         r"W": "An anti-symmetric weight matrix",
+        #         r"\mathbf{x}^{\prime}_i": "The node feature matrix at layer l-1",
+        #         r"\mathcal{N}_i": "The set of neighbors of node u",
+        #         r"\mathbf{b}": "A bias vector",
         #     },
         #     "key": "A-DGN",
         # }
-        formula_info = {
-            "name": "Anti-Symmetric Deep Graph Network (A-DGN)",
-            "description": "A framework for stable and non-dissipative DGN design. It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.",
-            "formulation": r"\mathbf{x}^{\prime}_i = \mathbf{x}_i + \epsilon \cdot \sigma \left( (\mathbf{W}-\mathbf{W}^T-\gamma \mathbf{I}) \mathbf{x}_i + \Phi(\mathbf{X}, \mathcal{N}_i) + \mathbf{b}\right),",
-            "variables": {
-                r"\mathbf{x}_i": "The state of node i at previous layer",
-                r"\epsilon": "The step size in the Euler discretization",
-                r"\sigma": "A monotonically non-decreasing activation function",
-                r"\Phi": "A graph convolutional operator",
-                r"W": "An anti-symmetric weight matrix",
-                r"\mathbf{x}^{\prime}_i": "The node feature matrix at layer l-1",
-                r"\mathcal{N}_i": "The set of neighbors of node u",
-                r"\mathbf{b}": "A bias vector",
-            },
-            "key": "A-DGN",
-        }
-        return [ModelImplTask(**formula_info)]
-
-
-class ModelTaskImpl(TaskImplementation):
+        # The task name is the top-level JSON key; the task key comes from the entry itself.
+        return [
+            ModelImplTask(
+                name=model_name,
+                description=model_data["description"],
+                formulation=model_data["formulation"],
+                variables=model_data["variables"],
+                key=model_data["key"],
+            )
+            for model_name, model_data in model_dict.items()
+        ]
+
+class ModelImplementationTaskLoaderFromDict(TaskLoader):
+    """Task loader for model descriptions that are already held in memory as a dict."""
+
+    def load(self, model_dict: dict) -> list:
+        """Turn every ``{model_name: model_data}`` entry into a ``ModelImplTask``.
+
+        The model name is used both as the task name and as the task key;
+        ``model_data`` must supply ``description``, ``formulation`` and ``variables``.
+        """
+        return [
+            ModelImplTask(
+                name=name,
+                description=spec["description"],
+                formulation=spec["formulation"],
+                variables=spec["variables"],
+                key=name,
+            )
+            for name, spec in model_dict.items()
+        ]
+
+
+class ModelTaskImpl(FBTaskImplementation):
     """
     It is a Pytorch model implementation task;
     All the things are placed in a folder.