From a0385ea593768e63816ee972e5f7c989b8f04d52 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Wed, 10 Jul 2024 15:45:43 +0800 Subject: [PATCH] Implement model (and some factor) coder with evolving (#52) * store code into FBImplementation * fix path related bugs * fix a bug * fix factor related small bugs * re-submit all model related code * new code to model coder * finish the model evolving code --------- Co-authored-by: xuyang1 --- rdagent/app/model_implementation/eval.py | 23 +- .../coder/factor_coder/CoSTEER/__init__.py | 1 - .../factor_coder/CoSTEER/evolving_strategy.py | 17 +- .../coder/factor_coder/CoSTEER/scheduler.py | 2 +- .../components/coder/factor_coder/factor.py | 34 +- .../coder/factor_coder/prompts.yaml | 16 +- .../coder/model_coder/CoSTEER/__init__.py | 86 +++++ .../coder/model_coder/CoSTEER/evaluators.py | 333 ++++++++++++++++++ .../model_coder/CoSTEER/evolvable_subjects.py | 29 ++ .../model_coder/CoSTEER/evolving_strategy.py | 145 ++++++++ .../CoSTEER/knowledge_management.py | 167 +++++++++ .../coder/model_coder/benchmark/eval.py | 7 +- .../model_coder/benchmark/model_dict.json | 20 +- rdagent/components/coder/model_coder/conf.py | 23 +- .../components/coder/model_coder/evaluator.py | 59 ---- rdagent/components/coder/model_coder/model.py | 198 +++-------- .../coder/model_coder/one_shot/__init__.py | 1 - .../coder/model_coder/one_shot/prompt.yaml | 21 +- .../components/coder/model_coder/prompts.yaml | 129 ++++++- .../coder/model_coder/task_loader.py | 2 +- rdagent/components/loader/task_loader.py | 88 ++++- rdagent/core/experiment.py | 38 +- .../qlib/experiment/model_experiment.py | 43 ++- .../scenarios/qlib/experiment/prompts.yaml | 64 +++- 24 files changed, 1249 insertions(+), 297 deletions(-) create mode 100644 rdagent/components/coder/model_coder/CoSTEER/__init__.py create mode 100644 rdagent/components/coder/model_coder/CoSTEER/evaluators.py create mode 100644 rdagent/components/coder/model_coder/CoSTEER/evolvable_subjects.py create mode 100644 rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py create mode 100644 rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py delete mode 100644 rdagent/components/coder/model_coder/evaluator.py diff --git a/rdagent/app/model_implementation/eval.py b/rdagent/app/model_implementation/eval.py index 20fe0268..8d69a047 100644 --- a/rdagent/app/model_implementation/eval.py +++ b/rdagent/app/model_implementation/eval.py @@ -1,24 +1,29 @@ from pathlib import Path +from rdagent.components.coder.model_coder.CoSTEER import ModelCoSTEER +from rdagent.components.loader.task_loader import ModelImpLoader, ModelTaskLoaderJson +from rdagent.scenarios.qlib.experiment.model_experiment import ( + QlibModelExperiment, + QlibModelScenario, +) + DIRNAME = Path(__file__).absolute().resolve().parent from rdagent.components.coder.model_coder.benchmark.eval import ModelImpValEval -from rdagent.components.coder.model_coder.model import ( - ModelImpLoader, - ModelTaskLoaderJson, -) from rdagent.components.coder.model_coder.one_shot import ModelCodeWriter -bench_folder = DIRNAME.parent.parent / "components" / "task_implementation" / "model_implementation" / "benchmark" +bench_folder = DIRNAME.parent.parent / "components" / "coder" / "model_coder" / "benchmark" mtl = ModelTaskLoaderJson(str(bench_folder / "model_dict.json")) task_l = mtl.load() -task_l = [t for t in task_l if t.key == "A-DGN"] # FIXME: other models does not work well +task_l = [t for t in task_l if t.name == "A-DGN"] # FIXME: other models does not work well -mtg = 
ModelCodeWriter() +model_experiment = QlibModelExperiment(sub_tasks=task_l) +# mtg = ModelCodeWriter(scen=QlibModelScenario()) +mtg = ModelCoSTEER(scen=QlibModelScenario()) -impl_l = mtg.generate(task_l) +model_experiment = mtg.generate(model_experiment) # TODO: Align it with the benchmark framework after @wenjun's refine the evaluation part. # Currently, we just handcraft a workflow for fast evaluation. @@ -28,7 +33,7 @@ mie = ModelImpValEval() # Evaluation: eval_l = [] -for impl in impl_l: +for impl in model_experiment.sub_implementations: print(impl.target_task) gt_impl = mil.load(impl.target_task) eval_l.append(mie.evaluate(gt_impl, impl)) diff --git a/rdagent/components/coder/factor_coder/CoSTEER/__init__.py b/rdagent/components/coder/factor_coder/CoSTEER/__init__.py index f48b1e47..e3a81409 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/__init__.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/__init__.py @@ -98,5 +98,4 @@ def generate(self, exp: FactorExperiment) -> FactorExperiment: if self.new_knowledge_base_path is not None: pickle.dump(factor_knowledge_base, open(self.new_knowledge_base_path, "wb")) self.knowledge_base = factor_knowledge_base - self.latest_factor_implementations = exp.sub_tasks return factor_experiment diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py index 4d3fa2f7..fcda9ad8 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py @@ -145,7 +145,7 @@ def implement_one_factor( implement_prompts["evolving_strategy_factor_implementation_v1_system"], ) .render( - data_info=get_data_folder_intro(), + scenario=self.scen.get_scenario_all_desc(), queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, ) ) @@ -154,7 +154,7 @@ def implement_one_factor( ) queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge - while True: + for _ in range(10): # max attempt to reduce the length of user_prompt user_prompt = ( Environment(undefined=StrictUndefined) .from_string( @@ -163,6 +163,7 @@ def implement_one_factor( .render( factor_information_str=factor_information_str, queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render, + queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, ) .strip("\n") ) @@ -187,8 +188,9 @@ def implement_one_factor( # ast.parse(code) factor_implementation = FileBasedFactorImplementation( target_task, - code, ) + factor_implementation.prepare() + factor_implementation.inject_code(**{"factor.py": code}) return factor_implementation @@ -255,7 +257,7 @@ def implement_one_factor( queried_similar_error_knowledge_to_render = queried_similar_error_knowledge error_summary_critics = "" # 动态地防止prompt超长 - while True: + for _ in range(10): # max attempt to reduce the length of user_prompt # 总结error(可选) if ( error_summary @@ -266,6 +268,7 @@ def implement_one_factor( Environment(undefined=StrictUndefined) .from_string(implement_prompts["evolving_strategy_error_summary_v2_system"]) .render( + scenario=self.scen.get_scenario_all_desc(), factor_information_str=target_factor_task_information, code_and_feedback=queried_former_failed_knowledge_to_render[ -1 @@ -276,7 +279,7 @@ def implement_one_factor( session_summary = APIBackend(use_chat_cache=False).build_chat_session( session_system_prompt=error_summary_system_prompt, ) - while True: + for _ in range(10): # max 
attempt to reduce the length of error_summary_user_prompt error_summary_user_prompt = ( Environment(undefined=StrictUndefined) .from_string(implement_prompts["evolving_strategy_error_summary_v2_user"]) @@ -332,5 +335,7 @@ def implement_one_factor( json_mode=True, ) code = json.loads(response)["code"] - factor_implementation = FileBasedFactorImplementation(target_task, code) + factor_implementation = FileBasedFactorImplementation(target_task) + factor_implementation.prepare() + factor_implementation.inject_code(**{"factor.py": code}) return factor_implementation diff --git a/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py b/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py index 90296b84..a0f56dca 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py @@ -53,7 +53,7 @@ def LLMSelect( ) ) - while True: + for _ in range(10): # max attempt to reduce the length of user_prompt user_prompt = ( Environment(undefined=StrictUndefined) .from_string( diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index 72d01995..4932a9ae 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -67,19 +67,15 @@ class FileBasedFactorImplementation(FBImplementation): def __init__( self, - target_task: FactorTask, - code, + *args, executed_factor_value_dataframe=None, raise_exception=False, + **kwargs, ) -> None: - super().__init__(target_task) - self.code = code + super().__init__(*args, **kwargs) self.executed_factor_value_dataframe = executed_factor_value_dataframe self.logger = RDAgentLog() self.raise_exception = raise_exception - self.workspace_path = Path( - FACTOR_IMPLEMENT_SETTINGS.file_based_execution_workspace, - ) / str(uuid.uuid4()) @staticmethod def link_data_to_workspace(data_path: Path, workspace_path: Path): @@ -98,8 +94,10 @@ def execute_desc(self): raise NotImplementedError def prepare(self, *args, **kwargs): - # TODO move the prepare part code in execute into here - return super().prepare(*args, **kwargs) + self.workspace_path = Path( + FACTOR_IMPLEMENT_SETTINGS.file_based_execution_workspace, + ) / str(uuid.uuid4()) + self.workspace_path.mkdir(exist_ok=True, parents=True) def execute(self, store_result: bool = False) -> Tuple[str, pd.DataFrame]: """ @@ -114,7 +112,7 @@ def execute(self, store_result: bool = False) -> Tuple[str, pd.DataFrame]: parameters: store_result: if True, store the factor value in the instance variable, this feature is to be used in the gt implementation to avoid multiple execution on the same gt implementation """ - if self.code is None: + if self.code_dict is None or "factor.py" not in self.code_dict: if self.raise_exception: raise CodeFormatException(self.FB_CODE_NOT_SET) else: @@ -123,7 +121,7 @@ def execute(self, store_result: bool = False) -> Tuple[str, pd.DataFrame]: with FileLock(self.workspace_path / "execution.lock"): if FACTOR_IMPLEMENT_SETTINGS.enable_execution_cache: # NOTE: cache the result for the same code - target_file_name = md5_hash(self.code) + target_file_name = md5_hash(self.code_dict["factor.py"]) cache_file_path = ( Path(FACTOR_IMPLEMENT_SETTINGS.implementation_execution_cache_location) / f"{target_file_name}.pkl" ) @@ -142,10 +140,9 @@ def execute(self, store_result: bool = False) -> Tuple[str, pd.DataFrame]: source_data_path = Path( FACTOR_IMPLEMENT_SETTINGS.file_based_execution_data_folder, ) - 
self.workspace_path.mkdir(exist_ok=True, parents=True)
+        source_data_path.mkdir(exist_ok=True, parents=True)
 
-        code_path = self.workspace_path / f"{self.target_task.factor_name}.py"
-        code_path.write_text(self.code)
+        code_path = self.workspace_path / "factor.py"
 
         self.link_data_to_workspace(source_data_path, self.workspace_path)
 
@@ -212,10 +209,11 @@ def __repr__(self) -> str:
     @staticmethod
     def from_folder(task: FactorTask, path: Union[str, Path], **kwargs):
         path = Path(path)
-        factor_path = (path / task.factor_name).with_suffix(".py")
-        with factor_path.open("r") as f:
-            code = f.read()
-        return FileBasedFactorImplementation(task, code=code, **kwargs)
+        code_dict = {}
+        for file_path in path.iterdir():
+            if file_path.suffix == ".py":
+                code_dict[file_path.name] = file_path.read_text()
+        return FileBasedFactorImplementation(target_task=task, code_dict=code_dict, **kwargs)
 
 
 class FactorExperiment(Experiment[FactorTask, FileBasedFactorImplementation]): ...
diff --git a/rdagent/components/coder/factor_coder/prompts.yaml b/rdagent/components/coder/factor_coder/prompts.yaml
index 486b7ef8..53d62fca 100644
--- a/rdagent/components/coder/factor_coder/prompts.yaml
+++ b/rdagent/components/coder/factor_coder/prompts.yaml
@@ -13,6 +13,8 @@ evaluator_code_feedback_v1_system: |-
     If the ground truth code is provided, your critic should only consider checking whether the user's code is align with the ground truth code since the ground truth is definitely correct.
     If the ground truth code is not provided, your critic should consider checking whether the user's code is reasonable and correct.
 
+    Notice that your critics are not for the user to debug the code; they are sent to the coding agent to correct the code, so do not give follow-up items for the user to check, such as "Please check the code line XXX".
+
     You should provide the suggestion to each of your critic to help the user improve the code. Please response the critic in the following format. Here is an example structure for the output:
     critic 1: The critic message to critic 1
     critic 2: The critic message to critic 2
@@ -47,12 +49,10 @@ evolving_strategy_factor_implementation_v1_system: |-
 
     {% if queried_former_failed_knowledge|length != 0 %}
     --------------Your former latest attempt:---------------
-    {% for former_failed_knowledge in queried_former_failed_knowledge %}
-    =====Code to implementation {{ loop.index }}=====
-    {{ former_failed_knowledge.implementation.code }}
-    =====Feedback to implementation {{ loop.index }}=====
-    {{ former_failed_knowledge.feedback }}
-    {% endfor %}
+    =====Code to the former implementation=====
+    {{ queried_former_failed_knowledge[-1].implementation.code }}
+    =====Feedback to the former implementation=====
+    {{ queried_former_failed_knowledge[-1].feedback }}
     {% endif %}
 
     Please response the code in the following json format. 
Here is an example structure for the JSON output: @@ -118,7 +118,9 @@ evolving_strategy_factor_implementation_v2_user: |- {% endif %} evolving_strategy_error_summary_v2_system: |- - You are doing the following task: + User is trying to implement some factors in the following scenario: + {{ scenario }} + User is doing the following task: {{factor_information_str}} You have written some code but it meets errors like the following: diff --git a/rdagent/components/coder/model_coder/CoSTEER/__init__.py b/rdagent/components/coder/model_coder/CoSTEER/__init__.py new file mode 100644 index 00000000..be9ede38 --- /dev/null +++ b/rdagent/components/coder/model_coder/CoSTEER/__init__.py @@ -0,0 +1,86 @@ +import pickle +from pathlib import Path + +from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS +from rdagent.components.coder.model_coder.CoSTEER.evaluators import ( + ModelCoderMultiEvaluator, +) +from rdagent.components.coder.model_coder.CoSTEER.evolvable_subjects import ( + ModelEvolvingItem, +) +from rdagent.components.coder.model_coder.CoSTEER.evolving_strategy import ( + ModelCoderEvolvingStrategy, +) +from rdagent.components.coder.model_coder.CoSTEER.knowledge_management import ( + ModelKnowledgeBase, + ModelRAGStrategy, +) +from rdagent.components.coder.model_coder.model import ModelExperiment +from rdagent.core.evolving_agent import RAGEvoAgent +from rdagent.core.task_generator import TaskGenerator + + +class ModelCoSTEER(TaskGenerator[ModelExperiment]): + def __init__( + self, + *args, + with_knowledge: bool = True, + with_feedback: bool = True, + knowledge_self_gen: bool = True, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + self.max_loop = MODEL_IMPL_SETTINGS.max_loop + self.knowledge_base_path = ( + Path(MODEL_IMPL_SETTINGS.knowledge_base_path) + if MODEL_IMPL_SETTINGS.knowledge_base_path is not None + else None + ) + self.new_knowledge_base_path = ( + Path(MODEL_IMPL_SETTINGS.new_knowledge_base_path) + if MODEL_IMPL_SETTINGS.new_knowledge_base_path is not None + else None + ) + self.with_knowledge = with_knowledge + self.with_feedback = with_feedback + self.knowledge_self_gen = knowledge_self_gen + self.evolving_strategy = ModelCoderEvolvingStrategy(scen=self.scen) + self.model_evaluator = ModelCoderMultiEvaluator(scen=self.scen) + + def load_or_init_knowledge_base(self, former_knowledge_base_path: Path = None, component_init_list: list = []): + if former_knowledge_base_path is not None and former_knowledge_base_path.exists(): + model_knowledge_base = pickle.load(open(former_knowledge_base_path, "rb")) + if not isinstance(model_knowledge_base, ModelKnowledgeBase): + raise ValueError("The former knowledge base is not compatible with the current version") + else: + model_knowledge_base = ModelKnowledgeBase() + + return model_knowledge_base + + def generate(self, exp: ModelExperiment) -> ModelExperiment: + # init knowledge base + model_knowledge_base = self.load_or_init_knowledge_base( + former_knowledge_base_path=self.knowledge_base_path, + component_init_list=[], + ) + # init rag method + self.rag = ModelRAGStrategy(model_knowledge_base) + + # init intermediate items + model_experiment = ModelEvolvingItem(sub_tasks=exp.sub_tasks) + + self.evolve_agent = RAGEvoAgent(max_loop=self.max_loop, evolving_strategy=self.evolving_strategy, rag=self.rag) + + model_experiment = self.evolve_agent.multistep_evolve( + model_experiment, + self.model_evaluator, + with_knowledge=self.with_knowledge, + with_feedback=self.with_feedback, + 
knowledge_self_gen=self.knowledge_self_gen, + ) + + # save new knowledge base + if self.new_knowledge_base_path is not None: + pickle.dump(model_knowledge_base, open(self.new_knowledge_base_path, "wb")) + self.knowledge_base = model_knowledge_base + return model_experiment diff --git a/rdagent/components/coder/model_coder/CoSTEER/evaluators.py b/rdagent/components/coder/model_coder/CoSTEER/evaluators.py new file mode 100644 index 00000000..15098314 --- /dev/null +++ b/rdagent/components/coder/model_coder/CoSTEER/evaluators.py @@ -0,0 +1,333 @@ +import json +import random +from pathlib import Path +from typing import List, Tuple + +import numpy as np +import torch +from jinja2 import Environment, StrictUndefined + +from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS +from rdagent.components.coder.model_coder.CoSTEER.evolvable_subjects import ( + ModelEvolvingItem, +) +from rdagent.components.coder.model_coder.model import ModelImplementation, ModelTask +from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.evaluation import Evaluator +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import Implementation, Task +from rdagent.core.log import RDAgentLog +from rdagent.core.prompts import Prompts +from rdagent.core.utils import multiprocessing_wrapper +from rdagent.oai.llm_utils import APIBackend + +evaluate_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml") + + +def shape_evaluator(prediction: torch.Tensor, target_shape: Tuple = None) -> Tuple[str, bool]: + if target_shape is None or prediction is None: + return "No output generated from the model. No shape evaluation conducted.", False + pre_shape = prediction.shape + + if pre_shape == target_shape: + return "The shape of the output is correct.", True + else: + return f"The shape of the output is incorrect. Expected {target_shape}, but got {pre_shape}.", False + + +def reshape_tensor(original_tensor, target_shape): + new_tensor = torch.zeros(target_shape) + for i, dim in enumerate(original_tensor.shape): + new_tensor = new_tensor.narrow(i, 0, dim).copy_(original_tensor) + + return new_tensor + + +def value_evaluator( + prediction: torch.Tensor, + target: torch.Tensor, +) -> Tuple[torch.Tensor, bool]: + if target is None or prediction is None: + return "No output generated from the model. No value evaluation conducted.", False + else: + # Calculate the mean absolute difference + diff = torch.mean(torch.abs(target - prediction)).item() + return ( + f"The value of the output is correct. 
The mean absolute difference is {diff}.", + diff < 0.1, + ) + + +class ModelCodeEvaluator(Evaluator): + def evaluate( + self, + target_task: Task, + implementation: Implementation, + gt_implementation: Implementation, + model_execution_feedback: str = "", + model_value_feedback: str = "", + ): + assert isinstance(target_task, ModelTask) + assert isinstance(implementation, ModelImplementation) + if gt_implementation is not None: + assert isinstance(gt_implementation, ModelImplementation) + + model_task_information = target_task.get_information() + code = implementation.code + + system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(evaluate_prompts["evaluator_code_feedback"]["system"]) + .render(scenario=self.scen.get_scenario_all_desc() if self.scen is not None else "No scenario description.") + ) + + execution_feedback_to_render = model_execution_feedback + for _ in range(10): # 10 times to split the content is enough + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + evaluate_prompts["evaluator_code_feedback"]["user"], + ) + .render( + model_information=model_task_information, + code=code, + model_execution_feedback=execution_feedback_to_render, + model_value_feedback=model_value_feedback, + gt_code=gt_implementation.code if gt_implementation else None, + ) + ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > RD_AGENT_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break + + critic_response = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False, + ) + + return critic_response, None + + +class ModelFinalEvaluator(Evaluator): + def evaluate( + self, + target_task: Task, + implementation: Implementation, + gt_implementation: Implementation, + model_execution_feedback: str, + model_value_feedback: str, + model_code_feedback: str, + ): + assert isinstance(target_task, ModelTask) + assert isinstance(implementation, ModelImplementation) + if gt_implementation is not None: + assert isinstance(gt_implementation, ModelImplementation) + + system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(evaluate_prompts["evaluator_final_feedback"]["system"]) + .render(scenario=self.scen.get_scenario_all_desc() if self.scen is not None else "No scenario description.") + ) + + execution_feedback_to_render = model_execution_feedback + + for _ in range(10): # 10 times to split the content is enough + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + evaluate_prompts["evaluator_final_feedback"]["user"], + ) + .render( + model_information=target_task.get_information(), + model_execution_feedback=execution_feedback_to_render, + model_code_feedback=model_code_feedback, + model_value_feedback=model_value_feedback, + ) + ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > RD_AGENT_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break + + final_evaluation_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True, + ), + ) + if isinstance(final_evaluation_dict["final_decision"], str) and final_evaluation_dict[ + 
"final_decision" + ].lower() in ("true", "false"): + final_evaluation_dict["final_decision"] = bool(final_evaluation_dict["final_decision"]) + return ( + final_evaluation_dict["final_feedback"], + final_evaluation_dict["final_decision"], + ) + + +class ModelCoderFeedback: + """This feedback includes all the content to the model coder""" + + def __init__( + self, + execution_feedback: str, + shape_feedback: str, + value_feedback: str, + code_feedback: str, + final_feedback: str, + final_decision: bool, + ): + self.execution_feedback: str = execution_feedback + self.shape_feedback: str = shape_feedback + self.value_feedback: str = value_feedback + self.code_feedback: str = code_feedback + self.final_feedback: str = final_feedback + self.final_decision: str = final_decision + + def __str__(self) -> str: + return f"""------------------Model Execution Feedback------------------ +{self.execution_feedback} +------------------Model Shape Feedback------------------ +{self.shape_feedback} +------------------Model Value Feedback------------------ +{self.value_feedback} +------------------Model Code Feedback------------------ +{self.code_feedback} +------------------Model Final Feedback------------------ +{self.final_feedback} +------------------Model Final Decision------------------ +This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}. +""" + + +class ModelCoderEvaluator(Evaluator): + def evaluate( + self, + target_task: Task, + implementation: Implementation, + gt_implementation: Implementation, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> ModelCoderFeedback: + target_task_information = target_task.get_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return ModelCoderFeedback( + execution_feedback="This task has failed too many times, skip implementation.", + shape_feedback="This task has failed too many times, skip implementation.", + value_feedback="This task has failed too many times, skip implementation.", + code_feedback="This task has failed too many times, skip implementation.", + final_feedback="This task has failed too many times, skip implementation.", + final_decision=False, + ) + assert isinstance(target_task, ModelTask) + + batch_size, num_features, num_timesteps = ( + random.randint(6, 10), + random.randint(6, 10), + random.randint(6, 10), + ) + input_value, param_init_value = random.random(), random.random() + + assert isinstance(implementation, ModelImplementation) + model_execution_feedback, gen_tensor = implementation.execute( + batch_size=batch_size, + num_features=num_features, + num_timesteps=num_timesteps, + input_value=input_value, + param_init_value=param_init_value, + ) + if gt_implementation is not None: + assert isinstance(gt_implementation, ModelImplementation) + _, gt_tensor = gt_implementation.execute( + batch_size=batch_size, + num_features=num_features, + num_timesteps=num_timesteps, + input_value=input_value, + param_init_value=param_init_value, + ) + else: + gt_tensor = None + + shape_feedback, shape_decision = shape_evaluator(gen_tensor, (batch_size, 1)) + value_feedback, value_decision = value_evaluator(gt_tensor, gen_tensor) + code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate( + target_task=target_task, + 
implementation=implementation, + gt_implementation=gt_implementation, + model_execution_feedback=model_execution_feedback, + model_value_feedback="\n".join([shape_feedback, value_feedback]), + ) + final_feedback, final_decision = ModelFinalEvaluator(scen=self.scen).evaluate( + target_task=target_task, + implementation=implementation, + gt_implementation=gt_implementation, + model_execution_feedback=model_execution_feedback, + model_value_feedback=value_feedback, + model_code_feedback=code_feedback, + ) + + return ModelCoderFeedback( + execution_feedback=model_execution_feedback, + shape_feedback=shape_feedback, + value_feedback=value_feedback, + code_feedback=code_feedback, + final_feedback=final_feedback, + final_decision=final_decision, + ) + + +class ModelCoderMultiEvaluator(Evaluator): + def evaluate( + self, + evo: ModelEvolvingItem, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> List[ModelCoderFeedback]: + multi_implementation_feedback = [] + + calls = [] + for index in range(len(evo.sub_tasks)): + corresponding_implementation = evo.sub_implementations[index] + corresponding_gt_implementation = ( + evo.sub_gt_implementations[index] if evo.sub_gt_implementations is not None else None + ) + calls.append( + ( + ModelCoderEvaluator(scen=self.scen).evaluate, + ( + evo.sub_tasks[index], + corresponding_implementation, + corresponding_gt_implementation, + queried_knowledge, + ), + ), + ) + multi_implementation_feedback = multiprocessing_wrapper(calls, n=MODEL_IMPL_SETTINGS.evo_multi_proc_n) + + final_decision = [ + None if single_feedback is None else single_feedback.final_decision + for single_feedback in multi_implementation_feedback + ] + RDAgentLog().info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}") + + return multi_implementation_feedback diff --git a/rdagent/components/coder/model_coder/CoSTEER/evolvable_subjects.py b/rdagent/components/coder/model_coder/CoSTEER/evolvable_subjects.py new file mode 100644 index 00000000..70e485b3 --- /dev/null +++ b/rdagent/components/coder/model_coder/CoSTEER/evolvable_subjects.py @@ -0,0 +1,29 @@ +from rdagent.components.coder.model_coder.model import ( + ModelExperiment, + ModelImplementation, + ModelTask, +) +from rdagent.core.evolving_framework import EvolvableSubjects +from rdagent.core.log import RDAgentLog + + +class ModelEvolvingItem(ModelExperiment, EvolvableSubjects): + """ + Intermediate item of model implementation. 
+ """ + + def __init__( + self, + sub_tasks: list[ModelTask], + sub_gt_implementations: list[ModelImplementation] = None, + ): + ModelExperiment.__init__(self, sub_tasks=sub_tasks) + if sub_gt_implementations is not None and len( + sub_gt_implementations, + ) != len(self.sub_tasks): + self.sub_gt_implementations = None + RDAgentLog().warning( + "The length of sub_gt_implementations is not equal to the length of sub_tasks, set sub_gt_implementations to None", + ) + else: + self.sub_gt_implementations = sub_gt_implementations diff --git a/rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py new file mode 100644 index 00000000..27ba37fb --- /dev/null +++ b/rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py @@ -0,0 +1,145 @@ +import json +from copy import deepcopy +from pathlib import Path + +from jinja2 import Environment, StrictUndefined + +from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS +from rdagent.components.coder.model_coder.CoSTEER.evolvable_subjects import ( + ModelEvolvingItem, +) +from rdagent.components.coder.model_coder.CoSTEER.knowledge_management import ( + ModelQueriedKnowledge, +) +from rdagent.components.coder.model_coder.model import ModelImplementation, ModelTask +from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.evolving_framework import EvolvingStrategy +from rdagent.core.prompts import Prompts +from rdagent.core.utils import multiprocessing_wrapper +from rdagent.oai.llm_utils import APIBackend + +coder_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml") + + +class ModelCoderEvolvingStrategy(EvolvingStrategy): + def implement_one_model( + self, + target_task: ModelTask, + queried_knowledge: ModelQueriedKnowledge = None, + ) -> ModelImplementation: + model_information_str = target_task.get_information() + + if queried_knowledge is not None and model_information_str in queried_knowledge.success_task_to_knowledge_dict: + return queried_knowledge.success_task_to_knowledge_dict[model_information_str].implementation + elif queried_knowledge is not None and model_information_str in queried_knowledge.failed_task_info_set: + return None + else: + queried_similar_successful_knowledge = ( + queried_knowledge.working_task_to_similar_successful_knowledge_dict[model_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.working_task_to_former_failed_knowledge_dict[model_information_str] + if queried_knowledge is not None + else [] + ) + + queried_former_failed_knowledge_to_render = queried_former_failed_knowledge + + system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + coder_prompts["evolving_strategy_model_coder"]["system"], + ) + .render( + scenario=self.scen.get_scenario_all_desc(), + queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, + ) + ) + + queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge + for _ in range(10): # max attempt to reduce the length of user_prompt + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + coder_prompts["evolving_strategy_model_coder"]["user"], + ) + .render( + model_information_str=model_information_str, + queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render, + queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, + ) + .strip("\n") + ) + if ( + 
APIBackend().build_messages_and_calculate_token(
+                    user_prompt=user_prompt,
+                    system_prompt=system_prompt,
+                )
+                < RD_AGENT_SETTINGS.chat_token_limit
+            ):
+                break
+            elif len(queried_former_failed_knowledge_to_render) > 1:
+                queried_former_failed_knowledge_to_render = queried_former_failed_knowledge_to_render[1:]
+            elif len(queried_similar_successful_knowledge_to_render) > 1:
+                queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge_to_render[1:]
+
+        code = json.loads(
+            APIBackend(use_chat_cache=True).build_messages_and_create_chat_completion(
+                user_prompt=user_prompt,
+                system_prompt=system_prompt,
+                json_mode=True,
+            ),
+        )["code"]
+        # ast.parse(code)
+        model_implementation = ModelImplementation(
+            target_task,
+        )
+        model_implementation.prepare()
+        model_implementation.inject_code(**{"model.py": code})
+
+        return model_implementation
+
+    def evolve(
+        self,
+        *,
+        evo: ModelEvolvingItem,
+        queried_knowledge: ModelQueriedKnowledge | None = None,
+        **kwargs,
+    ) -> ModelEvolvingItem:
+        new_evo = deepcopy(evo)
+
+        # 1. Select the models that still need to evolve
+        to_be_finished_task_index = []
+        for index, target_model_task in enumerate(new_evo.sub_tasks):
+            target_model_task_desc = target_model_task.get_information()
+            if target_model_task_desc in queried_knowledge.success_task_to_knowledge_dict:
+                new_evo.sub_implementations[index] = queried_knowledge.success_task_to_knowledge_dict[
+                    target_model_task_desc
+                ].implementation
+            elif (
+                target_model_task_desc not in queried_knowledge.success_task_to_knowledge_dict
+                and target_model_task_desc not in queried_knowledge.failed_task_info_set
+            ):
+                to_be_finished_task_index.append(index)
+
+        result = multiprocessing_wrapper(
+            [
+                (self.implement_one_model, (new_evo.sub_tasks[target_index], queried_knowledge))
+                for target_index in to_be_finished_task_index
+            ],
+            n=MODEL_IMPL_SETTINGS.evo_multi_proc_n,
+        )
+
+        for index, target_index in enumerate(to_be_finished_task_index):
+            new_evo.sub_implementations[target_index] = result[index]
+
+        # for target_index in to_be_finished_task_index:
+        #     new_evo.sub_implementations[target_index] = self.implement_one_model(
+        #         new_evo.sub_tasks[target_index], queried_knowledge
+        #     )
+
+        new_evo.corresponding_selection = to_be_finished_task_index
+
+        return new_evo
diff --git a/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py
new file mode 100644
index 00000000..7e46f136
--- /dev/null
+++ b/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py
@@ -0,0 +1,167 @@
+from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS
+from rdagent.components.coder.model_coder.CoSTEER.evaluators import ModelCoderFeedback
+from rdagent.components.coder.model_coder.model import ModelTask
+from rdagent.core.evolving_framework import (
+    EvolvableSubjects,
+    EvoStep,
+    Knowledge,
+    KnowledgeBase,
+    QueriedKnowledge,
+    RAGStrategy,
+)
+from rdagent.core.experiment import Implementation
+from rdagent.oai.llm_utils import calculate_embedding_distance_between_str_list
+
+
+class ModelKnowledge(Knowledge):
+    def __init__(
+        self,
+        target_task: ModelTask,
+        implementation: Implementation,
+        feedback: ModelCoderFeedback,
+    ) -> None:
+        """
+        Initialize a ModelKnowledge object. The ModelKnowledge object is used to store a model implementation without the ground truth code and value.
+
+        Args:
+            target_task (ModelTask): The model task this knowledge is about.
+            implementation (Implementation): The implementation produced for the task.
+            feedback (ModelCoderFeedback): The evaluation feedback attached to the implementation. 
+ + Returns: + None + """ + self.target_task = target_task + self.implementation = implementation + self.feedback = feedback + + def get_implementation_and_feedback_str(self) -> str: + return f"""------------------Model implementation code:------------------ +{self.implementation.code} +------------------Model implementation feedback:------------------ +{self.feedback!s} +""" + + +class ModelQueriedKnowledge(QueriedKnowledge): + def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_set: set = set()) -> None: + self.success_task_to_knowledge_dict = success_task_to_knowledge_dict + self.failed_task_info_set = failed_task_info_set + self.working_task_to_former_failed_knowledge_dict = dict() + self.working_task_to_similar_successful_knowledge_dict = dict() + + +class ModelKnowledgeBase(KnowledgeBase): + def __init__(self) -> None: + self.implementation_trace: dict[str, ModelKnowledge] = dict() + self.success_task_info_set: set[str] = set() + + self.task_to_embedding = dict() + + def query(self) -> QueriedKnowledge | None: + """ + Query the knowledge base to get the queried knowledge. So far is handled in RAG strategy. + """ + raise NotImplementedError + + +class ModelRAGStrategy(RAGStrategy): + def __init__(self, knowledgebase: ModelKnowledgeBase) -> None: + super().__init__(knowledgebase) + self.current_generated_trace_count = 0 + + def generate_knowledge( + self, + evolving_trace: list[EvoStep], + *, + return_knowledge: bool = False, + ) -> Knowledge | None: + if len(evolving_trace) == self.current_generated_trace_count: + return + else: + for trace_index in range( + self.current_generated_trace_count, + len(evolving_trace), + ): + evo_step = evolving_trace[trace_index] + implementations = evo_step.evolvable_subjects + feedback = evo_step.feedback + for task_index in range(len(implementations.sub_tasks)): + target_task = implementations.sub_tasks[task_index] + target_task_information = target_task.get_information() + implementation = implementations.sub_implementations[task_index] + single_feedback = feedback[task_index] + if single_feedback is None: + continue + single_knowledge = ModelKnowledge( + target_task=target_task, + implementation=implementation, + feedback=single_feedback, + ) + if target_task_information not in self.knowledgebase.success_task_info_set: + self.knowledgebase.implementation_trace.setdefault( + target_task_information, + [], + ).append(single_knowledge) + + if single_feedback.final_decision == True: + self.knowledgebase.success_task_info_set.add( + target_task_information, + ) + self.current_generated_trace_count = len(evolving_trace) + + def query( + self, + evo: EvolvableSubjects, + evolving_trace: list[EvoStep], + ) -> QueriedKnowledge | None: + query_former_trace_limit = MODEL_IMPL_SETTINGS.query_former_trace_limit + query_similar_success_limit = MODEL_IMPL_SETTINGS.query_similar_success_limit + fail_task_trial_limit = MODEL_IMPL_SETTINGS.fail_task_trial_limit + + queried_knowledge = ModelQueriedKnowledge() + for target_model_task in evo.sub_tasks: + target_model_task_information = target_model_task.get_information() + if target_model_task_information in self.knowledgebase.success_task_info_set: + queried_knowledge.success_task_to_knowledge_dict[target_model_task_information] = ( + self.knowledgebase.implementation_trace[target_model_task_information][-1] + ) + elif ( + len( + self.knowledgebase.implementation_trace.setdefault( + target_model_task_information, + [], + ), + ) + >= fail_task_trial_limit + ): + 
queried_knowledge.failed_task_info_set.add(target_model_task_information)
+            else:
+                queried_knowledge.working_task_to_former_failed_knowledge_dict[target_model_task_information] = (
+                    self.knowledgebase.implementation_trace.setdefault(
+                        target_model_task_information,
+                        [],
+                    )[-query_former_trace_limit:]
+                )
+
+                knowledge_base_success_task_list = list(
+                    self.knowledgebase.success_task_info_set,
+                )
+                similarity = calculate_embedding_distance_between_str_list(
+                    [target_model_task_information],
+                    knowledge_base_success_task_list,
+                )[0]
+                similar_indexes = sorted(
+                    range(len(similarity)),
+                    key=lambda i: similarity[i],
+                    reverse=True,
+                )[:query_similar_success_limit]
+                similar_successful_knowledge = [
+                    self.knowledgebase.implementation_trace.setdefault(
+                        knowledge_base_success_task_list[index],
+                        [],
+                    )[-1]
+                    for index in similar_indexes
+                ]
+                queried_knowledge.working_task_to_similar_successful_knowledge_dict[target_model_task_information] = (
+                    similar_successful_knowledge
+                )
+        return queried_knowledge
diff --git a/rdagent/components/coder/model_coder/benchmark/eval.py b/rdagent/components/coder/model_coder/benchmark/eval.py
index a87c7475..77f924e3 100644
--- a/rdagent/components/coder/model_coder/benchmark/eval.py
+++ b/rdagent/components/coder/model_coder/benchmark/eval.py
@@ -22,7 +22,7 @@ class ModelImpValEval:
     - If the model structure is similar, the output will change in similar way when we change the input.
 
     Challenge:
-    - The key difference between it and implementing factors is that we have parameters in the layers (Factor operators often have no parameters or are given parameters).
+    - The key difference from implementing factors is that models have parameters in their layers (factor operators often have no parameters, or take them as given).
     - we try to initialize the model param in similar value. So only the model structure is different.
 
     Comparing the correlation of following sequences
@@ -41,9 +41,8 @@ def evaluate(self, gt: ModelImplementation, gen: ModelImplementation):
         for _ in range(round_n):
             # run different model initial parameters. 
for init_val in [-0.2, -0.1, 0.1, 0.2]: - data, exec_config = get_data_conf(init_val) - gt_res = gt.execute(data=data, config=exec_config) - res = gen.execute(data=data, config=exec_config) + _, gt_res = gt.execute(input_value=init_val, param_init_value=init_val) + _, res = gen.execute(input_value=init_val, param_init_value=init_val) eval_pairs.append((res, gt_res)) # flat and concat the output diff --git a/rdagent/components/coder/model_coder/benchmark/model_dict.json b/rdagent/components/coder/model_coder/benchmark/model_dict.json index a30b8adb..a15edfff 100644 --- a/rdagent/components/coder/model_coder/benchmark/model_dict.json +++ b/rdagent/components/coder/model_coder/benchmark/model_dict.json @@ -9,7 +9,8 @@ "h^{(l-1)}_v": "The feature representation of node v at layer (l-1)", "N_u": "The set of neighbored nodes centered at node u" }, - "key": "pmlp" + "key": "pmlp", + "model_type": "TimeSeries" }, "LINKX": { "description": "A scalable model for node classification that separately embeds adjacency and node features, combines them with MLPs, and applies simple transformations.", @@ -22,7 +23,8 @@ "h_X": "Embedding of the node features", "MLP_f": "Final multilayer perceptron for prediction" }, - "key": "linkx" + "key": "linkx", + "model_type": "TimeSeries" }, "GPSConv": { "description": "A scalable and powerful graph transformer with linear complexity, capable of handling large graphs with state-of-the-art results across diverse benchmarks.", @@ -34,7 +36,8 @@ "MPNN^{(l)}": "The message-passing neural network function at layer l", "GlobalAttn^{(l)}": "The global attention function at layer l" }, - "key": "gpsconv" + "key": "gpsconv", + "model_type": "TimeSeries" }, "ViSNet": { "description": "ViSNet is an equivariant geometry-enhanced graph neural network designed for efficient molecular modeling[^1^][1][^2^][2]. It utilizes a Vector-Scalar interactive message passing mechanism to extract and utilize geometric features with low computational costs, achieving state-of-the-art performance on multiple molecular dynamics benchmarks.", @@ -44,7 +47,8 @@ "\\mathbf{e}_u": "Edge embedding associated with atom u", "\\mathbf{v}_u": "Direction unit vector for atom u" }, - "key": "visnet" + "key": "visnet", + "model_type": "TimeSeries" }, "Dir-GNN": { "description": "A framework for deep learning on directed graphs that extends MPNNs to incorporate edge directionality.", @@ -54,7 +58,8 @@ "m^{(k)}_{i,\\leftarrow}": "The aggregated incoming messages to node i at layer k", "m^{(k)}_{i,\\rightarrow}": "The aggregated outgoing messages from node i at layer k" }, - "key": "dirgnn" + "key": "dirgnn", + "model_type": "TimeSeries" }, "A-DGN": { "description": "A framework for stable and non-dissipative DGN design, conceived through the lens of ordinary differential equations (ODEs). 
It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.", @@ -69,6 +74,7 @@ "X(t)": "The node feature matrix of the whole graph at time t", "N_u": "The set of neighboring nodes of u" }, - "key": "A-DGN" + "key": "A-DGN", + "model_type": "TimeSeries" } -} +} \ No newline at end of file diff --git a/rdagent/components/coder/model_coder/conf.py b/rdagent/components/coder/model_coder/conf.py index af504aa6..74283e1a 100644 --- a/rdagent/components/coder/model_coder/conf.py +++ b/rdagent/components/coder/model_coder/conf.py @@ -1,13 +1,32 @@ from pathlib import Path +from typing import Union from pydantic_settings import BaseSettings class ModelImplSettings(BaseSettings): - workspace_path: Path = Path("./git_ignore_folder/model_imp_workspace/") # Added type annotation for work_space - class Config: env_prefix = "MODEL_IMPL_" # Use MODEL_IMPL_ as prefix for environment variables + file_based_execution_workspace: str = str( + (Path().cwd() / "git_ignore_folder" / "model_implementation_workspace").absolute(), + ) + implementation_execution_cache_location: str = str( + (Path().cwd() / "git_ignore_folder" / "model_implementation_execution_cache").absolute(), + ) + + knowledge_base_path: Union[str, None] = None + new_knowledge_base_path: Union[str, None] = None + + max_loop: int = 10 + + query_former_trace_limit: int = 5 + query_similar_success_limit: int = 5 + fail_task_trial_limit: int = 20 + + evo_multi_proc_n: int = 1 + + enable_execution_cache: bool = True # whether to enable the execution cache + MODEL_IMPL_SETTINGS = ModelImplSettings() diff --git a/rdagent/components/coder/model_coder/evaluator.py b/rdagent/components/coder/model_coder/evaluator.py deleted file mode 100644 index 9a1830fa..00000000 --- a/rdagent/components/coder/model_coder/evaluator.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -import torch - - -def shape_evaluator(target, prediction): - if target is None or prediction is None: - return None, 0 - tar_shape = target.shape - pre_shape = prediction.shape - - diff = [] - for i in range(max(len(tar_shape), len(pre_shape))): - dim_tar = tar_shape[i] if i < len(tar_shape) else 0 - dim_pre = pre_shape[i] if i < len(pre_shape) else 0 - diff.append(abs(dim_tar - dim_pre)) - - metric = 1 / (np.exp(np.mean(diff)) + 1) - return diff, metric - - -def reshape_tensor(original_tensor, target_shape): - new_tensor = torch.zeros(target_shape) - for i, dim in enumerate(original_tensor.shape): - new_tensor = new_tensor.narrow(i, 0, dim).copy_(original_tensor) - - return new_tensor - - -def value_evaluator(target, prediction): - if target is None or prediction is None: - return None, 0 - tar_shape = target.shape - pre_shape = prediction.shape - - # Determine the shape of the padded tensors - dims = [ - max(s1, s2) - for s1, s2 in zip( - tar_shape + (1,) * (len(pre_shape) - len(tar_shape)), - pre_shape + (1,) * (len(tar_shape) - len(pre_shape)), - ) - ] - # Reshape both tensors to the determined shape - target = target.reshape(*tar_shape, *(1,) * (max(len(tar_shape), len(pre_shape)) - len(tar_shape))) - prediction = prediction.reshape(*pre_shape, *(1,) * (max(len(tar_shape), len(pre_shape)) - len(pre_shape))) - target_padded = reshape_tensor(target, dims) - prediction_padded = reshape_tensor(prediction, dims) - - # Calculate the mean absolute difference - diff = torch.abs(target_padded - prediction_padded) - metric = 1 / (1 + np.exp(torch.mean(diff).item())) - return diff, metric - - -if __name__ == "__main__": - tar = 
torch.rand(4, 5, 5) - pre = torch.rand(4, 1) - print(shape_evaluator(tar, pre)) - print(value_evaluator(tar, pre)[1]) diff --git a/rdagent/components/coder/model_coder/model.py b/rdagent/components/coder/model_coder/model.py index 9898a3de..bf8d76a8 100644 --- a/rdagent/components/coder/model_coder/model.py +++ b/rdagent/components/coder/model_coder/model.py @@ -1,14 +1,16 @@ import json +import pickle +import site import uuid from pathlib import Path -from typing import Dict, Optional, Sequence +from typing import Dict, Optional import torch from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS -from rdagent.components.loader.task_loader import ModelTaskLoader from rdagent.core.exception import CodeFormatException -from rdagent.core.experiment import Experiment, FBImplementation, ImpLoader, Task +from rdagent.core.experiment import Experiment, FBImplementation, Task +from rdagent.oai.llm_utils import md5_hash from rdagent.utils import get_module_by_module_path @@ -20,29 +22,20 @@ class ModelTask(Task): variables: Dict[str, str] # map the variable name to the variable description def __init__( - self, name: str, description: str, formulation: str, variables: Dict[str, str], key: Optional[str] = None + self, name: str, description: str, formulation: str, variables: Dict[str, str], model_type: Optional[str] = None ) -> None: - """ - - Parameters - ---------- - - key : Optional[str] - Key is a string to identify the task. - It will be used to connect to other information(e.g. ground truth). - """ - self.name = name - self.description = description - self.formulation = formulation - self.variables = variables - self.key = key + self.name: str = name + self.description: str = description + self.formulation: str = formulation + self.variables: str = variables + self.model_type: str = model_type # Tabular for tabular model, TimesSeries for time series model def get_information(self): return f"""name: {self.name} description: {self.description} formulation: {self.formulation} variables: {self.variables} -key: {self.key} +model_type: {self.model_type} """ @staticmethod @@ -75,136 +68,57 @@ class ModelImplementation(FBImplementation): def __init__(self, target_task: Task) -> None: super().__init__(target_task) - self.path = None def prepare(self) -> None: """ Prepare for the workspace; """ unique_id = uuid.uuid4() - self.path = MODEL_IMPL_SETTINGS.workspace_path / f"M{unique_id}" + self.workspace_path = Path(MODEL_IMPL_SETTINGS.file_based_execution_workspace) / f"M{unique_id}" # start with `M` so that it can be imported via python - self.path.mkdir(parents=True, exist_ok=True) - - def execute(self, data=None, config: dict = {}): - mod = get_module_by_module_path(str(self.path / "model.py")) + self.workspace_path.mkdir(parents=True, exist_ok=True) + + def execute( + self, + batch_size: int = 8, + num_features: int = 10, + num_timesteps: int = 4, + input_value: float = 1.0, + param_init_value: float = 1.0, + ): try: + if MODEL_IMPL_SETTINGS.enable_execution_cache: + # NOTE: cache the result for the same code + target_file_name = md5_hash(self.code_dict["model.py"]) + cache_file_path = ( + Path(MODEL_IMPL_SETTINGS.implementation_execution_cache_location) / f"{target_file_name}.pkl" + ) + Path(MODEL_IMPL_SETTINGS.implementation_execution_cache_location).mkdir(exist_ok=True, parents=True) + if cache_file_path.exists(): + return pickle.load(open(cache_file_path, "rb")) + mod = get_module_by_module_path(str(self.workspace_path / "model.py")) model_cls = mod.model_cls - except 
AttributeError: - raise CodeFormatException("The model_cls is not implemented in the model.py") - # model_init = - - assert isinstance(data, tuple) - node_feature, _ = data - in_channels = node_feature.size(-1) - m = model_cls(in_channels) - - # TODO: initialize all the parameters of `m` to `model_eval_param_init` - model_eval_param_init: float = config["model_eval_param_init"] - - # initialize all parameters of `m` to `model_eval_param_init` - for _, param in m.named_parameters(): - param.data.fill_(model_eval_param_init) - - assert isinstance(data, tuple) - return m(*data) - - def execute_desc(self) -> str: - return """ -The the implemented code will be placed in a file like /model.py - -We'll import the model in the implementation in file `model.py` after setting the cwd into the directory -- from model import model_cls (So you must have a variable named `model_cls` in the file) - - So your implemented code could follow the following pattern - ```Python - class XXXLayer(torch.nn.Module): - ... - model_cls = XXXLayer - ``` -- initialize the model by initializing it `model_cls(input_dim=INPUT_DIM)` -- And then verify the model by comparing the output tensors by feeding specific input tensor. -""" - -class ModelExperiment(Experiment[ModelTask, ModelImplementation]): - ... - - -class ModelTaskLoaderJson(ModelTaskLoader): - # def __init__(self, json_uri: str, select_model: Optional[str] = None) -> None: - # super().__init__() - # self.json_uri = json_uri - # self.select_model = 'A-DGN' - - # def load(self, *argT, **kwargs) -> Sequence[ModelImplTask]: - # # json is supposed to be in the format of {model_name: dict{model_data}} - # model_dict = json.load(open(self.json_uri, "r")) - # if self.select_model is not None: - # assert self.select_model in model_dict - # model_name = self.select_model - # model_data = model_dict[self.select_model] - # else: - # model_name, model_data = list(model_dict.items())[0] - - # model_impl_task = ModelImplTask( - # name=model_name, - # description=model_data["description"], - # formulation=model_data["formulation"], - # variables=model_data["variables"], - # key=model_name - # ) - - # return [model_impl_task] - - def __init__(self, json_uri: str) -> None: - super().__init__() - self.json_uri = json_uri - - def load(self, *argT, **kwargs) -> Sequence[ModelTask]: - # json is supposed to be in the format of {model_name: dict{model_data}} - model_dict = json.load(open(self.json_uri, "r")) - - # FIXME: the model in the json file is not right due to extraction error - # We should fix them case by case in the future - # - # formula_info = { - # "name": "Anti-Symmetric Deep Graph Network (A-DGN)", - # "description": "A framework for stable and non-dissipative DGN design. 
It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.", - # "formulation": r"\mathbf{x}^{\prime}_i = \mathbf{x}_i + \epsilon \cdot \sigma \left( (\mathbf{W}-\mathbf{W}^T-\gamma \mathbf{I}) \mathbf{x}_i + \Phi(\mathbf{X}, \mathcal{N}_i) + \mathbf{b}\right),", - # "variables": { - # r"\mathbf{x}_i": "The state of node i at previous layer", - # r"\epsilon": "The step size in the Euler discretization", - # r"\sigma": "A monotonically non-decreasing activation function", - # r"\Phi": "A graph convolutional operator", - # r"W": "An anti-symmetric weight matrix", - # r"\mathbf{x}^{\prime}_i": "The node feature matrix at layer l-1", - # r"\mathcal{N}_i": "The set of neighbors of node u", - # r"\mathbf{b}": "A bias vector", - # }, - # "key": "A-DGN", - # } - model_impl_task_list = [] - for model_name, model_data in model_dict.items(): - model_impl_task = ModelTask( - name=model_name, - description=model_data["description"], - formulation=model_data["formulation"], - variables=model_data["variables"], - key=model_data["key"], - ) - model_impl_task_list.append(model_impl_task) - return model_impl_task_list - - -class ModelImpLoader(ImpLoader[ModelTask, ModelImplementation]): - def __init__(self, path: Path) -> None: - self.path = Path(path) - - def load(self, task: ModelTask) -> ModelImplementation: - assert task.key is not None - mti = ModelImplementation(task) - mti.prepare() - with open(self.path / f"{task.key}.py", "r") as f: - code = f.read() - mti.inject_code(**{"model.py": code}) - return mti + if self.target_task.model_type == "Tabular": + input_shape = (batch_size, num_features) + m = model_cls(num_features=input_shape[1]) + elif self.target_task.model_type == "TimeSeries": + input_shape = (batch_size, num_features, num_timesteps) + m = model_cls(num_features=input_shape[1], num_timesteps=input_shape[2]) + data = torch.full(input_shape, input_value) + + # initialize all parameters of `m` to `param_init_value` + for _, param in m.named_parameters(): + param.data.fill_(param_init_value) + out = m(data) + execution_model_output = out.cpu().detach() + execution_feedback_str = f"Execution successful, output tensor shape: {execution_model_output.shape}" + if MODEL_IMPL_SETTINGS.enable_execution_cache: + pickle.dump((execution_feedback_str, execution_model_output), open(cache_file_path, "wb")) + return execution_feedback_str, execution_model_output + + except Exception as e: + return f"Execution error: {e}", None + + +class ModelExperiment(Experiment[ModelTask, ModelImplementation]): ... diff --git a/rdagent/components/coder/model_coder/one_shot/__init__.py b/rdagent/components/coder/model_coder/one_shot/__init__.py index c54fff2d..2b3c7dce 100644 --- a/rdagent/components/coder/model_coder/one_shot/__init__.py +++ b/rdagent/components/coder/model_coder/one_shot/__init__.py @@ -30,7 +30,6 @@ def generate(self, exp: ModelExperiment) -> ModelExperiment: description=t.description, formulation=t.formulation, variables=t.variables, - execute_desc=mti.execute_desc(), ) system_prompt = sys_prompt_tpl.render() diff --git a/rdagent/components/coder/model_coder/one_shot/prompt.yaml b/rdagent/components/coder/model_coder/one_shot/prompt.yaml index 0145cf35..d9774631 100644 --- a/rdagent/components/coder/model_coder/one_shot/prompt.yaml +++ b/rdagent/components/coder/model_coder/one_shot/prompt.yaml @@ -1,8 +1,8 @@ -code_implement_sys: -| - You are an assistant whose job is to answer user's question." 
-code_implement_user: -|
+code_implement_sys: |-
+  You are an assistant whose job is to answer the user's question.
+code_implement_user: |-
   With the following given information, write a python code using pytorch and torch_geometric to implement the model.
   This model is in the graph learning field, only have one layer.
   The input will be node_feature [num_nodes, dim_feature] and edge_index [2, num_edges] (It would be the input of the forward model)
@@ -13,6 +13,15 @@ code_implement_user: -|
   3. model formulation:{{formulation}}
   4. model variables:{{variables}}.
   You must complete the forward function as far as you can do.
-  \# Execution
-  Your implemented code will be exectued in the follow way
-  {{execute_desc}}
+  Execution: Your implemented code will be executed in the following way.
+  The implemented code will be placed in a file like [uuid]/model.py
+  We'll import the model implementation from the file `model.py` after setting the cwd to that directory
+  - from model import model_cls (So you must have a variable named `model_cls` in the file)
+  - So your implemented code should follow the pattern below
+  ```Python
+  class XXXLayer(torch.nn.Module):
+      ...
+  model_cls = XXXLayer
+  ```
+  - We initialize the model via `model_cls(input_dim=INPUT_DIM)`
+  - Then we verify the model by feeding it a specific input tensor and comparing the output tensors.
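For concreteness, a `model.py` that satisfies this contract could look like the sketch below (the single-GCN-layer architecture and the `hidden_dim` default are illustrative assumptions, not part of this patch):

```python
import torch
from torch_geometric.nn import GCNConv


class GCNLayer(torch.nn.Module):
    """A one-layer graph model exposing the interface described above (a sketch)."""

    def __init__(self, input_dim: int, hidden_dim: int = 32):
        super().__init__()
        self.conv = GCNConv(input_dim, hidden_dim)

    def forward(self, node_feature: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        # node_feature: [num_nodes, dim_feature]; edge_index: [2, num_edges]
        return self.conv(node_feature, edge_index)


# The harness runs `from model import model_cls`, so expose the class here.
model_cls = GCNLayer
```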
diff --git a/rdagent/components/coder/model_coder/prompts.yaml b/rdagent/components/coder/model_coder/prompts.yaml
index 7cc63385..b5679941 100644
--- a/rdagent/components/coder/model_coder/prompts.yaml
+++ b/rdagent/components/coder/model_coder/prompts.yaml
@@ -1,8 +1,123 @@
 extract_model_formulation_system: |-
-  offer description of the proposed model in this paper, write a latex formula with variable of the model. the format should be like " "Model Name": {
-  "description": "",
-  "formulation": "",
-  "variables": {
-  "\\hat{y}_u": "The predicted output for node u",
-  }"
-  such format content should be begin with ```json and end with ``` and the content should be in json format.
\ No newline at end of file
+  Offer a description of the proposed model in this paper and write a LaTeX formula with the variables of the model. The format should be like
+  "Model Name": {
+      "description": "",
+      "formulation": "",
+      "variables": {
+          "\\hat{y}_u": "The predicted output for node u",
+      }
+  }
+  The content should begin with ```json and end with ``` and should be valid JSON.
+
+evolving_strategy_model_coder:
+  system: |-
+    The user is trying to implement some PyTorch models in the following scenario:
+    {{ scenario }}
+    Your code is expected to align with the scenario, which means the user needs to get the model's prediction based on the input data.
+
+    To help you write the correct code, the user might provide several pieces of information:
+    1. The user might provide you the correct code to similar models. You should learn from this code to write the correct code.
+    2. The user might provide you the former failed code and the corresponding feedback. The feedback covers the execution, the code and the model output value. You should analyze the feedback and try to correct the latest code.
+    3. The user might provide you suggestions on the latest failed code and some similar failure-to-correction pairs. Each pair contains failed code with a similar error and the corresponding corrected code. You should learn from these suggestions to write the correct code.
+
+    You must write your code based on your latest former attempt below, which consists of your former code and its feedback. Read the former attempt carefully and do not modify the correct parts of your former code.
+
+    {% if queried_former_failed_knowledge|length != 0 %}
+    --------------Your former latest attempt:---------------
+    =====Code to the former implementation=====
+    {{ queried_former_failed_knowledge[-1].implementation.code }}
+    =====Feedback to the former implementation=====
+    {{ queried_former_failed_knowledge[-1].feedback }}
+    {% endif %}
+
+    Please respond with the code in the following JSON format. Here is an example structure for the JSON output:
+    {
+        "code": "The Python code as a string."
+    }
+
+  user: |-
+    --------------Target model information:---------------
+    {{ model_information_str }}
+
+    {% if queried_similar_successful_knowledge|length != 0 %}
+    --------------Correct code to similar models:---------------
+    {% for similar_successful_knowledge in queried_similar_successful_knowledge %}
+    =====Model {{loop.index}}:=====
+    {{ similar_successful_knowledge.target_task.get_model_information() }}
+    =====Code:=====
+    {{ similar_successful_knowledge.implementation.code }}
+    {% endfor %}
+    {% endif %}
+
+    {% if queried_former_failed_knowledge|length != 0 %}
+    --------------Former failed code:---------------
+    {% for former_failed_knowledge in queried_former_failed_knowledge %}
+    =====Code to implementation {{ loop.index }}=====
+    {{ former_failed_knowledge.implementation.code }}
+    =====Feedback to implementation {{ loop.index }}=====
+    {{ former_failed_knowledge.feedback }}
+    {% endfor %}
+    {% endif %}
+
+evaluator_code_feedback:
+  system: |-
+    The user is trying to implement some models in the following scenario:
+    {{ scenario }}
+    The user will provide you the information of the model.
+
+    Your job is to check whether the user's code aligns with the model information and the scenario.
+    The user will provide the source python code and the execution error message if execution failed.
+    The user might provide you the ground truth code as a reference for your critique. You should not leak the ground truth code to the user in any form, but you can use it when writing your critique.
+
+    The user has also compared the output generated by the user's code with the output of the ground truth code and will provide you some analysis of the comparison. You may find errors in the code that caused the difference between the two outputs.
+
+    If the ground truth code is provided, your critique should only check whether the user's code aligns with the ground truth code, since the ground truth is definitely correct.
+    If the ground truth code is not provided, your critique should check whether the user's code is reasonable and correct with respect to the description and the scenario.
+
+    Notice that your critiques are not for the user to debug the code; they are sent to the coding agent to correct the code. So don't give items for the user to check such as "Please check the code line XXX".
+
+    You should attach a suggestion to each of your critiques to help the user improve the code. Please respond with the critiques in the following format. Here is an example structure for the output:
+    critique 1: The message for critique 1
+    critique 2: The message for critique 2
+
+  user: |-
+    --------------Model information:---------------
+    {{ model_information }}
+    --------------Python code:---------------
+    {{ code }}
+    --------------Execution feedback:---------------
+    {{ model_execution_feedback }}
+    {% if model_value_feedback is not none %}
+    --------------Model value feedback:---------------
+    {{ model_value_feedback }}
+    {% endif %}
+    {% if gt_code is not none %}
+    --------------Ground truth Python code:---------------
+    {{ gt_code }}
+    {% endif %}
+
+
+evaluator_final_feedback:
+  system: |-
+    The user is trying to implement a model in the following scenario:
+    {{ scenario }}
+    The user has finished the evaluation and got some feedback from the evaluator.
+    The evaluator ran the code, got the output, and provided several pieces of feedback regarding the user's code and its output. You should analyze the feedback, considering the scenario and the model description, and give a final decision about the evaluation result. The final decision states whether the model is implemented correctly; if not, give detailed feedback containing the reason and suggestions.
+
+    The final decision on the implementation follows this logic:
+    1. If the output value and the ground truth value match within a small tolerance, the implementation is considered correct.
+    2. If no ground truth value is provided, the implementation is considered correct if the code executes successfully and the code feedback aligns with the scenario and the model description.
+
+    Please respond in JSON. Here is an example structure for the JSON output; please strictly follow the format:
+    {
+      "final_decision": true,
+      "final_feedback": "The final feedback message"
+    }
+  user: |-
+    --------------Model information:---------------
+    {{ model_information }}
+    --------------Model Execution feedback:---------------
+    {{ model_execution_feedback }}
+    --------------Model Code feedback:---------------
+    {{ model_code_feedback }}
+    --------------Model value feedback:---------------
+    {{ model_value_feedback }}
\ No newline at end of file
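Since `evaluator_final_feedback` demands strict JSON, the caller can decode the reply directly. A minimal sketch of such a parser (the helper name and the fence-stripping are assumptions, not part of this patch):

```python
import json


def parse_final_feedback(response: str) -> tuple[bool, str]:
    # Strip optional ```json fences before decoding; the prompt asks for
    # strict JSON with "final_decision" and "final_feedback" keys.
    cleaned = response.strip().removeprefix("```json").removesuffix("```").strip()
    payload = json.loads(cleaned)
    return bool(payload["final_decision"]), str(payload["final_feedback"])
```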
diff --git a/rdagent/components/coder/model_coder/task_loader.py b/rdagent/components/coder/model_coder/task_loader.py
index d2018f2d..7f1035f1 100644
--- a/rdagent/components/coder/model_coder/task_loader.py
+++ b/rdagent/components/coder/model_coder/task_loader.py
@@ -28,7 +28,7 @@ def extract_model_from_doc(doc_content: str) -> dict:
     Returns
     -------
     dict
-        {factor_name: dict{description, formulation, variables}}
+        {model_name: dict{description, formulation, variables}}
     """
     session = APIBackend().build_chat_session(
         session_system_prompt=document_process_prompts["extract_model_formulation_system"],
diff --git a/rdagent/components/loader/task_loader.py b/rdagent/components/loader/task_loader.py
index 647cc669..a67262c0 100644
--- a/rdagent/components/loader/task_loader.py
+++ b/rdagent/components/loader/task_loader.py
@@ -1,6 +1,10 @@
+import json
+from pathlib import Path
+from typing import Sequence
+
 from rdagent.components.coder.factor_coder.factor import FactorTask
-from rdagent.components.coder.model_coder.model import ModelTask
-from rdagent.core.experiment import Loader
+from rdagent.components.coder.model_coder.model import ModelImplementation, ModelTask
+from rdagent.core.experiment import ImpLoader, Loader
 
 
 class FactorTaskLoader(Loader[FactorTask]):
@@ -9,3 +13,83 @@ class FactorTaskLoader(Loader[FactorTask]):
 
 class ModelTaskLoader(Loader[ModelTask]):
     pass
+
+
+class ModelTaskLoaderJson(ModelTaskLoader):
+    # def __init__(self, json_uri: str, select_model: Optional[str] = None) -> None:
+    #     super().__init__()
+    #     self.json_uri = json_uri
+    #     self.select_model = 'A-DGN'
+
+    # def load(self, *argT, **kwargs) -> Sequence[ModelImplTask]:
+    #     # json is supposed to be in the format of {model_name: dict{model_data}}
+    #     model_dict = json.load(open(self.json_uri, "r"))
+    #     if self.select_model is not None:
+    #         assert self.select_model in model_dict
+    #         model_name = self.select_model
+    #         model_data = model_dict[self.select_model]
+    #     else:
+    #         model_name, model_data = list(model_dict.items())[0]
+
+    #     model_impl_task = ModelImplTask(
+    #         name=model_name,
+    #         description=model_data["description"],
+    #         formulation=model_data["formulation"],
+    #         variables=model_data["variables"],
+    #         key=model_name
+    #     )
+
+    #     return [model_impl_task]
+
+    def __init__(self, json_uri: str) -> None:
+        super().__init__()
+        self.json_uri = json_uri
+
+    def load(self, *argT, **kwargs) -> Sequence[ModelTask]:
+        # json is supposed to be in the format of {model_name: dict{model_data}}
+        model_dict = json.load(open(self.json_uri, "r"))
+        # FIXME: the models in the json file are not right due to extraction errors
+        # We should fix them case by case in the future
+        #
+        # formula_info = {
+        #     "name": "Anti-Symmetric Deep Graph Network (A-DGN)",
+        #     "description": "A framework for stable and non-dissipative DGN design. It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.",
+        #     "formulation": r"\mathbf{x}^{\prime}_i = \mathbf{x}_i + \epsilon \cdot \sigma \left( (\mathbf{W}-\mathbf{W}^T-\gamma \mathbf{I}) \mathbf{x}_i + \Phi(\mathbf{X}, \mathcal{N}_i) + \mathbf{b}\right),",
+        #     "variables": {
+        #         r"\mathbf{x}_i": "The state of node i at previous layer",
+        #         r"\epsilon": "The step size in the Euler discretization",
+        #         r"\sigma": "A monotonically non-decreasing activation function",
+        #         r"\Phi": "A graph convolutional operator",
+        #         r"W": "An anti-symmetric weight matrix",
+        #         r"\mathbf{x}^{\prime}_i": "The node feature matrix at layer l-1",
+        #         r"\mathcal{N}_i": "The set of neighbors of node u",
+        #         r"\mathbf{b}": "A bias vector",
+        #     },
+        #     "key": "A-DGN",
+        # }
+        model_impl_task_list = []
+        for model_name, model_data in model_dict.items():
+            model_impl_task = ModelTask(
+                name=model_name,
+                description=model_data["description"],
+                formulation=model_data["formulation"],
+                variables=model_data["variables"],
+                model_type=model_data["model_type"],
+            )
+            model_impl_task_list.append(model_impl_task)
+        return model_impl_task_list
+
+
+class ModelImpLoader(ImpLoader[ModelTask, ModelImplementation]):
+    def __init__(self, path: Path) -> None:
+        self.path = Path(path)
+
+    def load(self, task: ModelTask) -> ModelImplementation:
+        assert task.name is not None
+        mti = ModelImplementation(task)
+        mti.prepare()
+        with open(self.path / f"{task.name}.py", "r") as f:
+            code = f.read()
+        mti.inject_code(**{"model.py": code})
+        return mti
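Taken together, the two loaders compose as follows; a usage sketch with hypothetical benchmark paths (the paths are illustrative assumptions, not part of this patch):

```python
from pathlib import Path

from rdagent.components.loader.task_loader import ModelImpLoader, ModelTaskLoaderJson

# Hypothetical locations; point these at a real benchmark folder.
tasks = ModelTaskLoaderJson("benchmark/model_dict.json").load()
imp_loader = ModelImpLoader(Path("benchmark/gt_code"))

for task in tasks:
    # Reads <path>/<task.name>.py and injects it into the workspace as model.py.
    gt_impl = imp_loader.load(task)
    print(task.name, gt_impl.code.splitlines()[0])
```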
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
index b7abb860..42125da7 100644
--- a/rdagent/core/experiment.py
+++ b/rdagent/core/experiment.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Generic, Optional, Sequence, TypeVar
+from typing import Any, Dict, Generic, Optional, Sequence, TypeVar
 
 """
 This file contains the all the class about organizing the task in RD-Agent.
@@ -22,25 +22,9 @@ def __init__(self, target_task: ASpecificTask) -> None:
         self.target_task = target_task
 
     @abstractmethod
-    def execute(self, data=None, config: dict = {}) -> object:
-        """
-        The execution of the implementation can be dynamic.
-
-        So we may pass in the data and config dynamically.
-        """
+    def execute(self, *args, **kwargs) -> object:
         raise NotImplementedError("execute method is not implemented.")
 
-    @abstractmethod
-    def execute_desc(self):
-        """
-        return the description how we will execute the code in the folder.
-        """
-        raise NotImplementedError(f"This type of input is not supported")
-
-    # TODO:
-    # After execution, it should return some results.
-    # Some evaluators will input the results and output
-
 ASpecificImp = TypeVar("ASpecificImp", bound=Implementation)
@@ -78,8 +62,17 @@ def run_pipeline(self, **files: str):
     # Why not directly reuse FileBasedFactorImplementation.
     # Because it has too much concrete dependencies.
     # e.g. dataframe, factors
-
-    path: Optional[Path]
+    def __init__(self, *args, code_dict: Optional[Dict[str, str]] = None, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.code_dict = code_dict  # The code files to be injected into the workspace folder
+        self.workspace_path: Optional[Path] = None
+
+    @property
+    def code(self) -> str:
+        code_string = ""
+        for file_name, code in self.code_dict.items():
+            code_string += f"File: {file_name}\n{code}\n"
+        return code_string
 
     @abstractmethod
     def prepare(self, *args, **kwargs):
@@ -100,8 +93,9 @@ def inject_code(self, **files: str):
             "model.py": ""
         }
         """
+        self.code_dict = files
         for k, v in files.items():
-            with open(self.path / k, "w") as f:
+            with open(self.workspace_path / k, "w") as f:
                 f.write(v)
 
     def get_files(self) -> list[Path]:
@@ -111,7 +105,7 @@ def get_files(self) -> list[Path]:
         To be general, we only return a list of filenames.
 
         How to summarize the environment is the responsibility of the TaskGenerator.
""" - return list(self.path.iterdir()) + return list(self.workspace_path.iterdir()) class Experiment(ABC, Generic[ASpecificTask, ASpecificImp]): diff --git a/rdagent/scenarios/qlib/experiment/model_experiment.py b/rdagent/scenarios/qlib/experiment/model_experiment.py index 774eec6d..8feb185d 100644 --- a/rdagent/scenarios/qlib/experiment/model_experiment.py +++ b/rdagent/scenarios/qlib/experiment/model_experiment.py @@ -1 +1,42 @@ -# TODO define QlibModelExperiment here which should be subclass of Experiment +from pathlib import Path + +from rdagent.components.coder.model_coder.model import ModelExperiment +from rdagent.core.prompts import Prompts +from rdagent.core.scenario import Scenario + +prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + +QlibModelExperiment = ModelExperiment + + +class QlibModelScenario(Scenario): + @property + def background(self) -> str: + return prompt_dict["qlib_model_background"] + + @property + def source_data(self) -> str: + raise NotImplementedError("source_data of QlibModelScenario is not implemented") + + @property + def output_format(self) -> str: + return prompt_dict["qlib_model_output_format"] + + @property + def interface(self) -> str: + return prompt_dict["qlib_model_interface"] + + @property + def simulator(self) -> str: + return prompt_dict["qlib_model_simulator"] + + def get_scenario_all_desc(self) -> str: + return f"""Background of the scenario: +{self.background} +The interface you should follow to write the runnable code: +{self.interface} +The output of your code should be in the format: +{self.output_format} +The simulator user can use to test your model: +{self.simulator} +""" diff --git a/rdagent/scenarios/qlib/experiment/prompts.yaml b/rdagent/scenarios/qlib/experiment/prompts.yaml index b66723db..71bce0f5 100644 --- a/rdagent/scenarios/qlib/experiment/prompts.yaml +++ b/rdagent/scenarios/qlib/experiment/prompts.yaml @@ -44,4 +44,66 @@ qlib_factor_simulator: |- 1. generate a new factor table based on the factor values. 2. train a model like LightGBM, CatBoost, LSTM or simple PyTorch model to predict the next several days return based on the factor values. 3. build a portfolio based on the predicted return based on a strategy. - 4. evaluate the portfolio's performance including the return, sharpe ratio, max drawdown, and so on. \ No newline at end of file + 4. evaluate the portfolio's performance including the return, sharpe ratio, max drawdown, and so on. + +qlib_model_background: |- + The model is a machine learning or deep learning structure used in quantitative investment to predict the returns and risks of a portfolio or a single asset. Models are employed by investors to generate forecasts based on historical data and identified factors, which are central to many quantitative investment strategies. + Each model takes the factors as input and predicts the future returns. + The model is defined in the following parts: + 1. Name: The name of the model. + 2. Description: The description of the model. + 3. Architecture: The detailed architecture of the model, such as neural network layers or tree structures. + 4. Hyperparameters: The hyperparameters used in the model, such as learning rate, number of epochs, etc. + 5. ModelType: The type of the model, "Tabular" for tabular model and "TimeSeries" for time series model. + The model should provide clear and detailed documentation of its architecture and hyperparameters. One model should statically define one output with a fixed architecture and hyperparameters. 
For example, a model with two GRU layers and a model with three GRU layers should be considered two different models.
+
+qlib_model_interface: |-
+  Your python code should follow the interface to better interact with the user's system.
+  Your code should contain several parts:
+  1. The import part: import the necessary libraries.
+  2. A class which is a subclass of torch.nn.Module. This class should have an init function and a forward function which takes a tensor as input and outputs a tensor.
+  3. Set a variable called "model_cls" to the class you defined.
+
+  The user will save your code into a python file called "model.py". Then the user imports model_cls from "model.py" after setting the cwd to that directory:
+  ```python
+  from model import model_cls
+  ```
+  So your python code should follow the pattern:
+  ```python
+  class XXXModel(torch.nn.Module):
+      ...
+  model_cls = XXXModel
+  ```
+
+  The model has two types, "Tabular" for a tabular model and "TimeSeries" for a time series model. The input shape to a tabular model is (batch_size, num_features) and the input shape to a time series model is (batch_size, num_features, num_timesteps). The output shape of the model should be (batch_size, 1).
+  The "batch_size" is a dynamic value determined by the input of the forward function.
+  The "num_features" and "num_timesteps" are static and will be provided to the model through the init function.
+  The user will initialize a tabular model with the following code:
+  ```python
+  model = model_cls(num_features=num_features)
+  ```
+  The user will initialize a time series model with the following code:
+  ```python
+  model = model_cls(num_features=num_features, num_timesteps=num_timesteps)
+  ```
+  No other parameters will be passed to the model, so give any other parameters a default value or make them static.
+
+  When dealing with a TimeSeries model, remember to permute the input tensor, since the input tensor is in the shape of (batch_size, num_features, num_timesteps) while a normal time series model expects input in the shape of (batch_size, num_timesteps, num_features).
+
+  Don't write any try-except blocks in your python code; the user will catch the exception and provide the feedback to you. Also, don't write a main function; the user will call the forward method of model_cls to get the output tensor.
+
+
+qlib_model_output_format: |-
+  Your output should be a tensor with shape (batch_size, 1).
+  The output tensor should be saved in a file named "output.pth" in the same directory as your python file.
+  The user will evaluate the shape of the output tensor, so the tensor read from "output.pth" should contain 8 numbers.
+
+qlib_model_simulator: |-
+  The models will be sent into Qlib to train and evaluate their performance in predicting future returns based on historical factor values.
+  Qlib is an AI-oriented quantitative investment platform that aims to realize the potential, empower research, and create value using AI technologies in quantitative investment, from exploring ideas to implementing productions. Qlib supports diverse machine learning modeling paradigms, including supervised learning, market dynamics modeling, and reinforcement learning (RL).
+  The user will use Qlib to automatically perform the following tasks:
+  1. Generate a baseline factor table.
+  2. Train the model defined in your model_cls to predict the next several days' returns based on the factor values.
+  3. Build a portfolio based on the predicted returns using a specific strategy.
+  4. Evaluate the portfolio's performance, including metrics such as return, Sharpe ratio, max drawdown, and others.
+  5. Iterate on model improvements based on performance evaluations and feedback.
\ No newline at end of file
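To make the qlib_model_interface contract above concrete, a "TimeSeries" model that satisfies it could look like the sketch below (the GRU architecture and hidden size are illustrative assumptions, not part of this patch):

```python
import torch


class SimpleGRUModel(torch.nn.Module):
    """A minimal "TimeSeries" model following the documented interface (a sketch)."""

    def __init__(self, num_features: int, num_timesteps: int, hidden_size: int = 64):
        super().__init__()
        self.num_timesteps = num_timesteps  # static, supplied via init
        self.gru = torch.nn.GRU(num_features, hidden_size, batch_first=True)
        self.head = torch.nn.Linear(hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Input arrives as (batch_size, num_features, num_timesteps); the GRU
        # expects (batch_size, num_timesteps, num_features), so permute first.
        x = x.permute(0, 2, 1)
        out, _ = self.gru(x)
        return self.head(out[:, -1, :])  # (batch_size, 1)


model_cls = SimpleGRUModel
```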