diff --git a/feedback.py b/feedback.py new file mode 100644 index 000000000..b040ccb54 --- /dev/null +++ b/feedback.py @@ -0,0 +1,78 @@ +import json +from rdagent.oai.llm_utils import APIBackend + +def generate_feedback(result, code, hypothesis): + # Define the system prompt + sys_prompt = ( + "You are a professional code review assistant. You will receive some code, a result, and a hypothesis. " + "Your task is to provide feedback on how well the code and result support or refute the hypothesis. " + "Please provide detailed and constructive feedback." + ) + + # Define the user prompt + usr_prompt = (f''' + "Given the following hypothesis, result, and code, provide feedback on how well the code and result support or refute the hypothesis. " + "Hypothesis: {hypothesis}\n" + "Result: {result}\n" + "Code:\n```python\n{code}\n```\n" + "Please provide detailed and constructive feedback." + ''') + + try: + # Call the APIBackend to generate the response + response = APIBackend().build_messages_and_create_chat_completion( + user_prompt=usr_prompt, + system_prompt=sys_prompt, + json_mode=True, + ) + + # Log the raw response for debugging + print("Raw Response:\n", response) + + # Parse the JSON response to extract the feedback + response_json = json.loads(response) + feedback = response_json.get("feedback", "No feedback provided") + + print("Generated Feedback:\n", feedback) + + return feedback + + except json.JSONDecodeError as e: + print("Error parsing JSON response from LLM:", e) + except Exception as e: + print("An unexpected error occurred:", e) + +def test_generate_feedback(): + result = "The model achieved an accuracy of 85% on the validation set." + code = ''' +import torch +import torch.nn as nn + +class Net(nn.Module): + def __init__(self, input_dim, output_dim=1, act="LeakyReLU"): + super(Net, self).__init__() + self.drop_input = nn.Dropout(0.05) + self.fc = nn.Linear(input_dim, output_dim) + if act == "LeakyReLU": + self.activation = nn.LeakyReLU(negative_slope=0.1, inplace=False) + elif act == "SiLU": + self.activation = nn.SiLU() + else: + raise NotImplementedError(f"Activation function {act} is not supported") + self.bn = nn.BatchNorm1d(output_dim) + + def forward(self, x): + x = self.drop_input(x) + x = self.fc(x) + x = self.bn(x) + x = self.activation(x) + return x + ''' + hypothesis = "The data shows time-series quality." 
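+    # Run the toy example end to end: ask the LLM to critique the code and result against the hypothesis above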
+ + feedback = generate_feedback(result, code, hypothesis) + print("Final Feedback:\n", feedback) + +if __name__ == "__main__": + test_generate_feedback() + diff --git a/rdagent/components/proposal/model_proposal.py b/rdagent/components/proposal/model_proposal.py index e69de29bb..32fee9f6c 100644 --- a/rdagent/components/proposal/model_proposal.py +++ b/rdagent/components/proposal/model_proposal.py @@ -0,0 +1,97 @@ +from abc import abstractmethod +from pathlib import Path +from typing import Tuple + +from jinja2 import Environment, StrictUndefined + +from rdagent.components.coder.model_coder.model import ModelExperiment +from rdagent.core.prompts import Prompts +from rdagent.core.proposal import ( + Hypothesis, + Hypothesis2Experiment, + HypothesisGen, + HypothesisSet, + Scenario, + Trace, +) +from rdagent.oai.llm_utils import APIBackend + +prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + +ModelHypothesis = Hypothesis + +class ModelHypothesisGen(HypothesisGen): + def __init__(self, scen: Scenario): + super().__init__(scen) + + # The following methods are scenario related so they should be implemented in the subclass + @abstractmethod + def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: + ... + + @abstractmethod + def convert_response(self, response: str) -> ModelHypothesis: + ... + + def gen(self, trace: Trace) -> ModelHypothesis: + context_dict, json_flag = self.prepare_context(trace) + + system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["model_hypothesis_gen"]["system_prompt"]) + .render( + scenario=self.scen.get_scenario_all_desc(), + hypothesis_output_format=context_dict["hypothesis_output_format"], + ) + ) + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["model_hypothesis_gen"]["user_prompt"]) + .render( + hypothesis_and_feedback=context_dict["hypothesis_and_feedback"], + RAG=context_dict["RAG"], + ) + ) + + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=json_flag) + + hypothesis = self.convert_response(resp) + + return hypothesis + + +class ModelHypothesis2Experiment(Hypothesis2Experiment[ModelExperiment]): + def __init__(self) -> None: + super().__init__() + + @abstractmethod + def prepare_context(self, hs: HypothesisSet) -> Tuple[dict, bool]: + ... + + @abstractmethod + def convert_response(self, response: str) -> ModelExperiment: + ... 
+ + def convert(self, hs: HypothesisSet) -> ModelExperiment: + context, json_flag = self.prepare_context(hs) + system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["model_hypothesis2experiment"]["system_prompt"]) + .render( + scenario=hs.trace.scen.get_scenario_all_desc(), + experiment_output_format=context["experiment_output_format"], + ) + ) + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["model_hypothesis2experiment"]["user_prompt"]) + .render( + hypothesis_and_feedback=context["hypothesis_and_feedback"], + model_list=context["model_list"], + RAG=context["RAG"], + ) + ) + + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=json_flag) + + return self.convert_response(resp) diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index b7abb8606..537f05c88 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import Generic, Optional, Sequence, TypeVar +from typing import Generic, Optional, Sequence, TypeVar, Any """ This file contains the all the class about organizing the task in RD-Agent. @@ -13,9 +13,25 @@ class Task: # I think the task version applies to the base class. pass - ASpecificTask = TypeVar("ASpecificTask", bound=Task) +# class Result(): + +# def __init__(self, result_background) -> None: +# self.result_background = result_background # scenario actually + +# @abstractmethod +# def build_result(result: str): +# """ +# One pass in the result. It adds up with the background. Then things are processed & stored. +# """ +# raise NotImplementedError("build_result is not implemented.") + +# # TODO: Find some ways to generalise some qualities of the result. IE. Some backgrounds for it in models? +# # Need to Align + +# ASpecificResult = TypeVar("ASpecificResult", bound=Result) + class Implementation(ABC, Generic[ASpecificTask]): def __init__(self, target_task: ASpecificTask) -> None: @@ -119,7 +135,7 @@ class Experiment(ABC, Generic[ASpecificTask, ASpecificImp]): The experiment is a sequence of tasks and the implementations of the tasks after generated by the TaskGenerator. 
""" - def __init__(self, sub_tasks: Sequence[ASpecificTask]) -> None: + def __init__(self, sub_tasks: Sequence[ASpecificTask], result: Any) -> None: self.sub_tasks = sub_tasks self.sub_implementations: Sequence[ASpecificImp] = [None for _ in self.sub_tasks] self.based_experiments: Sequence[Experiment] = [] @@ -128,7 +144,6 @@ def __init__(self, sub_tasks: Sequence[ASpecificTask]) -> None: TaskOrExperiment = TypeVar("TaskOrExperiment", Task, Experiment) - class Loader(ABC, Generic[TaskOrExperiment]): @abstractmethod def load(self, *args, **kwargs) -> TaskOrExperiment: diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index 93e204536..f54df630c 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -3,11 +3,12 @@ """ from abc import ABC, abstractmethod -from typing import Dict, Generic, List, Tuple, TypeVar +from typing import Dict, Generic, List, Tuple, TypeVar, Any from rdagent.core.evaluation import Feedback from rdagent.core.experiment import Experiment from rdagent.core.scenario import Scenario +from rdagent.core.experiment import ASpecificTask # class data_ana: XXX @@ -54,6 +55,12 @@ def __init__(self, scen: ASpecificScen) -> None: self.scen: ASpecificScen = scen self.hist: list[Tuple[Hypothesis, Experiment, HypothesisFeedback]] = [] + def get_last_experiment_info(self) -> Tuple[Hypothesis, ASpecificTask, Any]: + """Access the last experiment result, sub-task, and the corresponding hypothesis.""" + last_hypothesis, last_experiment, _ = self.hist[-1] + last_task = last_experiment.sub_tasks[-1] + last_result = last_experiment.result + return last_hypothesis, last_task, last_result class HypothesisGen: def __init__(self, scen: Scenario): diff --git a/rdagent/scenarios/qlib/ model_proposal.py b/rdagent/scenarios/qlib/ model_proposal.py new file mode 100644 index 000000000..52f65de4a --- /dev/null +++ b/rdagent/scenarios/qlib/ model_proposal.py @@ -0,0 +1,78 @@ +import json +from pathlib import Path +from typing import List, Tuple + +from jinja2 import Environment, StrictUndefined + +from rdagent.components.coder.factor_coder.factor import FactorExperiment, FactorTask +from rdagent.components.coder.factor_coder.utils import get_data_folder_intro +from rdagent.components.proposal.factor_proposal import ( + FactorHypothesis, + FactorHypothesis2Experiment, + FactorHypothesisGen, +) +from rdagent.core.prompts import Prompts +from rdagent.core.proposal import HypothesisSet, Scenario, Trace + +prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + +QlibFactorHypothesis = FactorHypothesis + + +class QlibFactorHypothesisGen(FactorHypothesisGen): + def __init__(self, scen: Scenario) -> Tuple[dict, bool]: + super().__init__(scen) + + def prepare_context(self, trace: Trace) -> None: + hypothesis_feedback = ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + context_dict = { + "hypothesis_and_feedback": hypothesis_feedback, + "RAG": ..., + "hypothesis_output_format": prompt_dict["hypothesis_output_format"], + } + return context_dict, True + + def convert_response(self, response: str) -> FactorHypothesis: + response_dict = json.loads(response) + hypothesis = QlibFactorHypothesis(hypothesis=response_dict["hypothesis"], reason=response_dict["reason"]) + return hypothesis + + +class QlibFactorHypothesis2Experiment(FactorHypothesis2Experiment): + def prepare_context(self, hs: HypothesisSet) -> Tuple[dict | bool]: + scenario = hs.trace.scen.get_scenario_all_desc() + 
experiment_output_format = prompt_dict["experiment_output_format"] + + hypothesis_and_feedback = ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=hs.trace) + ) + + experiment_list: List[FactorExperiment] = [t[1] for t in hs.trace.hist] + + factor_list = [] + for experiment in experiment_list: + factor_list.extend(experiment.sub_tasks) + + return { + "scenario": scenario, + "hypothesis_and_feedback": hypothesis_and_feedback, + "experiment_output_format": experiment_output_format, + "factor_list": factor_list, + "RAG": ..., + }, True + + def convert_response(self, response: str) -> FactorExperiment: + response_dict = json.loads(response) + tasks = [] + for factor_name in response_dict: + description = response_dict[factor_name]["description"] + formulation = response_dict[factor_name]["formulation"] + variables = response_dict[factor_name]["variables"] + tasks.append(FactorTask(factor_name, description, formulation, variables)) + return FactorExperiment(tasks) diff --git a/rdagent/scenarios/qlib/experiment/model_experiment.py b/rdagent/scenarios/qlib/experiment/model_experiment.py index 774eec6d8..cc02fb637 100644 --- a/rdagent/scenarios/qlib/experiment/model_experiment.py +++ b/rdagent/scenarios/qlib/experiment/model_experiment.py @@ -1 +1,38 @@ -# TODO define QlibModelExperiment here which should be subclass of Experiment +from pathlib import Path +from rdagent.components.coder.model_coder.model import ModelExperiment +from rdagent.core.prompts import Prompts +from rdagent.core.scenario import Scenario + +prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + +QlibModelExperiment = ModelExperiment + +class QlibModelScenario(Scenario): + @property + def background(self) -> str: + return prompt_dict["qlib_model_background"] + + @property + def output_format(self) -> str: + return prompt_dict["qlib_model_output_format"] + + @property + def interface(self) -> str: + return prompt_dict["qlib_model_interface"] + + @property + def simulator(self) -> str: + return prompt_dict["qlib_model_simulator"] + + def get_scenario_all_desc(self) -> str: + return f"""Background of the scenario: +{self.background} +The source data you can use: +{self.source_data} +The interface you should follow to write the runnable code: +{self.interface} +The output of your code should be in the format: +{self.output_format} +The simulator user can use to test your model: +{self.simulator} +""" diff --git a/rdagent/scenarios/qlib/experiment/prompts.yaml b/rdagent/scenarios/qlib/experiment/prompts.yaml index b66723db2..21e4a7d84 100644 --- a/rdagent/scenarios/qlib/experiment/prompts.yaml +++ b/rdagent/scenarios/qlib/experiment/prompts.yaml @@ -44,4 +44,31 @@ qlib_factor_simulator: |- 1. generate a new factor table based on the factor values. 2. train a model like LightGBM, CatBoost, LSTM or simple PyTorch model to predict the next several days return based on the factor values. 3. build a portfolio based on the predicted return based on a strategy. - 4. evaluate the portfolio's performance including the return, sharpe ratio, max drawdown, and so on. \ No newline at end of file + 4. evaluate the portfolio's performance including the return, sharpe ratio, max drawdown, and so on. + +qlib_model_background: |- + The model is a machine learning or deep learning structure used in quantitative investment to predict the returns and risks of a portfolio or a single asset. 
Models are employed by investors to generate forecasts based on historical data and identified factors, which are central to many quantitative investment strategies. + Each model takes the factors as input and predicts the future returns. + The model is defined in the following parts: + 1. Name: The name of the model. + 2. Description: The description of the model. + 3. Architecture: The architecture of the model, such as neural network layers or tree structures. + 4. Hyperparameters: The hyperparameters used in the model, such as learning rate, number of epochs, etc. + The model should provide clear documentation of its architecture and hyperparameters. One model should statically define one output with a fixed architecture and hyperparameters. For example, a model with an LSTM layer and a model with a GRU layer should be considered two different models. + +qlib_model_interface: |- + Your python code should follow the interface to better interact with the user's system. + Your python code should contain the following parts: the import part, the class definition part, and the main part. You should define a class named "Net" and include methods for initialization, training, and prediction. The main part should instantiate this class and call its methods. Don't write any try-except block in your python code. The user will catch the exception message and provide the feedback to you. + +qlib_model_output_format: |- + TO DO. + +qlib_model_simulator: |- + The models will be sent into Qlib to train and evaluate their performance in predicting future returns based on historical factor values. + Qlib is an AI-oriented quantitative investment platform that aims to realize the potential, empower research, and create value using AI technologies in quantitative investment, from exploring ideas to implementing productions. Qlib supports diverse machine learning modeling paradigms, including supervised learning, market dynamics modeling, and reinforcement learning (RL). + User will use Qlib to automatically perform the following tasks: + 1. Generate a new model table based on the factor values. + 2. Train a model like LightGBM, CatBoost, LSTM, or a simple PyTorch model to predict the next several days' returns based on the factor values. + 3. Build a portfolio based on the predicted returns using a specific strategy. + 4. Evaluate the portfolio's performance, including metrics such as return, Sharpe ratio, max drawdown, and others. + 5. Iterate on model improvements based on performance evaluations and feedback. diff --git a/rdagent/scenarios/qlib/model_task_implementation/__init__.py b/rdagent/scenarios/qlib/model_task_implementation/__init__.py index fad4c2842..5b5fe1c39 100644 --- a/rdagent/scenarios/qlib/model_task_implementation/__init__.py +++ b/rdagent/scenarios/qlib/model_task_implementation/__init__.py @@ -1,5 +1,99 @@ from rdagent.components.coder.model_coder.one_shot import ModelCodeWriter +import json +import os +from rdagent.oai.llm_utils import APIBackend +from rdagent.core.prompts import Prompts +from rdagent.core.task_generator import TaskGenerator +from rdagent.oai.llm_utils import APIBackend +import re +from pathlib import Path +from typing import Sequence +from jinja2 import Environment, StrictUndefined +from rdagent.components.coder.model_coder.model import ( + ModelExperiment, + ModelImplementation, + ModelTask, +) -class QlibModelCodeWriter(ModelCodeWriter): - ... 
+from rdagent.core.prompts import Prompts +from rdagent.core.task_generator import TaskGenerator +from rdagent.oai.llm_utils import APIBackend + +class QlibModelCodeWriter(TaskGenerator[ModelExperiment]): + def generate(self, exp: ModelExperiment) -> ModelExperiment: + mti_l = [] + for t in exp.sub_tasks: + mti = ModelImplementation(t) + mti.prepare() + # Define the system prompt + sys_prompt = ( + "You are a professional code writing assistant. You will receive some hypotheses and " + "you will be in charge of writing code to execute them. Follow best practices and ensure the code is correct and efficient." + ) + + # Define the user prompt + usr_prompt = (''' + "Generate a json file with two keys: [code] & [explanation]. " + "Using the following information, write a Python code using PyTorch / Torch Geometric to implement the model. Generate the class Net(nn.Module) " + "This model is in the quantitative investment field and has only one layer. " + "Implement the model forward function based on the provided model formula information:\n" + "Hypothesis: The data shows time-series quality.\n" + "You must complete the forward function as far as you can." + Example Output: + { + "code": "import torch\nimport torch.nn as nn\n\nclass Net(nn.Module):\n def __init__(self, input_dim, output_dim=1, layers=(256,), act=\"LeakyReLU\"):\n super(Net, self).__init__()\n layers = [input_dim] + list(layers)\n dnn_layers = []\n drop_input = nn.Dropout(0.05)\n dnn_layers.append(drop_input)\n hidden_units = input_dim\n for i, (_input_dim, hidden_units) in enumerate(zip(layers[:-1], layers[1:])):\n fc = nn.Linear(_input_dim, hidden_units)\n if act == \"LeakyReLU\":\n activation = nn.LeakyReLU(negative_slope=0.1, inplace=False)\n elif act == \"SiLU\":\n activation = nn.SiLU()\n else:\n raise NotImplementedError(f\"This type of input is not supported\")\n bn = nn.BatchNorm1d(hidden_units)\n seq = nn.Sequential(fc, bn, activation)\n dnn_layers.append(seq)\n drop_input = nn.Dropout(0.05)\n dnn_layers.append(drop_input)\n fc = nn.Linear(hidden_units, output_dim)\n dnn_layers.append(fc)\n self.dnn_layers = nn.ModuleList(dnn_layers)\n self._weight_init()\n\n def _weight_init(self):\n for m in self.modules():\n if isinstance(m, nn.Linear):\n nn.init.kaiming_normal_(m.weight, a=0.1, mode=\"fan_in\", nonlinearity=\"leaky_relu\")\n\n def forward(self, x):\n cur_output = x\n for i, now_layer in enumerate(self.dnn_layers):\n cur_output = now_layer(cur_output)\n return cur_output", + "explanation": "This Python code defines a flexible neural network class 'Net' using PyTorch. It's designed to be modular with customizable input dimensions, output dimensions, layer configurations, and activation functions. The network begins with a dropout layer, followed by several hidden layers dynamically created based on the 'layers' tuple. Each hidden layer consists of a linear transformation, batch normalization, and an activation function, which can be either LeakyReLU or SiLU. Dropout layers are added before and after the sequence of hidden layers to prevent overfitting. The class also includes a custom weight initialization method, initializing weights using the Kaiming normal method, which is suited for layers followed by LeakyReLU activations. This model could be effectively used in quantitative investment strategies to process time-series data or for general deep learning tasks requiring robust feature extraction and nonlinear transformations." 
+ } + + ''' + ) + try: + # Call the APIBackend to generate the response + response = APIBackend().build_messages_and_create_chat_completion( + user_prompt=usr_prompt, + system_prompt=sys_prompt, + json_mode=True, + ) + + # Log the raw response for debugging + print("Raw Response:\n", response) + + # Parse the JSON response to extract the code + response_json = json.loads(response) + code = response_json.get("code", "No code generated") + explanation = response_json.get("explanation", "No explanation provided") + + print("Generated Code:\n", code) + print("\nExplanation:\n", explanation) + + # Write the generated code to model.py + output_dir = '/home/v-xisenwang/RD-Agent/test/utils/env_tpl' + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.join(output_dir, 'model.py') + + with open(output_file, 'w') as f: + f.write(code) + print(f"Code has been written to {output_file}") + + except json.JSONDecodeError as e: + print("Error parsing JSON response from LLM:", e) + except Exception as e: + print("An unexpected error occurred:", e) + + mti.inject_code(**{"model.py": code}) + mti_l.append(mti) + exp.sub_implementations = mti_l + return exp + + def print_model_details(): + # Create an instance of APIBackend + api_backend = APIBackend() + + # Check which model is being used + if api_backend.use_llama2: + print("Using LLaMA 2") + elif api_backend.use_gcr_endpoint: + print(f"Using GCR endpoint: {api_backend.gcr_endpoint}") + else: + print(f"Using Azure model: {api_backend.chat_model}") diff --git a/rdagent/scenarios/qlib/model_task_implementation/better_model.py b/rdagent/scenarios/qlib/model_task_implementation/better_model.py new file mode 100644 index 000000000..a91d37b4f --- /dev/null +++ b/rdagent/scenarios/qlib/model_task_implementation/better_model.py @@ -0,0 +1,89 @@ +import json +from rdagent.oai.llm_utils import APIBackend + +def better_model_generate(feedback, hypothesis, code): + # Define the system prompt for generating better code + sys_prompt_better_code = ( + "You are a professional code improvement assistant. You will receive a hypothesis, initial code, and feedback on that code. " + "Your task is to generate an improved version of the code based on the feedback, following best practices and ensuring the code is correct and efficient." + "Example JSON Structure for Improved Code: " + '{"code": "Your improved code here", "explanation": "Detailed explanation of the improvements made"}' + 'Remember, you must generate a class called Net' + ) + + # Define the user prompt for generating better code + usr_prompt_better_code = f''' + "Given the following hypothesis, initial code, and feedback, provide an improved version of the code based on the feedback." + "Hypothesis: {hypothesis}\n" + "Initial Code:\n```python\n{code}\n```\n" + "Feedback: {feedback}\n" + "Please provide an improved version of the code and explain the improvements made." 
+ ''' + + try: + # Call the APIBackend to generate the response for improved code + response_better_code = APIBackend().build_messages_and_create_chat_completion( + user_prompt=usr_prompt_better_code, + system_prompt=sys_prompt_better_code, + json_mode=True, + ) + + # Log the raw response for debugging + print("Raw Response for Improved Code:\n", response_better_code) + + # Parse the JSON response to extract the improved code and explanation + response_json_better_code = json.loads(response_better_code) + improved_code = response_json_better_code.get("code", "No code generated") + explanation = response_json_better_code.get("explanation", "No explanation provided") + + print("Generated Improved Code:\n", improved_code) + print("\nExplanation:\n", explanation) + + return improved_code, explanation + + except json.JSONDecodeError as e: + print("Error parsing JSON response from LLM for improved code:", e) + except Exception as e: + print("An unexpected error occurred while generating improved code:", e) + +# Example usage +if __name__ == "__main__": + feedback = """ + To make the model more suitable for analyzing time-series data and supporting the hypothesis that 'the data shows time-series quality', consider the following modifications: + 1. Integrate RNN, LSTM, or GRU layers: These layers are specifically designed to handle sequential data and can capture temporal dependencies in a time-series. Adding one or more of these layers could enable the model to learn from the temporal structure of the data. + 2. Utilize 1D Convolutional Layers: 1D CNNs can be effective in identifying temporal patterns in time-series data when configured properly. They can serve as feature extractors that highlight important time-dependent characteristics in the data. + 3. Implement Time-Series Specific Preprocessing: Depending on the nature of the time-series, it might be beneficial to include preprocessing steps such as detrending, normalization, or differencing to make the series stationary. This could improve model performance by simplifying the patterns the network needs to learn. + 4. Explore Time-Distributed Layers: For deep learning models that involve multiple sequential or convolutional layers, applying the same operation at each time step (time-distributed processing) can be beneficial for capturing complex temporal relationships. + Overall, adapting the model to incorporate some of these suggestions could enhance its ability to analyze and make predictions based on time-series data, thus better supporting the hypothesis. + """ + + code = ''' +import torch +import torch.nn as nn + +class Net(nn.Module): + def __init__(self, input_dim, output_dim=1, act="LeakyReLU"): + super(Net, self).__init__() + self.drop_input = nn.Dropout(0.05) + self.fc = nn.Linear(input_dim, output_dim) + if act == "LeakyReLU": + self.activation = nn.LeakyReLU(negative_slope=0.1, inplace=False) + elif act == "SiLU": + self.activation = nn.SiLU() + else: + raise NotImplementedError(f"Activation function {act} is not supported") + self.bn = nn.BatchNorm1d(output_dim) + + def forward(self, x): + x = self.drop_input(x) + x = self.fc(x) + x = self.bn(x) + x = self.activation(x) + return x + ''' + + hypothesis = "The data shows time-series quality." 
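+    # Feed the review feedback back to the LLM and ask for a revised Net implementation plus an explanation of the changes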
+ + improved_code, explanation = better_model_generate(feedback, hypothesis, code) + print("Final Improved Code:\n", improved_code) + print("Explanation of Improvements:\n", explanation) diff --git a/rdagent/scenarios/qlib/model_task_implementation/test.py b/rdagent/scenarios/qlib/model_task_implementation/test.py new file mode 100644 index 000000000..be2820091 --- /dev/null +++ b/rdagent/scenarios/qlib/model_task_implementation/test.py @@ -0,0 +1,76 @@ +import json +import os +from rdagent.oai.llm_utils import APIBackend + +def test_api_backend(): + # Define the system prompt + sys_prompt = ( + "You are a professional code writing assistant. You will receive some hypotheses and " + "you will be in charge of writing code to execute them. Follow best practices and ensure the code is correct and efficient." + ) + + # Define the user prompt + usr_prompt = (''' + "Generate a json file with two keys: [code] & [explanation]. " + "Using the following information, write a Python code using PyTorch / Torch Geometric to implement the model. Generate the class Net(nn.Module) " + "This model is in the quantitative investment field and has only one layer. " + "Implement the model forward function based on the provided model formula information:\n" + "Hypothesis: The data shows time-series quality.\n" + "You must complete the forward function as far as you can." + Example Output: + { + "code": "import torch\nimport torch.nn as nn\n\nclass Net(nn.Module):\n def __init__(self, input_dim, output_dim=1, layers=(256,), act=\"LeakyReLU\"):\n super(Net, self).__init__()\n layers = [input_dim] + list(layers)\n dnn_layers = []\n drop_input = nn.Dropout(0.05)\n dnn_layers.append(drop_input)\n hidden_units = input_dim\n for i, (_input_dim, hidden_units) in enumerate(zip(layers[:-1], layers[1:])):\n fc = nn.Linear(_input_dim, hidden_units)\n if act == \"LeakyReLU\":\n activation = nn.LeakyReLU(negative_slope=0.1, inplace=False)\n elif act == \"SiLU\":\n activation = nn.SiLU()\n else:\n raise NotImplementedError(f\"This type of input is not supported\")\n bn = nn.BatchNorm1d(hidden_units)\n seq = nn.Sequential(fc, bn, activation)\n dnn_layers.append(seq)\n drop_input = nn.Dropout(0.05)\n dnn_layers.append(drop_input)\n fc = nn.Linear(hidden_units, output_dim)\n dnn_layers.append(fc)\n self.dnn_layers = nn.ModuleList(dnn_layers)\n self._weight_init()\n\n def _weight_init(self):\n for m in self.modules():\n if isinstance(m, nn.Linear):\n nn.init.kaiming_normal_(m.weight, a=0.1, mode=\"fan_in\", nonlinearity=\"leaky_relu\")\n\n def forward(self, x):\n cur_output = x\n for i, now_layer in enumerate(self.dnn_layers):\n cur_output = now_layer(cur_output)\n return cur_output", + "explanation": "This Python code defines a flexible neural network class 'Net' using PyTorch. It's designed to be modular with customizable input dimensions, output dimensions, layer configurations, and activation functions. The network begins with a dropout layer, followed by several hidden layers dynamically created based on the 'layers' tuple. Each hidden layer consists of a linear transformation, batch normalization, and an activation function, which can be either LeakyReLU or SiLU. Dropout layers are added before and after the sequence of hidden layers to prevent overfitting. The class also includes a custom weight initialization method, initializing weights using the Kaiming normal method, which is suited for layers followed by LeakyReLU activations. 
This model could be effectively used in quantitative investment strategies to process time-series data or for general deep learning tasks requiring robust feature extraction and nonlinear transformations." +} + + ''' + ) + + try: + # Call the APIBackend to generate the response + response = APIBackend().build_messages_and_create_chat_completion( + user_prompt=usr_prompt, + system_prompt=sys_prompt, + json_mode=True, + ) + + # Log the raw response for debugging + print("Raw Response:\n", response) + + # Parse the JSON response to extract the code + response_json = json.loads(response) + code = response_json.get("code", "No code generated") + explanation = response_json.get("explanation", "No explanation provided") + + print("Generated Code:\n", code) + print("\nExplanation:\n", explanation) + + # Write the generated code to model.py + output_dir = '/home/v-xisenwang/RD-Agent/test/utils/env_tpl' + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.join(output_dir, 'model.py') + + with open(output_file, 'w') as f: + f.write(code) + print(f"Code has been written to {output_file}") + + except json.JSONDecodeError as e: + print("Error parsing JSON response from LLM:", e) + except Exception as e: + print("An unexpected error occurred:", e) + +def print_model_details(): + # Create an instance of APIBackend + api_backend = APIBackend() + + # Check which model is being used + if api_backend.use_llama2: + print("Using LLaMA 2") + elif api_backend.use_gcr_endpoint: + print(f"Using GCR endpoint: {api_backend.gcr_endpoint}") + else: + print(f"Using Azure model: {api_backend.chat_model}") + +if __name__ == "__main__": + print_model_details() + test_api_backend() diff --git a/rdagent/scenarios/qlib/task_generator/feedback_test.json b/rdagent/scenarios/qlib/task_generator/feedback_test.json new file mode 100644 index 000000000..2def1a040 --- /dev/null +++ b/rdagent/scenarios/qlib/task_generator/feedback_test.json @@ -0,0 +1,61 @@ + + { + "feedback": { + "code_evaluation": { + "relevance_to_hypothesis": "The provided code defines a neural network model in PyTorch, but it does not directly support or refute the hypothesis that 'The data shows time-series quality.' The hypothesis pertains to the characteristics of the data, suggesting it may be sequential or have temporal dependencies. However, the code implements a basic neural network architecture without specific adaptations for time-series analysis (e.g., recurrent layers like LSTM or GRU, or convolutional layers designed for sequence processing).", + "code_structure": "The structure of the code is clear and follows good object-oriented programming practices by defining a neural network model class that inherits from `nn.Module`. The choice of making the activation function configurable is good for flexibility. 
Error handling for unsupported activation functions is also a positive aspect.", + "improvement_suggestions": { + "1": "To better align with the hypothesis about time-series data, consider incorporating layers that are specifically designed for sequence processing, such as LSTM (Long Short-Term Memory) or GRU (Gated Recurrent Unit) layers if the data is indeed time-series.", + "2": "Include comments or documentation within the code to explain the choice of architecture, especially if there are reasons to believe this architecture is suitable for time-series data (if that's the case).", + "3": "Consider evaluating and mentioning other metrics beyond accuracy, such as precision, recall, or F1 scores, which might provide more insight into the model's performance on time-series data.", + "4": "Experiment with different dropout rates and batch normalization positions to optimize the model for potentially sequential data." + } + }, + "result_evaluation": { + "relevance_to_hypothesis": "An accuracy of 85% on the validation set indicates the model's performance level but does not directly validate or invalidate the hypothesis about the data showing 'time-series quality.' Evaluating a model's accuracy alone is insufficient to conclude about the nature of the data (e.g., whether it effectively captures time dependencies).", + "considerations_for_improvement": { + "1": "To better support the hypothesis, it would be beneficial to include analysis that directly investigates the time-series characteristics of the data, such as autocorrelation plots, trend decomposition, or evaluating model performance with and without considering the temporal order of data points.", + "2": "If possible, compare the current model's performance with models known to be effective for time-series data, such as RNNs, LSTMs, or CNNs for sequence data, to demonstrate whether a time-series approach significantly improves model accuracy." + } + } + } +}, +{ + "feedback": { + "hypothesis_relevance": "The hypothesis regarding 'time-series quality' is not directly addressed or supported by the provided results and code. The results focus on financial metrics (mean, standard deviation, annualized return, information ratio, max drawdown) which are crucial for evaluating the performance of a financial strategy or model but do not inherently relate to the quality of time-series data. To assess time-series quality, aspects such as stationarity, autocorrelation, missing values, or noise levels should be considered. Therefore, the hypothesis and the provided evidence seem misaligned.", + "analysis_of_results": "The analysis provided in the result section is thorough regarding the performance comparison of a financial strategy or benchmark at two different times or under two different conditions. However, it lacks a direct connection to evaluating time-series data quality. While the results effectively demonstrate changes in financial performance metrics over time, which could indirectly suggest variations in market conditions or strategy effectiveness, they do not provide insights into the time-series data quality itself. For a more relevant evaluation of the hypothesis, the analysis should include metrics or tests specific to time-series analysis.", + "code_review": { + "general": "The provided Python code defines a neural network architecture using PyTorch. This network includes an input dropout layer, a fully connected layer, batch normalization, and activation functions that can be chosen. 
While well-structured for modeling purposes, there's no direct evidence that it's been used to evaluate or improve the 'time-series quality' per the hypothesis. Additionally, without seeing how the network is applied to data and how its outputs relate to the discussed financial metrics, it's challenging to assess its effectiveness or relevance to the hypothesis.", + "potential_improvements": { + "clarification_needed": "It would help to clarify how this neural network model relates to assessing or improving time-series data quality. If the model's purpose is to predict or analyze financial time series, details on how it's trained, validated, and its prediction performance metrics could establish a stronger link to the hypothesis.", + "additional_features": "Incorporating layers or features specifically designed for time-series analysis, such as LSTM (Long Short-Term Memory) or GRU (Gated Recurrent Unit) cells, might make the model more suited for handling time-series data complexities.", + "documentation_and_comments": "Adding comments and documentation within the code would aid understanding of its purpose, functionality, and how it should be applied. This is especially important for reviewers or collaborators who might not be familiar with neural network architectures or PyTorch." + } + }, + "conclusion": "While the results provide a comprehensive comparison of financial metrics over two periods or conditions, they do not directly support or refute the hypothesis about time-series data quality. The provided neural network code lacks a clear connection to this hypothesis. For a more accurate assessment, future analyses should include metrics specific to time-series quality and clarify how models like the one provided are applied to understand or enhance this aspect." + } +} +Generated Feedback: + {'hypothesis_relevance': "The hypothesis regarding 'time-series quality' is not directly addressed or supported by the provided results and code. The results focus on financial metrics (mean, standard deviation, annualized return, information ratio, max drawdown) which are crucial for evaluating the performance of a financial strategy or model but do not inherently relate to the quality of time-series data. To assess time-series quality, aspects such as stationarity, autocorrelation, missing values, or noise levels should be considered. Therefore, the hypothesis and the provided evidence seem misaligned.", 'analysis_of_results': 'The analysis provided in the result section is thorough regarding the performance comparison of a financial strategy or benchmark at two different times or under two different conditions. However, it lacks a direct connection to evaluating time-series data quality. While the results effectively demonstrate changes in financial performance metrics over time, which could indirectly suggest variations in market conditions or strategy effectiveness, they do not provide insights into the time-series data quality itself. For a more relevant evaluation of the hypothesis, the analysis should include metrics or tests specific to time-series analysis.', 'code_review': {'general': "The provided Python code defines a neural network architecture using PyTorch. This network includes an input dropout layer, a fully connected layer, batch normalization, and activation functions that can be chosen. While well-structured for modeling purposes, there's no direct evidence that it's been used to evaluate or improve the 'time-series quality' per the hypothesis. 
Additionally, without seeing how the network is applied to data and how its outputs relate to the discussed financial metrics, it's challenging to assess its effectiveness or relevance to the hypothesis.", 'potential_improvements': {'clarification_needed': "It would help to clarify how this neural network model relates to assessing or improving time-series data quality. If the model's purpose is to predict or analyze financial time series, details on how it's trained, validated, and its prediction performance metrics could establish a stronger link to the hypothesis.", 'additional_features': 'Incorporating layers or features specifically designed for time-series analysis, such as LSTM (Long Short-Term Memory) or GRU (Gated Recurrent Unit) cells, might make the model more suited for handling time-series data complexities.', 'documentation_and_comments': 'Adding comments and documentation within the code would aid understanding of its purpose, functionality, and how it should be applied. This is especially important for reviewers or collaborators who might not be familiar with neural network architectures or PyTorch.'}}, 'conclusion': 'While the results provide a comprehensive comparison of financial metrics over two periods or conditions, they do not directly support or refute the hypothesis about time-series data quality. The provided neural network code lacks a clear connection to this hypothesis. For a more accurate assessment, future analyses should include metrics specific to time-series quality and clarify how models like the one provided are applied to understand or enhance this aspect.'} +Final Feedback: + {'hypothesis_relevance': "The hypothesis regarding 'time-series quality' is not directly addressed or supported by the provided results and code. The results focus on financial metrics (mean, standard deviation, annualized return, information ratio, max drawdown) which are crucial for evaluating the performance of a financial strategy or model but do not inherently relate to the quality of time-series data. To assess time-series quality, aspects such as stationarity, autocorrelation, missing values, or noise levels should be considered. Therefore, the hypothesis and the provided evidence seem misaligned.", 'analysis_of_results': 'The analysis provided in the result section is thorough regarding the performance comparison of a financial strategy or benchmark at two different times or under two different conditions. However, it lacks a direct connection to evaluating time-series data quality. While the results effectively demonstrate changes in financial performance metrics over time, which could indirectly suggest variations in market conditions or strategy effectiveness, they do not provide insights into the time-series data quality itself. For a more relevant evaluation of the hypothesis, the analysis should include metrics or tests specific to time-series analysis.', 'code_review': {'general': "The provided Python code defines a neural network architecture using PyTorch. This network includes an input dropout layer, a fully connected layer, batch normalization, and activation functions that can be chosen. While well-structured for modeling purposes, there's no direct evidence that it's been used to evaluate or improve the 'time-series quality' per the hypothesis. 
Additionally, without seeing how the network is applied to data and how its outputs relate to the discussed financial metrics, it's challenging to assess its effectiveness or relevance to the hypothesis.", 'potential_improvements': {'clarification_needed': "It would help to clarify how this neural network model relates to assessing or improving time-series data quality. If the model's purpose is to predict or analyze financial time series, details on how it's trained, validated, and its prediction performance metrics could establish a stronger link to the hypothesis.", 'additional_features': 'Incorporating layers or features specifically designed for time-series analysis, such as LSTM (Long Short-Term Memory) or GRU (Gated Recurrent Unit) cells, might make the model more suited for handling time-series data complexities.', 'documentation_and_comments': 'Adding comments and documentation within the code would aid understanding of its purpose, functionality, and how it should be applied. This is especially important for reviewers or collaborators who might not be familiar with neural network architectures or PyTorch.'}}, 'conclusion': 'While the results provide a comprehensive comparison of financial metrics over two periods or conditions, they do not directly support or refute the hypothesis about time-series data quality. The provided neural network code lacks a clear connection to this hypothesis. For a more accurate assessment, future analyses should include metrics specific to time-series quality and clarify how models like the one provided are applied to understand or enhance this aspect.'} + + { + "Feedback": { + "General": "The hypothesis provided is very broad and does not specify a clear, testable statement related to the code or the results. 'The data shows time-series quality' could be interpreted in many ways, such as having a predictable pattern, seasonality, or other characteristics typical of time-series data. However, the results focus on financial performance metrics rather than qualities inherent to the data itself.", + "Code Review": { + "Observations": "The code implements a simple neural network model with an option to select activation functions. There are no explicit features in the provided code that directly assess or exploit 'time-series quality' as hypothesized. The model architecture is generic and does not include elements specifically designed for time-series analysis (e.g., recurrent layers or convolutional layers suitable for sequential data).", + "Suggestions": "To better align with the hypothesis, consider incorporating elements into the model that are specifically designed for time-series analysis, such as LSTM or GRU layers if working with sequences, or 1D CNN layers for temporal patterns. Additionally, incorporating time-series specific features or preprocessing steps could make the evaluation more relevant to the hypothesis." + }, + "Result Analysis": { + "Observations": { + "Overall": "The results indicate a decline in performance based on several financial metrics when comparing the current results to previous ones. This includes lower returns, higher risk (as indicated by increased max drawdown and volatility), and lower efficiency (as indicated by the information ratio).", + "Relevance to Hypothesis": "The results do not directly address the hypothesis about 'time-series quality'. Instead, they focus on the financial performance of a strategy or model. 
Without further context or analysis, it's difficult to determine how these results relate to the quality of the underlying time-series data." + }, + "Suggestions": "To make the results more relevant to the hypothesis, include an analysis of how the model's predictions or performance relate to characteristics of time-series data being considered. This could involve analyzing prediction accuracy over time, assessing model performance on different time scales, or examining how the model handles common time-series challenges like seasonality and trend." + }, + "Conclusion": { + "Summary": "Based on the provided information, it is difficult to directly support or refute the hypothesis since it and the results address different aspects. The hypothesis speaks to qualities of time-series data, whereas the results focus on financial performance metrics of a particular implementation. To accurately test the hypothesis, a more detailed exploration of how the model interacts with and leverages time-series data is necessary." + } + } + } \ No newline at end of file diff --git a/rdagent/scenarios/qlib/task_generator/overall_feedback.py b/rdagent/scenarios/qlib/task_generator/overall_feedback.py new file mode 100644 index 000000000..be03648a7 --- /dev/null +++ b/rdagent/scenarios/qlib/task_generator/overall_feedback.py @@ -0,0 +1,160 @@ +import json +from rdagent.oai.llm_utils import APIBackend + +def generate_hypothesis_feedback(result, hypothesis, context): + # Define the system prompt for hypothesis feedback + sys_prompt_hypothesis = ( + "You are a professional result analysis assistant. You will receive a result and a hypothesis. " + "Your task is to provide feedback on how well the result supports or refutes the hypothesis. " + "Please provide detailed and constructive feedback. " + "Example JSON Structure for Result Analysis: " + '{"Observations": "Your overall observations here", "Feedback for Hypothesis": "Observations related to the hypothesis", ' + '"New Hypothesis": "Put your new hypothesis here.", "Reasoning": "Provide reasoning for the hypothesis here."}' + ) + + # Define the user prompt for hypothesis feedback + usr_prompt_hypothesis = f''' + We are in an experiment of finding hypothesis and validating or rejecting them so that in the end we have a powerful model generated. + Here are the context: {context}. + Now let's come to this round. + "Given the following hypothesis and result, provide feedback on how well the result supports or refutes the hypothesis. " + "Hypothesis: {hypothesis}\n" + "Result: {result}\n" + "Please provide detailed and constructive feedback." 
+ ''' + + try: + # Call the APIBackend to generate the response for hypothesis feedback + response_hypothesis = APIBackend().build_messages_and_create_chat_completion( + user_prompt=usr_prompt_hypothesis, + system_prompt=sys_prompt_hypothesis, + json_mode=True, + ) + + # Log the raw response for debugging + print("Raw Response for Hypothesis Feedback:\n", response_hypothesis) + + # Parse the JSON response to extract the feedback + response_json_hypothesis = json.loads(response_hypothesis) + hypothesis_feedback = { + "Observations": response_json_hypothesis.get("Observations", "No observations provided"), + "Feedback for Hypothesis": response_json_hypothesis.get("Feedback for Hypothesis", "No feedback provided"), + "New Hypothesis": response_json_hypothesis.get("New Hypothesis", "No new hypothesis provided"), + "Reasoning": response_json_hypothesis.get("Reasoning", "No reasoning provided"), + } + + print("Generated Hypothesis Feedback:\n", hypothesis_feedback) + + return hypothesis_feedback + + except json.JSONDecodeError as e: + print("Error parsing JSON response from LLM for hypothesis feedback:", e) + except Exception as e: + print("An unexpected error occurred while generating hypothesis feedback:", e) + +# Example usage +if __name__ == "__main__": + result = "The model achieved an accuracy of 85% on the validation set." + hypothesis = "The data shows time-series quality." + context = "This experiment is aimed at improving time-series prediction accuracy." + + feedback = generate_hypothesis_feedback(result, hypothesis, context) + print("Final Feedback:\n", feedback) + + +def test_generate_feedback(): + # Example data + result = ''' + Previous results are our standard benchmark, and the current result is the result of the new model that follows the hypothesis. + Comparison of Benchmark Return +Previous Results: +Mean: 0.0477% +Std: 1.2295% +Annualized Return: 11.3561% +Information Ratio: 0.5987 +Max Drawdown: -37.0479% +Current Results: +Mean: 0.0161% +Std: 1.4384% +Annualized Return: 3.8237% +Information Ratio: 0.1723 +Max Drawdown: -48.7617% +Observations: +The current benchmark return has lower mean and annualized returns. +Volatility (std) and max drawdown have increased, indicating higher risk. +Information ratio has significantly decreased, suggesting reduced efficiency. +Comparison of Excess Return Without Cost +Previous Results: +Mean: 0.0530% +Std: 0.5718% +Annualized Return: 12.6029% +Information Ratio: 1.4286 +Max Drawdown: -7.2310% +Current Results: +Mean: 0.0265% +Std: 0.4131% +Annualized Return: 6.3043% +Information Ratio: 0.9892 +Max Drawdown: -8.4109% +Observations: +Decrease in mean and annualized return. +Lower standard deviation, indicating reduced volatility. +Decreased information ratio, although still relatively high. +Slight increase in max drawdown. +Comparison of Excess Return With Cost +Previous Results: +Mean: 0.0339% +Std: 0.5717% +Annualized Return: 8.0654% +Information Ratio: 0.9145 +Max Drawdown: -8.6083% +Current Results: +Mean: 0.0098% +Std: 0.4133% +Annualized Return: 2.3216% +Information Ratio: 0.3641 +Max Drawdown: -12.0422% +Observations: +Significant reduction in mean and annualized return. +Lower volatility is present. +Considerable decrease in the information ratio. +Deeper max drawdown, indicating increased risk of significant loss. +Overall Summary +Across all metrics, the previous results generally show better performance, especially in terms of higher returns and more favorable risk-adjusted metrics (information ratios). 
The current results exhibit decreased performance in almost all aspects: lower returns, lower efficiency in generating returns over the risk taken, and in some cases, increased risk as shown by max drawdowns. +These findings suggest that the strategy or market conditions represented in the previous results were more favorable for generating higher, more efficient returns with lower associated risks. It would be beneficial to investigate the factors leading to such variations, such as changes in market dynamics, strategy adjustments, or external economic impacts, to refine future approaches. + ''' + code = ''' +import torch +import torch.nn as nn + +class Net(nn.Module): + def __init__(self, input_dim, output_dim=1, act="LeakyReLU"): + super(Net, self).__init__() + self.drop_input = nn.Dropout(0.05) + self.fc = nn.Linear(input_dim, output_dim) + if act == "LeakyReLU": + self.activation = nn.LeakyReLU(negative_slope=0.1, inplace=False) + elif act == "SiLU": + self.activation = nn.SiLU() + else: + raise NotImplementedError(f"Activation function {act} is not supported") + self.bn = nn.BatchNorm1d(output_dim) + + def forward(self, x): + x = self.drop_input(x) + x = self.fc(x) + x = self.bn(x) + x = self.activation(x) + return x + ''' + hypothesis = "The data shows time-series quality." + + # code_feedback = generate_code_feedback(code, hypothesis) + # print("Final Code Feedback:\n", code_feedback) + + hypothesis_feedback = generate_hypothesis_feedback(result, hypothesis, "") + print("Final Hypothesis Feedback:\n", hypothesis_feedback) + +if __name__ == "__main__": + test_generate_feedback() + diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index d3835943e..f222ca208 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -200,6 +200,7 @@ def prepare(self): super().prepare() qlib_data_path = next(iter(self.conf.extra_volumes.keys())) if not (Path(qlib_data_path) / "qlib_data" / "cn_data").exists(): + print("We are downloading!") cmd = "python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn --interval 1d --delete_old False" self.run(entry=cmd) else: diff --git a/test/utils/env_tpl/conf_mlp.yaml b/test/utils/env_tpl/conf_mlp.yaml new file mode 100644 index 000000000..17087f8d6 --- /dev/null +++ b/test/utils/env_tpl/conf_mlp.yaml @@ -0,0 +1,103 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: [ + { + "class" : "DropCol", + "kwargs":{"col_list": ["VWAP0"]} + }, + { + "class" : "CSZFillna", + "kwargs":{"fields_group": "feature"} + } + ] + learn_processors: [ + { + "class" : "DropCol", + "kwargs":{"col_list": ["VWAP0"]} + }, + { + "class" : "DropnaProcessor", + "kwargs":{"fields_group": "feature"} + }, + "DropnaLabel", + { + "class": "CSZScoreNorm", + "kwargs": {"fields_group": "label"} + } + ] + process_type: "independent" + +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + signal: + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: DNNModelPytorch + module_path: 
qlib.contrib.model.pytorch_nn + kwargs: + loss: mse + lr: 0.002 + optimizer: adam + max_steps: 8000 + batch_size: 8192 + GPU: 0 + weight_decay: 0.0002 + # pt_model_uri: "qlib.contrib.model.pytorch_nn.Net" + # pt_model_uri: "env_tpl.model.Net" + # pt_model_uri: "./model.py:Net" + pt_model_uri: "model.Net" + pt_model_kwargs: + input_dim: 157 + + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/test/utils/env_tpl/model.py b/test/utils/env_tpl/model.py new file mode 100644 index 000000000..6b9863380 --- /dev/null +++ b/test/utils/env_tpl/model.py @@ -0,0 +1,32 @@ +import torch +import torch.nn as nn + +class Net(nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim=1, num_layers=1, act="LeakyReLU", rnn_type="LSTM"): + super(Net, self).__init__() + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.drop_input = nn.Dropout(0.05) + if rnn_type == "LSTM": + self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True) + elif rnn_type == "GRU": + self.rnn = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True) + else: + raise NotImplementedError(f"RNN type {rnn_type} is not supported") + self.fc = nn.Linear(hidden_dim, output_dim) + if act == "LeakyReLU": + self.activation = nn.LeakyReLU(negative_slope=0.1, inplace=False) + elif act == "SiLU": + self.activation = nn.SiLU() + else: + raise NotImplementedError(f"Activation function {act} is not supported") + self.bn = nn.BatchNorm1d(output_dim) + + def forward(self, x): + x = self.drop_input(x) + x, _ = self.rnn(x) + x = x[:, -1, :] + x = self.fc(x) + x = self.bn(x) + x = self.activation(x) + return x \ No newline at end of file diff --git a/test/utils/test_env.py b/test/utils/test_env.py index f5b7850be..1ea2c5311 100644 --- a/test/utils/test_env.py +++ b/test/utils/test_env.py @@ -1,22 +1,33 @@ import os import sys +import subprocess import unittest from pathlib import Path sys.path.append(str(Path(__file__).resolve().parent.parent)) -from rdagent.utils.env import QTDockerEnv, LocalEnv, LocalConf -import shutil +# sys.path.append(str(Path(__file__).resolve().parent.parent)) +from rdagent.utils.env import QTDockerEnv DIRNAME = Path(__file__).absolute().resolve().parent +# # Assuming 'env_tpl' is inside 'utils', which is inside the main project directory +# env_tpl_path = Path('/home/v-xisenwang/RD-Agent/test/utils/env_tpl') +# if env_tpl_path.exists(): +# sys.path.append(str(env_tpl_path.parent)) + +# sys.path.append(str(Path(__file__).resolve().parent / "env_tpl")) + +# Print the current PYTHONPATH for debugging +print("PYTHONPATH:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", sys.path) class EnvUtils(unittest.TestCase): + def setUp(self): pass def tearDown(self): # NOTE: For a docker file, the output are generated with root permission. - # mlrun_p = DIRNAME / "env_tpl" / "mlruns" + # mlrun_p = DIRNAME / "env_tpl" / "mlruns" # if mlrun_p.exists(): # shutil.rmtree(mlrun_p) ... 
@@ -41,13 +52,15 @@ def test_docker(self): And run the docker image with `qrun conf.yaml` """ qtde = QTDockerEnv() + print("It is running the prepare()") qtde.prepare() qtde.prepare() # you can prepare for multiple times. It is expected to handle it correctly # the stdout are returned as result - result = qtde.run(local_path=str(DIRNAME / "env_tpl"), entry="qrun conf.yaml") - - mlrun_p = DIRNAME / "env_tpl" / "mlruns" + # result = qtde.run(local_path=str(DIRNAME / "env_tpl"), entry="qrun conf_mlp.yaml", env={"PYTHONPATH": "/workspace/"}) + result = qtde.run(local_path=str(DIRNAME / "env_tpl"), entry="qrun conf.yaml", env={"PYTHONPATH": "./"}) + + mlrun_p = DIRNAME / "env_tpl" / "mlruns" self.assertTrue(mlrun_p.exists(), f"Expected output file {mlrun_p} not found") # read experiment result = qtde.run(local_path=str(DIRNAME / "env_tpl"), entry="python read_exp.py")
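The review feedback in better_model.py and feedback_test.json repeatedly recommends making Net sequence-aware, for example via recurrent layers or 1D convolutions; test/utils/env_tpl/model.py already covers the LSTM/GRU route. For reference, here is a minimal sketch of the 1D-convolutional alternative, following the same Net(nn.Module) pattern (init plus forward) as model.py. The channel count and kernel size are illustrative assumptions, not values taken from this PR.

import torch.nn as nn

# Minimal 1D-CNN sketch of the Net interface; hyperparameters are illustrative.
class Net(nn.Module):
    def __init__(self, input_dim, output_dim=1, channels=16, kernel_size=3, act="LeakyReLU"):
        super(Net, self).__init__()
        self.drop_input = nn.Dropout(0.05)
        # Treat the feature vector as a length-input_dim sequence with a single channel.
        self.conv = nn.Conv1d(1, channels, kernel_size=kernel_size, padding=kernel_size // 2)
        self.pool = nn.AdaptiveAvgPool1d(1)  # collapse the temporal axis to one value per channel
        self.fc = nn.Linear(channels, output_dim)
        if act == "LeakyReLU":
            self.activation = nn.LeakyReLU(negative_slope=0.1, inplace=False)
        elif act == "SiLU":
            self.activation = nn.SiLU()
        else:
            raise NotImplementedError(f"Activation function {act} is not supported")
        self.bn = nn.BatchNorm1d(output_dim)

    def forward(self, x):
        # x: (batch, input_dim) -> add a channel dimension for Conv1d
        x = self.drop_input(x)
        x = x.unsqueeze(1)            # (batch, 1, input_dim)
        x = self.conv(x)              # (batch, channels, input_dim)
        x = self.pool(x).squeeze(-1)  # (batch, channels)
        x = self.fc(x)
        x = self.bn(x)
        x = self.activation(x)
        return x

Like model.Net, such a class could be wired into the Qlib workflow by pointing pt_model_uri at the module that defines it and passing pt_model_kwargs such as input_dim: 157, as in conf_mlp.yaml.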