From 49541c64c68319f83b3086d105d5bd2e617187db Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Wed, 17 Jul 2024 06:08:59 +0000
Subject: [PATCH 01/40] Add TODO notes to the quant factor benchmark eval script

---
 rdagent/app/quant_factor_benchmark/eval.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/rdagent/app/quant_factor_benchmark/eval.py b/rdagent/app/quant_factor_benchmark/eval.py
index 3088be30..9c2c3d47 100644
--- a/rdagent/app/quant_factor_benchmark/eval.py
+++ b/rdagent/app/quant_factor_benchmark/eval.py
@@ -27,3 +27,12 @@
 
 # 5.run the eval
 res = eval_method.eval()
+
+# TODO:
+# - Run it:
+# - factor input data generator;
+#   - f_{gt}(input) => value_{gt}
+#   - f_{llm}(input) => value_{llm}
+#   - we have legal issue to release Input
+# - Eval result:
+#   -  check https://github.com/peteryang1/fincov2/blob/master/src/scripts/benchmark/analysis.py

From f3b097ea858bb9289fa630b5ea0d1c4861d9edfc Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Thu, 18 Jul 2024 06:58:21 +0000
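Subject: [PATCH 02/40] Guard None feedback, require successful execution before reading result.h5, skip factors without formulation/variables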

---
 .../coder/factor_coder/CoSTEER/evolving_agent.py   |  2 +-
 rdagent/components/coder/factor_coder/factor.py    | 14 ++++++++------
 .../qlib/factor_experiment_loader/pdf_loader.py    |  4 +++-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
index feee0ef7..e6ad776b 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
@@ -14,6 +14,6 @@ def filter_evolvable_subjects_by_feedback(self, evo: EvolvableSubjects, feedback
         assert len(evo.sub_workspace_list) == len(feedback)
 
         for index in range(len(evo.sub_workspace_list)):
-            if not feedback[index].final_decision:
+            if feedback[index] is not None and not feedback[index].final_decision:
                 evo.sub_workspace_list[index].clear()
         return evo
diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
index bdf84036..5c9b7390 100644
--- a/rdagent/components/coder/factor_coder/factor.py
+++ b/rdagent/components/coder/factor_coder/factor.py
@@ -141,6 +141,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
             self.link_data_to_workspace(source_data_path, self.workspace_path)
 
             execution_feedback = self.FB_EXECUTION_SUCCEEDED
+            execution_success = False
             try:
                 subprocess.check_output(
                     f"{FACTOR_IMPLEMENT_SETTINGS.python_bin} {code_path}",
@@ -149,6 +150,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
                     stderr=subprocess.STDOUT,
                     timeout=FACTOR_IMPLEMENT_SETTINGS.file_based_execution_timeout,
                 )
+                execution_success = True
             except subprocess.CalledProcessError as e:
                 import site
 
@@ -169,18 +171,18 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
                     raise RuntimeErrorException(execution_feedback)
 
             workspace_output_file_path = self.workspace_path / "result.h5"
-            if not workspace_output_file_path.exists():
-                execution_feedback += self.FB_OUTPUT_FILE_NOT_FOUND
-                executed_factor_value_dataframe = None
-                if self.raise_exception:
-                    raise NoOutputException(execution_feedback)
-            else:
+            if workspace_output_file_path.exists() and execution_success:
                 try:
                     executed_factor_value_dataframe = pd.read_hdf(workspace_output_file_path)
                     execution_feedback += self.FB_OUTPUT_FILE_FOUND
                 except Exception as e:
                     execution_feedback += f"Error found when reading hdf file: {e}"[:1000]
                     executed_factor_value_dataframe = None
+            else:
+                execution_feedback += self.FB_OUTPUT_FILE_NOT_FOUND
+                executed_factor_value_dataframe = None
+                if self.raise_exception:
+                    raise NoOutputException(execution_feedback)
 
             if store_result and executed_factor_value_dataframe is not None:
                 self.executed_factor_value_dataframe = executed_factor_value_dataframe
diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
index 5fd0d209..f306ae51 100644
--- a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
+++ b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
@@ -196,7 +196,7 @@ def __extract_factor_and_formulation_from_one_report(
             factor_dict,
         )
     for factor_name in factor_dict:
-        if factor_name not in factor_to_formulation:
+        if factor_name not in factor_to_formulation or "formulation" not in factor_to_formulation[factor_name] or "variables" not in factor_to_formulation[factor_name]:
             continue
 
         final_factor_dict_to_one_report.setdefault(factor_name, {})
@@ -510,9 +510,11 @@ def load(self, file_or_folder_path: Path) -> dict:
         docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))
 
         selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1)
+        # selected_report_dict = {'/home/finco/data/report/5322177.pdf': {'class': 1}}
         file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
         factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)
 
         factor_viability, filtered_factor_dict = check_factor_viability(factor_dict)
         # factor_dict, duplication_names_list = deduplicate_factors_by_llm(factor_dict, factor_viability)
+        
         return FactorExperimentLoaderFromDict().load(filtered_factor_dict)

From 61dc8ce627d9b1f7aa6c28314149379df3b52733 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Thu, 18 Jul 2024 07:00:37 +0000
Subject: [PATCH 03/40] Remove commented-out debug line from pdf_loader

---
 rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
index f306ae51..c9390fc7 100644
--- a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
+++ b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
@@ -510,7 +510,6 @@ def load(self, file_or_folder_path: Path) -> dict:
         docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))
 
         selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1)
-        # selected_report_dict = {'/home/finco/data/report/5322177.pdf': {'class': 1}}
         file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
         factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)
 

From 64812781805699b58aa9239036f115e6b1fe0ee6 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Fri, 19 Jul 2024 08:34:05 +0000
Subject: [PATCH 04/40] Finish the loop that extracts factors from financial reports

---
 rdagent/app/qlib_rd_loop/conf.py              |   4 +
 .../app/qlib_rd_loop/factor_from_report.py    | 122 +++++++++++++++
 .../app/qlib_rd_loop/factor_from_report_sh.py | 147 ++++++++++++++++++
 rdagent/app/qlib_rd_loop/prompts.yaml         |  15 ++
 rdagent/app/qlib_rd_loop/run_script.sh        |  24 +++
 .../factor_coder/CoSTEER/evolving_strategy.py |   8 +
 .../components/coder/factor_coder/factor.py   |   3 +-
 .../scenarios/qlib/developer/factor_runner.py |   4 +-
 rdagent/scenarios/qlib/developer/feedback.py  |  46 +++++-
 rdagent/scenarios/qlib/prompts.yaml           |  23 ++-
 10 files changed, 383 insertions(+), 13 deletions(-)
 create mode 100644 rdagent/app/qlib_rd_loop/factor_from_report.py
 create mode 100644 rdagent/app/qlib_rd_loop/factor_from_report_sh.py
 create mode 100644 rdagent/app/qlib_rd_loop/prompts.yaml
 create mode 100755 rdagent/app/qlib_rd_loop/run_script.sh

diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py
index df9d1dd5..0513d4ae 100644
--- a/rdagent/app/qlib_rd_loop/conf.py
+++ b/rdagent/app/qlib_rd_loop/conf.py
@@ -30,5 +30,9 @@ class Config:
     py_bin: str = "/usr/bin/python"
     local_qlib_folder: Path = Path("/home/rdagent/qlib")
 
+    origin_report_path: str = "data/report_origin"
+    local_report_path: str = "data/report"
+    report_result_json_file_path: str = "git_ignore_folder/res_dict.json"
+
 
 PROP_SETTING = PropSetting()
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report.py b/rdagent/app/qlib_rd_loop/factor_from_report.py
new file mode 100644
index 00000000..b4daab0d
--- /dev/null
+++ b/rdagent/app/qlib_rd_loop/factor_from_report.py
@@ -0,0 +1,122 @@
+import json
+from pathlib import Path
+import pickle
+from dotenv import load_dotenv
+from jinja2 import Environment, StrictUndefined
+import pandas as pd
+
+from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.components.document_reader.document_reader import load_and_process_pdfs_by_langchain
+from rdagent.core.prompts import Prompts
+from rdagent.core.scenario import Scenario
+from rdagent.core.utils import import_class
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.qlib.developer.factor_coder import QlibFactorCoSTEER
+from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario, QlibFactorExperiment
+from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import (
+    FactorExperimentLoaderFromPDFfiles,
+    classify_report_from_dict,
+)
+
+from rdagent.core.proposal import (
+    Hypothesis2Experiment,
+    HypothesisExperiment2Feedback,
+    HypothesisGen,
+    Hypothesis,
+    Trace,
+)
+
+from rdagent.core.exception import FactorEmptyException
+from rdagent.core.developer import Developer
+
+assert load_dotenv()
+
+scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+
+hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.factor_hypothesis_gen)(scen)
+
+hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.factor_hypothesis2experiment)()
+
+qlib_factor_coder: Developer = import_class(PROP_SETTING.factor_coder)(scen)
+
+qlib_factor_runner: Developer = import_class(PROP_SETTING.factor_runner)(scen)
+
+qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
+
+with open(PROP_SETTING.report_result_json_file_path, 'r') as f:
+    judge_pdf_data = json.load(f)
+
+prompts_path = Path(__file__).parent / "prompts.yaml"
+prompts = Prompts(file_path=prompts_path)
+
+def generate_hypothesis(factor_result: dict, report_content: str) -> str:
+    system_prompt = Environment(undefined=StrictUndefined).from_string(prompts["hypothesis_generation"]["system"]).render()
+    user_prompt = Environment(undefined=StrictUndefined).from_string(prompts["hypothesis_generation"]["user"]).render(
+        factor_descriptions=json.dumps(factor_result),
+        report_content=report_content
+    )
+
+    response = APIBackend().build_messages_and_create_chat_completion(
+        user_prompt=user_prompt,
+        system_prompt=system_prompt,
+        json_mode=True,
+    )
+
+    response_json = json.loads(response)
+    hypothesis_text = response_json.get("hypothesis", "No hypothesis generated.")
+    reason_text = response_json.get("reason", "No reason provided.")
+
+    return Hypothesis(hypothesis=hypothesis_text, reason=reason_text)
+
+def extract_factors_and_implement(report_file_path: str) -> tuple:
+    scenario = QlibFactorScenario()
+
+    with logger.tag("extract_factors_and_implement"):
+        with logger.tag("load_factor_tasks"):
+
+            exp = FactorExperimentLoaderFromPDFfiles().load(report_file_path)
+            if exp is None or exp.sub_tasks == []:
+                return None, None
+            
+    docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))
+
+    factor_result = {
+        task.factor_name: {
+            "description": task.factor_description,
+            "formulation": task.factor_formulation,
+            "variables": task.variables,
+            "resources": task.factor_resources
+        }
+        for task in exp.sub_tasks
+    }
+
+    report_content = "\n".join(docs_dict.values())
+    hypothesis = generate_hypothesis(factor_result, report_content)
+
+    return exp, hypothesis
+
+trace = Trace(scen=scen)
+
+for file_path, attributes in judge_pdf_data.items():
+    if attributes["class"] == 1:
+        report_file_path = Path(file_path.replace(PROP_SETTING.origin_report_path, PROP_SETTING.local_report_path))
+        if report_file_path.exists():
+            logger.info(f"Processing {report_file_path}")
+            exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+            if exp is None:
+                continue
+            exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+            if len(exp.based_experiments) == 0:
+                exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+            exp = qlib_factor_coder.develop(exp)
+            exp = qlib_factor_runner.develop(exp)
+            if exp is None:
+                logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+                continue
+            feedback = qlib_factor_summarizer.generateFeedback(exp, hypothesis, trace)
+
+            trace.hist.append((hypothesis, exp, feedback))
+            logger.info(f"Processed {report_file_path}: Result: {exp}")
+        else:
+            logger.error(f"File not found: {report_file_path}")
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
new file mode 100644
index 00000000..5ef4edfe
--- /dev/null
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -0,0 +1,147 @@
+import json
+from pathlib import Path
+import pickle
+from dotenv import load_dotenv
+from jinja2 import Environment, StrictUndefined
+import pandas as pd
+
+from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.components.document_reader.document_reader import load_and_process_pdfs_by_langchain
+from rdagent.core.prompts import Prompts
+from rdagent.core.scenario import Scenario
+from rdagent.core.utils import import_class
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.qlib.developer.factor_coder import QlibFactorCoSTEER
+from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario, QlibFactorExperiment
+from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import (
+    FactorExperimentLoaderFromPDFfiles,
+    classify_report_from_dict,
+)
+
+from rdagent.core.proposal import (
+    Hypothesis2Experiment,
+    HypothesisExperiment2Feedback,
+    HypothesisGen,
+    Hypothesis,
+    Trace,
+)
+
+from rdagent.core.exception import FactorEmptyException
+from rdagent.core.developer import Developer
+
+assert load_dotenv()
+
+scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+
+hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.factor_hypothesis_gen)(scen)
+
+hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.factor_hypothesis2experiment)()
+
+qlib_factor_coder: Developer = import_class(PROP_SETTING.factor_coder)(scen)
+
+qlib_factor_runner: Developer = import_class(PROP_SETTING.factor_runner)(scen)
+
+qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
+
+json_file_path = "/home/finco/v-yuanteli/RD-Agent/git_ignore_folder/res_dict.json"
+with open(json_file_path, 'r') as f:
+    judge_pdf_data = json.load(f)
+
+prompts_path = Path(__file__).parent / "prompts.yaml"
+prompts = Prompts(file_path=prompts_path)
+
+progress_file = "/home/finco/v-yuanteli/RD-Agent/git_ignore_folder/progress.pkl"
+
+def save_progress(trace, current_index):
+    with open(progress_file, "wb") as f:
+        pickle.dump((trace, current_index), f)
+
+def load_progress():
+    if Path(progress_file).exists():
+        with open(progress_file, "rb") as f:
+            return pickle.load(f)
+    return Trace(scen=scen), 0
+
+def generate_hypothesis(factor_result: dict, report_content: str) -> str:
+    system_prompt = Environment(undefined=StrictUndefined).from_string(prompts["hypothesis_generation"]["system"]).render()
+    user_prompt = Environment(undefined=StrictUndefined).from_string(prompts["hypothesis_generation"]["user"]).render(
+        factor_descriptions=json.dumps(factor_result),
+        report_content=report_content
+    )
+
+    response = APIBackend().build_messages_and_create_chat_completion(
+        user_prompt=user_prompt,
+        system_prompt=system_prompt,
+        json_mode=True,
+    )
+
+    response_json = json.loads(response)
+    hypothesis_text = response_json.get("hypothesis", "No hypothesis generated.")
+    reason_text = response_json.get("reason", "No reason provided.")
+
+    return Hypothesis(hypothesis=hypothesis_text, reason=reason_text)
+
+def extract_factors_and_implement(report_file_path: str) -> tuple:
+    scenario = QlibFactorScenario()
+
+    with logger.tag("extract_factors_and_implement"):
+        with logger.tag("load_factor_tasks"):
+
+            exp = FactorExperimentLoaderFromPDFfiles().load(report_file_path)
+            if exp is None or exp.sub_tasks == []:
+                return None, None
+            
+    docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))
+
+    factor_result = {
+        task.factor_name: {
+            "description": task.factor_description,
+            "formulation": task.factor_formulation,
+            "variables": task.variables,
+            "resources": task.factor_resources
+        }
+        for task in exp.sub_tasks
+    }
+
+    report_content = "\n".join(docs_dict.values())
+    hypothesis = generate_hypothesis(factor_result, report_content)
+
+    return exp, hypothesis
+
+trace, start_index = load_progress()
+
+try:
+    judge_pdf_data_items = list(judge_pdf_data.items())
+    for index in range(start_index, len(judge_pdf_data_items)):
+        if index > 1000:
+            break
+        file_path, attributes = judge_pdf_data_items[index]
+        if attributes["class"] == 1:
+            report_file_path = Path(file_path.replace("/data/home/xiaoyang/data/ftp/amc_origin_file/report", "/home/finco/data/report"))
+            if report_file_path.exists():
+                print(f"Processing {report_file_path}")
+                exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+                if exp is None:
+                    continue
+                exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+                if len(exp.based_experiments) == 0:
+                    exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+                exp = qlib_factor_coder.develop(exp)
+                exp = qlib_factor_runner.develop(exp)
+                if exp is None:
+                    logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+                    continue
+                feedback = qlib_factor_summarizer.generateFeedback(exp, hypothesis, trace)
+
+                trace.hist.append((hypothesis, exp, feedback))
+                print(f"Processed {report_file_path}: Result: {exp}")
+                
+                # Save progress after processing each report
+                save_progress(trace, index + 1)
+            else:
+                print(f"File not found: {report_file_path}")
+except Exception as e:
+    logger.error(f"An error occurred: {e}")
+    save_progress(trace, index)
+    raise
\ No newline at end of file
diff --git a/rdagent/app/qlib_rd_loop/prompts.yaml b/rdagent/app/qlib_rd_loop/prompts.yaml
new file mode 100644
index 00000000..1909087c
--- /dev/null
+++ b/rdagent/app/qlib_rd_loop/prompts.yaml
@@ -0,0 +1,15 @@
+hypothesis_generation:
+  system: |-
+    You are an expert in financial analysis. Your task is to generate a well-reasoned hypothesis based on the provided financial factors and report content.
+    Please ensure your response is in JSON format as shown below:
+    {
+      "hypothesis": "A clear and concise hypothesis based on the provided information.",
+      "reason": "A detailed explanation supporting the generated hypothesis."
+    }
+
+  user: |-
+    The following are the financial factors and their descriptions:
+    {{ factor_descriptions }}
+
+    The report content is as follows:
+    {{ report_content }}
\ No newline at end of file
diff --git a/rdagent/app/qlib_rd_loop/run_script.sh b/rdagent/app/qlib_rd_loop/run_script.sh
new file mode 100755
index 00000000..c9ce7fe4
--- /dev/null
+++ b/rdagent/app/qlib_rd_loop/run_script.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+max_retries=1000
+count=0
+log_file="/home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/run_script.log"
+
+while [ $count -lt $max_retries ]; do
+  echo "$(date) - Attempt $count of $max_retries" >> $log_file
+  /home/finco/anaconda3/envs/rdagent/bin/python /home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/factor_from_report_sh.py >> $log_file 2>&1
+  if [ $? -eq 0 ]; then
+    echo "$(date) - Script completed successfully on attempt $count" >> $log_file
+    break
+  fi
+  count=$((count + 1))
+  echo "$(date) - Restarting script after crash... Attempt $count of $max_retries" >> $log_file
+done
+
+if [ $count -ge $max_retries ]; then
+  echo "$(date) - Script failed after $max_retries attempts." >> $log_file
+else
+  echo "$(date) - Script completed successfully." >> $log_file
+fi
+
+# chmod +x /home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/run_script.sh
\ No newline at end of file
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
index 97c225ff..bca73cd9 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
@@ -71,6 +71,14 @@ def evolve(
             implementation_factors_per_round = int(
                 FACTOR_IMPLEMENT_SETTINGS.select_ratio * len(to_be_finished_task_index)
             )
+
+            # Ensure at least one task is selected
+            if implementation_factors_per_round == 0:
+                implementation_factors_per_round = 1
+
+            if implementation_factors_per_round > len(to_be_finished_task_index):
+                implementation_factors_per_round = len(to_be_finished_task_index)
+
             if FACTOR_IMPLEMENT_SETTINGS.select_method == "random":
                 to_be_finished_task_index = RandomSelect(
                     to_be_finished_task_index,
diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
index 5c9b7390..3b078512 100644
--- a/rdagent/components/coder/factor_coder/factor.py
+++ b/rdagent/components/coder/factor_coder/factor.py
@@ -109,7 +109,8 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
                 raise CodeFormatException(self.FB_CODE_NOT_SET)
             else:
                 # TODO: to make the interface compatible with previous code. I kept the original behavior.
-                raise ValueError(self.FB_CODE_NOT_SET)
+                # raise ValueError(self.FB_CODE_NOT_SET)
+                return self.FB_CODE_NOT_SET, None
         with FileLock(self.workspace_path / "execution.lock"):
             if FACTOR_IMPLEMENT_SETTINGS.enable_execution_cache:
                 # NOTE: cache the result for the same code and same data type
diff --git a/rdagent/scenarios/qlib/developer/factor_runner.py b/rdagent/scenarios/qlib/developer/factor_runner.py
index 9e9b880a..a868cc46 100644
--- a/rdagent/scenarios/qlib/developer/factor_runner.py
+++ b/rdagent/scenarios/qlib/developer/factor_runner.py
@@ -60,7 +60,9 @@ def develop(self, exp: QlibFactorExperiment) -> QlibFactorExperiment:
             new_factors = self.process_factor_data(exp)
 
             if new_factors.empty:
-                raise FactorEmptyException("No valid factor data found to merge.")
+                # raise FactorEmptyException("No valid factor data found to merge.")
+                logger.error("No valid factor data found to merge.")
+                return None
 
             # Combine the SOTA factor and new factors if SOTA factor exists
             if SOTA_factor is not None and not SOTA_factor.empty:
diff --git a/rdagent/scenarios/qlib/developer/feedback.py b/rdagent/scenarios/qlib/developer/feedback.py
index 27d93dd0..243ff2d5 100644
--- a/rdagent/scenarios/qlib/developer/feedback.py
+++ b/rdagent/scenarios/qlib/developer/feedback.py
@@ -1,10 +1,8 @@
-# TODO:
-# Implement to feedback.
-
 import json
 from pathlib import Path
 
 from jinja2 import Environment, StrictUndefined
+import pandas as pd
 
 from rdagent.core.experiment import Experiment
 from rdagent.core.prompts import Prompts
@@ -21,6 +19,39 @@
 feedback_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
 DIRNAME = Path(__file__).absolute().resolve().parent
 
+def process_results(current_result, sota_result):
+    # Convert the results to dataframes
+    current_df = pd.DataFrame(current_result)
+    sota_df = pd.DataFrame(sota_result)
+    
+    # Set the metric as the index
+    current_df.index.name = 'metric'
+    sota_df.index.name = 'metric'
+    
+    # Rename the value column to reflect the result type
+    current_df.rename(columns={'0': 'Current Result'}, inplace=True)
+    sota_df.rename(columns={'0': 'SOTA Result'}, inplace=True)
+    
+    # Combine the dataframes on the Metric index
+    combined_df = pd.concat([current_df, sota_df], axis=1)
+    
+    # Select important metrics for comparison
+    important_metrics = [
+        'Rank ICIR',
+        '1day.excess_return_without_cost.max_drawdown',
+        '1day.excess_return_without_cost.information_ratio',
+        '1day.excess_return_without_cost.annualized_return',
+        '1day.excess_return_with_cost.max_drawdown',
+        '1day.excess_return_with_cost.information_ratio',
+        '1day.excess_return_with_cost.annualized_return',
+        'IC',
+    ]
+    
+    # Filter the combined DataFrame to retain only the important metrics
+    filtered_combined_df = combined_df.loc[important_metrics]
+    
+    return filtered_combined_df.to_dict()
+
 
 class QlibFactorHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
     def generateFeedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
@@ -41,6 +72,10 @@ def generateFeedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace
         tasks_factors = [task.get_task_information() for task in exp.sub_tasks]
         sota_result = exp.based_experiments[-1].result
 
+        # Process the results to filter important metrics
+        combined_result = process_results(current_result, sota_result)
+        logger.info(f"combined_result: {combined_result}")
+
         # Generate the system prompt
         sys_prompt = (
             Environment(undefined=StrictUndefined)
@@ -55,8 +90,9 @@ def generateFeedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace
             .render(
                 hypothesis_text=hypothesis_text,
                 task_details=tasks_factors,
-                current_result=current_result,
-                sota_result=sota_result,
+                # current_result=current_result,
+                # sota_result=sota_result,
+                combined_result=combined_result,
             )
         )
 
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
index c4c316a8..5f00e0b4 100644
--- a/rdagent/scenarios/qlib/prompts.yaml
+++ b/rdagent/scenarios/qlib/prompts.yaml
@@ -55,7 +55,7 @@ factor_feedback_generation:
     The task is described in the following scenario:
     {{ scenario }}
     You will receive a hypothesis, multiple tasks with their factors, and some results.
-    Your feedback should specify whether the current result supports or refutes the hypothesis, compare it with previous results, and suggest improvements or new directions.
+    Your feedback should specify whether the current result supports or refutes the hypothesis, compare it with previous SOTA results, and suggest improvements or new directions.
     Please provide detailed and constructive feedback for the future exploration.
     Please respond in JSON format, and example JSON Structure for Result Analysis:
     {
@@ -70,15 +70,26 @@ factor_feedback_generation:
     {{ hypothesis_text }}
     Tasks and Factors:
     {{ task_details }}
-    Current Result: 
-    {{ current_result }}
-    SOTA Result: 
-    {{ sota_result }}
-    Analyze the current result in the context of its ability to:
+    Combined Results: 
+    {{ combined_result }}
+    Analyze the combined result in the context of its ability to:
     1. Support or refute the hypothesis.
     2. Show improvement or deterioration compared to the last experiment.
     3. Demonstrate positive or negative effects when compared to Alpha158.
 
+    Evaluation Metrics Explanations:
+    Below are the financial meanings of each metric, which should be used to judge the results:
+
+    - Rank ICIR: Evaluates the stability and average level of Rank IC. Rank ICIR = mean(Rank IC) / std(Rank IC).
+    - 1day.excess_return_without_cost.max_drawdown: Measures the maximum loss from a peak to a trough without considering transaction costs.
+    - 1day.excess_return_without_cost.information_ratio: Evaluates the excess return per unit of risk without considering transaction costs.
+    - 1day.excess_return_with_cost.max_drawdown: Measures the maximum loss from a peak to a trough considering transaction costs.
+    - 1day.excess_return_without_cost.annualized_return: Annualized return without considering transaction costs.
+    - 1day.excess_return_with_cost.annualized_return: Annualized return considering transaction costs.
+    - IC: Measures the correlation between predicted returns (\hat{y}) and actual returns (y), using Pearson correlation.
+    - 1day.excess_return_with_cost.information_ratio: Evaluates the excess return per unit of risk considering transaction costs.
+
+    When judging the results, prioritize metrics that consider transaction costs (with cost), as they provide a more accurate representation of real-world performance. Among these, the annualized return considering transaction costs is particularly important as it gives a clear picture of long-term profitability.
     Provide detailed feedback and recommend whether to replace the best result if the new factor proves superior.
 
 model_feedback_generation:

From ce30c04a95015230139fe22ca248694b22b6055f Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Fri, 19 Jul 2024 11:51:33 +0000
Subject: [PATCH 05/40] Fix two small bugs.

---
 rdagent/app/qlib_rd_loop/conf.py              |  1 +
 .../app/qlib_rd_loop/factor_from_report.py    |  3 +--
 .../app/qlib_rd_loop/factor_from_report_sh.py | 22 ++++++++-----------
 rdagent/app/qlib_rd_loop/nohup.out            |  1 +
 rdagent/app/qlib_rd_loop/run_script.sh        |  4 +---
 .../coder/factor_coder/CoSTEER/evaluators.py  | 13 ++++++++---
 6 files changed, 23 insertions(+), 21 deletions(-)
 create mode 100644 rdagent/app/qlib_rd_loop/nohup.out

diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py
index 0513d4ae..60d63dbb 100644
--- a/rdagent/app/qlib_rd_loop/conf.py
+++ b/rdagent/app/qlib_rd_loop/conf.py
@@ -33,6 +33,7 @@ class Config:
     origin_report_path: str = "data/report_origin"
     local_report_path: str = "data/report"
     report_result_json_file_path: str = "git_ignore_folder/res_dict.json"
+    progress_file_path: str = "git_ignore_folder/progress.pkl"
 
 
 PROP_SETTING = PropSetting()
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report.py b/rdagent/app/qlib_rd_loop/factor_from_report.py
index b4daab0d..4ea9410a 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report.py
@@ -27,7 +27,6 @@
     Trace,
 )
 
-from rdagent.core.exception import FactorEmptyException
 from rdagent.core.developer import Developer
 
 assert load_dotenv()
@@ -114,7 +113,7 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
             if exp is None:
                 logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
                 continue
-            feedback = qlib_factor_summarizer.generateFeedback(exp, hypothesis, trace)
+            feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
 
             trace.hist.append((hypothesis, exp, feedback))
             logger.info(f"Processed {report_file_path}: Result: {exp}")
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
index 5ef4edfe..dfd38e3e 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -27,7 +27,6 @@
     Trace,
 )
 
-from rdagent.core.exception import FactorEmptyException
 from rdagent.core.developer import Developer
 
 assert load_dotenv()
@@ -44,22 +43,19 @@
 
 qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
 
-json_file_path = "/home/finco/v-yuanteli/RD-Agent/git_ignore_folder/res_dict.json"
-with open(json_file_path, 'r') as f:
+with open(PROP_SETTING.report_result_json_file_path, 'r') as f:
     judge_pdf_data = json.load(f)
 
 prompts_path = Path(__file__).parent / "prompts.yaml"
 prompts = Prompts(file_path=prompts_path)
 
-progress_file = "/home/finco/v-yuanteli/RD-Agent/git_ignore_folder/progress.pkl"
-
 def save_progress(trace, current_index):
-    with open(progress_file, "wb") as f:
+    with open(PROP_SETTING.progress_file, "wb") as f:
         pickle.dump((trace, current_index), f)
 
 def load_progress():
-    if Path(progress_file).exists():
-        with open(progress_file, "rb") as f:
+    if Path(PROP_SETTING.progress_file).exists():
+        with open(PROP_SETTING.progress_file, "rb") as f:
             return pickle.load(f)
     return Trace(scen=scen), 0
 
@@ -118,9 +114,9 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
             break
         file_path, attributes = judge_pdf_data_items[index]
         if attributes["class"] == 1:
-            report_file_path = Path(file_path.replace("/data/home/xiaoyang/data/ftp/amc_origin_file/report", "/home/finco/data/report"))
+            report_file_path = Path(file_path.replace(PROP_SETTING.origin_report_path, PROP_SETTING.local_report_path))
             if report_file_path.exists():
-                print(f"Processing {report_file_path}")
+                logger.info(f"Processing {report_file_path}")
                 exp, hypothesis = extract_factors_and_implement(str(report_file_path))
                 if exp is None:
                     continue
@@ -132,15 +128,15 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
                 if exp is None:
                     logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
                     continue
-                feedback = qlib_factor_summarizer.generateFeedback(exp, hypothesis, trace)
+                feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
 
                 trace.hist.append((hypothesis, exp, feedback))
-                print(f"Processed {report_file_path}: Result: {exp}")
+                logger.info(f"Processed {report_file_path}: Result: {exp}")
                 
                 # Save progress after processing each report
                 save_progress(trace, index + 1)
             else:
-                print(f"File not found: {report_file_path}")
+                looger.error(f"File not found: {report_file_path}")
 except Exception as e:
     logger.error(f"An error occurred: {e}")
     save_progress(trace, index)
diff --git a/rdagent/app/qlib_rd_loop/nohup.out b/rdagent/app/qlib_rd_loop/nohup.out
new file mode 100644
index 00000000..48ef9d59
--- /dev/null
+++ b/rdagent/app/qlib_rd_loop/nohup.out
@@ -0,0 +1 @@
+nohup: ignoring input
diff --git a/rdagent/app/qlib_rd_loop/run_script.sh b/rdagent/app/qlib_rd_loop/run_script.sh
index c9ce7fe4..ba22e9cf 100755
--- a/rdagent/app/qlib_rd_loop/run_script.sh
+++ b/rdagent/app/qlib_rd_loop/run_script.sh
@@ -19,6 +19,4 @@ if [ $count -ge $max_retries ]; then
   echo "$(date) - Script failed after $max_retries attempts." >> $log_file
 else
   echo "$(date) - Script completed successfully." >> $log_file
-fi
-
-# chmod +x /home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/run_script.sh
\ No newline at end of file
+fi
\ No newline at end of file
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
index b38348fb..fbfc9956 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -334,6 +334,13 @@ def evaluate(
     ) -> Tuple:
         conclusions = []
 
+        # Initialize result variables
+        single_column_result = None
+        same_index_result = None
+        output_format_result = None
+        equal_value_ratio_result = 0
+        high_correlation_result = False
+
         # Check if both dataframe has only one columns
         feedback_str, _ = FactorSingleColumnEvaluator(self.scen).evaluate(implementation, gt_implementation)
         conclusions.append(feedback_str)
@@ -373,9 +380,9 @@ def evaluate(
                 high_correlation_result = False
                 feedback_str = "The source dataframe and the ground truth dataframe have different index. Give up comparing the values and correlation because it's useless"
             conclusions.append(feedback_str)
-        else:
-            equal_value_ratio_result = 0
-            high_correlation_result = False
+        # else:
+        #     equal_value_ratio_result = 0
+        #     high_correlation_result = False
 
         # Combine all conclusions into a single string
         conclusion_str = "\n".join(conclusions)

From aa2ffacec4c76ac7123234076bfbd7d770621140 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <104308117+WinstonLiyt@users.noreply.github.com>
Date: Fri, 19 Jul 2024 19:54:23 +0800
Subject: [PATCH 06/40] Delete rdagent/app/qlib_rd_loop/run_script.sh

---
 rdagent/app/qlib_rd_loop/run_script.sh | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100755 rdagent/app/qlib_rd_loop/run_script.sh

diff --git a/rdagent/app/qlib_rd_loop/run_script.sh b/rdagent/app/qlib_rd_loop/run_script.sh
deleted file mode 100755
index ba22e9cf..00000000
--- a/rdagent/app/qlib_rd_loop/run_script.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-max_retries=1000
-count=0
-log_file="/home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/run_script.log"
-
-while [ $count -lt $max_retries ]; do
-  echo "$(date) - Attempt $count of $max_retries" >> $log_file
-  /home/finco/anaconda3/envs/rdagent/bin/python /home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/factor_from_report_sh.py >> $log_file 2>&1
-  if [ $? -eq 0 ]; then
-    echo "$(date) - Script completed successfully on attempt $count" >> $log_file
-    break
-  fi
-  count=$((count + 1))
-  echo "$(date) - Restarting script after crash... Attempt $count of $max_retries" >> $log_file
-done
-
-if [ $count -ge $max_retries ]; then
-  echo "$(date) - Script failed after $max_retries attempts." >> $log_file
-else
-  echo "$(date) - Script completed successfully." >> $log_file
-fi
\ No newline at end of file

From cecb4c528ccc3bd2e12a3108eef18d38c2fd8565 Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Fri, 19 Jul 2024 13:14:34 +0000
Subject: [PATCH 07/40] Minor mod

---
 rdagent/app/qlib_rd_loop/RDAgent.py                  |  2 +-
 rdagent/app/qlib_rd_loop/factor_from_report_sh.py    |  5 +++--
 .../coder/factor_coder/CoSTEER/evaluators.py         |  3 ---
 .../coder/factor_coder/CoSTEER/evolving_strategy.py  | 12 ++----------
 4 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/RDAgent.py b/rdagent/app/qlib_rd_loop/RDAgent.py
index d125b429..661448b9 100644
--- a/rdagent/app/qlib_rd_loop/RDAgent.py
+++ b/rdagent/app/qlib_rd_loop/RDAgent.py
@@ -44,7 +44,7 @@ def run_experiment(self, exp):
         return exp
 
     def generate_feedback(self, exp, hypothesis):
-        feedback = self.qlib_model_summarizer.generateFeedback(exp, hypothesis, self.trace)
+        feedback = self.qlib_model_summarizer.generate_feedback(exp, hypothesis, self.trace)
         self.dump_objects(exp=exp, hypothesis=hypothesis, feedback=feedback, trace=self.trace, filename='step_feedback.pkl')
         return feedback
 
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
index dfd38e3e..504b16be 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -1,3 +1,4 @@
+# TODO: we should have a more advanced mechanism to handle such requirements for saving sessions.
 import json
 from pathlib import Path
 import pickle
@@ -136,8 +137,8 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
                 # Save progress after processing each report
                 save_progress(trace, index + 1)
             else:
-                looger.error(f"File not found: {report_file_path}")
+                logger.error(f"File not found: {report_file_path}")
 except Exception as e:
     logger.error(f"An error occurred: {e}")
     save_progress(trace, index)
-    raise
\ No newline at end of file
+    raise
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
index fbfc9956..00b59d8e 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -380,9 +380,6 @@ def evaluate(
                 high_correlation_result = False
                 feedback_str = "The source dataframe and the ground truth dataframe have different index. Give up comparing the values and correlation because it's useless"
             conclusions.append(feedback_str)
-        # else:
-        #     equal_value_ratio_result = 0
-        #     high_correlation_result = False
 
         # Combine all conclusions into a single string
         conclusion_str = "\n".join(conclusions)
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
index bca73cd9..1493cfac 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
@@ -68,16 +68,8 @@ def evolve(
         # if the number of factors to be implemented is larger than the limit, we need to select some of them
         if FACTOR_IMPLEMENT_SETTINGS.select_ratio < 1:
             # if the number of loops is equal to the select_loop, we need to select some of them
-            implementation_factors_per_round = int(
-                FACTOR_IMPLEMENT_SETTINGS.select_ratio * len(to_be_finished_task_index)
-            )
-
-            # Ensure at least one task is selected
-            if implementation_factors_per_round == 0:
-                implementation_factors_per_round = 1
-
-            if implementation_factors_per_round > len(to_be_finished_task_index):
-                implementation_factors_per_round = len(to_be_finished_task_index)
+            implementation_factors_per_round = round(FACTOR_IMPLEMENT_SETTINGS.select_ratio * len(to_be_finished_task_index) + 0.5)  # ceilling
+            implementation_factors_per_round = min(implementation_factors_per_round, len(to_be_finished_task_index))  # but not exceed the total number of tasks
 
             if FACTOR_IMPLEMENT_SETTINGS.select_method == "random":
                 to_be_finished_task_index = RandomSelect(

From 61d352ff936e3b0af2d8a6f25a6c99a38fea75b4 Mon Sep 17 00:00:00 2001
From: you-n-g <you-n-g@users.noreply.github.com>
Date: Fri, 19 Jul 2024 21:19:22 +0800
Subject: [PATCH 08/40] Delete rdagent/app/qlib_rd_loop/nohup.out

---
 rdagent/app/qlib_rd_loop/nohup.out | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 rdagent/app/qlib_rd_loop/nohup.out

diff --git a/rdagent/app/qlib_rd_loop/nohup.out b/rdagent/app/qlib_rd_loop/nohup.out
deleted file mode 100644
index 48ef9d59..00000000
--- a/rdagent/app/qlib_rd_loop/nohup.out
+++ /dev/null
@@ -1 +0,0 @@
-nohup: ignoring input

From 367a1cee7ba3cf7057b7d81e6eb69ac8cbc61451 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Mon, 22 Jul 2024 06:33:23 +0000
Subject: [PATCH 09/40] Fix a small bug in file reading.

---
 rdagent/app/qlib_rd_loop/factor_from_report_sh.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
index 504b16be..1b1bddcc 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -51,12 +51,12 @@
 prompts = Prompts(file_path=prompts_path)
 
 def save_progress(trace, current_index):
-    with open(PROP_SETTING.progress_file, "wb") as f:
+    with open(PROP_SETTING.progress_file_path, "wb") as f:
         pickle.dump((trace, current_index), f)
 
 def load_progress():
-    if Path(PROP_SETTING.progress_file).exists():
-        with open(PROP_SETTING.progress_file, "rb") as f:
+    if Path(PROP_SETTING.progress_file_path).exists():
+        with open(PROP_SETTING.progress_file_path, "rb") as f:
             return pickle.load(f)
     return Trace(scen=scen), 0
 

From 7887905d8d8f9088e4739f5a6813f9aa3228f5c6 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Mon, 22 Jul 2024 07:52:23 +0000
Subject: [PATCH 10/40] some updates

---
 .gitignore                         | 4 ++++
 rdagent/app/qlib_rd_loop/factor.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 88244b47..2e5dd32d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,7 @@ mlruns/
 # possible output from coder or runner
 *.pth
 *qlib_res.csv
+
+# shell script
+*.out
+*.sh
diff --git a/rdagent/app/qlib_rd_loop/factor.py b/rdagent/app/qlib_rd_loop/factor.py
index 85e29380..4d23d7b7 100644
--- a/rdagent/app/qlib_rd_loop/factor.py
+++ b/rdagent/app/qlib_rd_loop/factor.py
@@ -1,5 +1,5 @@
 """
-TODO: Factor Structure RD-Loop
+Factor Structure RD-Loop
 """
 
 from dotenv import load_dotenv

From d5f36d9778ec88ab40ed2693c0b658779fb10454 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Mon, 22 Jul 2024 10:04:10 +0000
Subject: [PATCH 11/40] Update the detailed process and prompt of factor loop.

---
 rdagent/components/proposal/factor_proposal.py    |  1 +
 rdagent/components/proposal/model_proposal.py     |  2 +-
 rdagent/scenarios/qlib/prompts.yaml               | 15 +++++++++++++++
 .../scenarios/qlib/proposal/factor_proposal.py    |  1 +
 4 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/rdagent/components/proposal/factor_proposal.py b/rdagent/components/proposal/factor_proposal.py
index 8ec66efc..df61b006 100644
--- a/rdagent/components/proposal/factor_proposal.py
+++ b/rdagent/components/proposal/factor_proposal.py
@@ -44,6 +44,7 @@ def gen(self, trace: Trace) -> FactorHypothesis:
                 targets="factors",
                 scenario=self.scen.get_scenario_all_desc(),
                 hypothesis_output_format=context_dict["hypothesis_output_format"],
+                hypothesis_specification=context_dict["hypothesis_specification"],
             )
         )
         user_prompt = (
diff --git a/rdagent/components/proposal/model_proposal.py b/rdagent/components/proposal/model_proposal.py
index 4c1d203e..110c07a9 100644
--- a/rdagent/components/proposal/model_proposal.py
+++ b/rdagent/components/proposal/model_proposal.py
@@ -40,7 +40,7 @@ def gen(self, trace: Trace) -> ModelHypothesis:
                 targets="model",
                 scenario=self.scen.get_scenario_all_desc(),
                 hypothesis_output_format=context_dict["hypothesis_output_format"],
-                hypothesis_specification = context_dict["hypothesis_specification"],
+                hypothesis_specification=context_dict["hypothesis_specification"],
             )
         )
         user_prompt = (
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
index 292bbf07..1ee860ae 100644
--- a/rdagent/scenarios/qlib/prompts.yaml
+++ b/rdagent/scenarios/qlib/prompts.yaml
@@ -64,6 +64,21 @@ model_hypothesis_specification: |-
     }
   ]
 
+factor_hypothesis_specification: |-
+  Additional Specifications:
+    Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple. Gradually build up upon previous hypotheses and feedback.
+    Ensure that the hypothesis focuses on the creation and selection of factors in quantitative finance. Each hypothesis should address specific factor characteristics such as type (momentum, value, quality), calculation methods, or inclusion criteria. Avoid hypotheses related to model architecture or optimization processes.
+
+  Sample Hypotheses (Only learn from the format as these are not the knowledge):
+    - "Include a momentum factor based on the last 12 months' returns."
+    - "Add a value factor calculated as the book-to-market ratio."
+    - "Incorporate a quality factor derived from return on equity (ROE)."
+    - "Use a volatility factor based on the standard deviation of returns over the past 6 months."
+    - "Include a sentiment factor derived from news sentiment scores."
+    - "The momentum factor should be calculated using a 6-month look-back period."
+    - "Combine value and momentum factors using a weighted average approach."
+    - "Filter stocks by market capitalization before calculating the factors."
+
 factor_experiment_output_format: |-
   The output should follow JSON format. The schema is as follows:
   {
diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py
index a24dbb26..b1dadc66 100644
--- a/rdagent/scenarios/qlib/proposal/factor_proposal.py
+++ b/rdagent/scenarios/qlib/proposal/factor_proposal.py
@@ -33,6 +33,7 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
             "hypothesis_and_feedback": hypothesis_feedback,
             "RAG": ...,
             "hypothesis_output_format": prompt_dict["hypothesis_output_format"],
+            "hypothesis_specification": prompt_dict["factor_hypothesis_specification"],
         }
         return context_dict, True
 

From aa4c7e52db3570741ce974af73dfd2a4d2280752 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Tue, 23 Jul 2024 02:42:28 +0000
Subject: [PATCH 12/40] Evaluation & dataset

---
 RD-Agent                                         |  1 +
 rdagent/app/quant_factor_benchmark/eval.py       | 15 +++++++++++----
 rdagent/components/benchmark/conf.py             |  2 +-
 rdagent/components/benchmark/eval_method.py      | 16 +++++++++-------
 .../qlib/factor_experiment_loader/json_loader.py | 12 ++++++++----
 5 files changed, 30 insertions(+), 16 deletions(-)
 create mode 160000 RD-Agent

diff --git a/RD-Agent b/RD-Agent
new file mode 160000
index 00000000..61d67d85
--- /dev/null
+++ b/RD-Agent
@@ -0,0 +1 @@
+Subproject commit 61d67d8518072d69d5169853d49dd6ff88e6055c
diff --git a/rdagent/app/quant_factor_benchmark/eval.py b/rdagent/app/quant_factor_benchmark/eval.py
index 9c2c3d47..d9cb4948 100644
--- a/rdagent/app/quant_factor_benchmark/eval.py
+++ b/rdagent/app/quant_factor_benchmark/eval.py
@@ -1,4 +1,5 @@
-from rdagent.scenarios.qlib.factor_task_loader.json_loader import (
+from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
     FactorTestCaseLoaderFromJsonFile,
 )
 
@@ -6,6 +7,11 @@
 from rdagent.components.benchmark.eval_method import FactorImplementEval
 from rdagent.core.utils import import_class
 
+from rdagent.core.utils import import_class
+from rdagent.core.scenario import Scenario
+from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario
+
+
 # 1.read the settings
 bs = BenchmarkSettings()
 
@@ -14,13 +20,14 @@
 
 # 3.declare the method to be tested and pass the arguments.
 
-method_cls = import_class(bs.bench_method_cls)
-generate_method = method_cls()
-
+scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+generate_method = import_class(bs.bench_method_cls)(scen=scen)
+ 
 # 4.declare the eval method and pass the arguments.
 eval_method = FactorImplementEval(
     method=generate_method,
     test_cases=test_cases,
+    scen=scen,
     catch_eval_except=True,
     test_round=bs.bench_test_round,
 )
diff --git a/rdagent/components/benchmark/conf.py b/rdagent/components/benchmark/conf.py
index f854eec3..7ebebc5d 100644
--- a/rdagent/components/benchmark/conf.py
+++ b/rdagent/components/benchmark/conf.py
@@ -18,7 +18,7 @@ class BenchmarkSettings(BaseSettings):
     bench_test_round: int = 10
     bench_test_case_n: Optional[int] = None  # how many test cases to run; If not given, all test cases will be run
 
-    bench_method_cls: str = "rdagent.factor_implementation.CoSTEER.CoSTEERFG"
+    bench_method_cls: str = "rdagent.components.coder.factor_coder.CoSTEER.FactorCoSTEER"
     bench_method_extra_kwargs: dict = field(
         default_factory=dict,
     )  # extra kwargs for the method to be tested except the task list
diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py
index 054b1c97..b45451eb 100644
--- a/rdagent/components/benchmark/eval_method.py
+++ b/rdagent/components/benchmark/eval_method.py
@@ -20,6 +20,7 @@
 from rdagent.core.developer import Developer
 from rdagent.core.exception import CoderException
 from rdagent.core.experiment import Task, Workspace
+from rdagent.core.scenario import Scenario
 from rdagent.core.utils import multiprocessing_wrapper
 
 
@@ -114,17 +115,18 @@ def __init__(
         test_cases: TestCase,
         method: Developer,
         *args,
+        scen: Scenario,
         test_round: int = 10,
         **kwargs,
     ):
         online_evaluator_l = [
-            FactorSingleColumnEvaluator(),
-            FactorOutputFormatEvaluator(),
-            FactorRowCountEvaluator(),
-            FactorIndexEvaluator(),
-            FactorMissingValuesEvaluator(),
-            FactorEqualValueCountEvaluator(),
-            FactorCorrelationEvaluator(hard_check=False),
+            FactorSingleColumnEvaluator(scen),
+            FactorOutputFormatEvaluator(scen),
+            FactorRowCountEvaluator(scen),
+            FactorIndexEvaluator(scen),
+            FactorMissingValuesEvaluator(scen),
+            FactorEqualValueCountEvaluator(scen),
+            FactorCorrelationEvaluator(hard_check=False, scen=scen),
         ]
         super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs)
         self.test_round = test_round
diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py
index 99395c36..28c37b10 100644
--- a/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py
+++ b/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py
@@ -8,7 +8,7 @@
     FactorTask,
 )
 from rdagent.components.loader.experiment_loader import FactorExperimentLoader
-from rdagent.core.experiment import Loader
+from rdagent.core.experiment import Experiment, Loader
 from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorExperiment
 
 
@@ -47,7 +47,7 @@ class FactorTestCaseLoaderFromJsonFile:
     def load(self, json_file_path: Path) -> list:
         with open(json_file_path, "r") as file:
             factor_dict = json.load(file)
-        TestData = TestCase()
+        TestData = TestCase(target_task=Experiment(sub_tasks=[]))
         for factor_name, factor_data in factor_dict.items():
             task = FactorTask(
                 factor_name=factor_name,
@@ -55,9 +55,13 @@ def load(self, json_file_path: Path) -> list:
                 factor_formulation=factor_data["formulation"],
                 variables=factor_data["variables"],
             )
-            gt = FactorFBWorkspace(task, code=factor_data["gt_code"])
+            gt = FactorFBWorkspace(task)
+            code = {
+                "factor.py": factor_data["gt_code"]
+            }
+            gt.inject_code(**code)
             gt.execute()
-            TestData.target_task.append(task)
+            TestData.target_task.sub_tasks.append(task)
             TestData.ground_truth.append(gt)
 
         return TestData

From 6d022b8972c69fe9790cb92e4039a035499e6993 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Tue, 23 Jul 2024 07:23:58 +0000
Subject: [PATCH 13/40] Optimize the prompt for generating hypotheses and
 feedback in the factor loop.

---
 rdagent/scenarios/qlib/prompts.yaml | 60 +++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
index 1ee860ae..bc993f42 100644
--- a/rdagent/scenarios/qlib/prompts.yaml
+++ b/rdagent/scenarios/qlib/prompts.yaml
@@ -66,18 +66,36 @@ model_hypothesis_specification: |-
 
 factor_hypothesis_specification: |-
   Additional Specifications:
-    Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple. Gradually build up upon previous hypotheses and feedback.
-    Ensure that the hypothesis focuses on the creation and selection of factors in quantitative finance. Each hypothesis should address specific factor characteristics such as type (momentum, value, quality), calculation methods, or inclusion criteria. Avoid hypotheses related to model architecture or optimization processes.
-
-  Sample Hypotheses (Only learn from the format as these are not the knowledge):
-    - "Include a momentum factor based on the last 12 months' returns."
-    - "Add a value factor calculated as the book-to-market ratio."
-    - "Incorporate a quality factor derived from return on equity (ROE)."
-    - "Use a volatility factor based on the standard deviation of returns over the past 6 months."
-    - "Include a sentiment factor derived from news sentiment scores."
-    - "The momentum factor should be calculated using a 6-month look-back period."
-    - "Combine value and momentum factors using a weighted average approach."
-    - "Filter stocks by market capitalization before calculating the factors."
+    - Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple.
+    - Gradually build up upon previous hypotheses and feedback.
+    - Ensure that the hypothesis focuses on the creation and selection of factors in quantitative finance.
+    - Each hypothesis should address specific factor characteristics such as type (momentum, value, quality), calculation methods, or inclusion criteria.
+    - Avoid hypotheses related to model architecture or optimization processes.
+  
+  Guiding Principles:
+  1. Diversity and Depth:
+    - Ensure a wide range of factor types, incorporating various financial dimensions (e.g., momentum, value, quality, volatility, sentiment).
+    - Explore different calculation methods and inclusion criteria to understand their impact.
+    - Consider combining multiple factors or filtering criteria for more sophisticated hypotheses.
+  2. Iterative Improvement:
+    - Build upon previous hypotheses, incorporating feedback and observed results.
+    - Aim for continuous refinement and complexity over iterations, starting from basic factors to more advanced combinations and techniques.
+  3. Contextual Relevance:
+    - Tailor hypotheses to the specific financial context and current market conditions.
+    - Leverage domain knowledge and recent financial research to inform hypothesis creation.
+
+  Sample Hypotheses (Use the format for guidance, not the specific content):
+  - "Include a momentum factor based on the last 12 months' returns."
+  - "Add a value factor calculated as the book-to-market ratio."
+  - "Incorporate a quality factor derived from return on equity (ROE)."
+  - "Use a volatility factor based on the standard deviation of returns over the past 6 months."
+  - "Include a sentiment factor derived from news sentiment scores."
+  - "The momentum factor should be calculated using a 6-month look-back period."
+  - "Combine value and momentum factors using a weighted average approach."
+  - "Filter stocks by market capitalization before calculating the factors."
+  - "Explore a liquidity factor based on the trading volume and bid-ask spread."
+  - "Investigate the impact of an earnings surprise factor calculated from recent earnings announcements."
+  - "Develop a composite factor integrating ESG (Environmental, Social, Governance) scores with traditional financial metrics."
 
 factor_experiment_output_format: |-
   The output should follow JSON format. The schema is as follows:
@@ -119,7 +137,7 @@ model_experiment_output_format: |-
   
 factor_feedback_generation:
   system: |-
-    You are a professional result analysis assistant on data driven R&D.
+    You are a professional result analysis assistant in data-driven R&D.
     The task is described in the following scenario:
     {{ scenario }}
     You will receive a hypothesis, multiple tasks with their factors, and some results.
@@ -142,8 +160,7 @@ factor_feedback_generation:
     {{ combined_result }}
     Analyze the combined result in the context of its ability to:
     1. Support or refute the hypothesis.
-    2. Show improvement or deterioration compared to the last experiment.
-    3. Demonstrate positive or negative effects when compared to Alpha158.
+    2. Show improvement or deterioration compared to the SOTA experiment.
 
     Evaluation Metrics Explanations:
     Below are the financial meanings of each metric, which should be used to judge the results:
@@ -157,8 +174,17 @@ factor_feedback_generation:
     - IC: Measures the correlation between predicted returns (\hat{y}) and actual returns (y), using Pearson correlation.
     - 1day.excess_return_with_cost.information_ratio: Evaluates the excess return per unit of risk considering transaction costs.
 
-    When judging the results, prioritize metrics that consider transaction costs (with cost), as they provide a more accurate representation of real-world performance. Among these, the annualized return considering transaction costs is particularly important as it gives a clear picture of long-term profitability.
-    Provide detailed feedback and recommend whether to replace the best result if the new factor proves superior.
+    When judging the results:
+    1. Prioritize metrics that consider transaction costs (with cost):
+        - These metrics provide a more accurate representation of real-world performance.
+    2. Evaluate all metrics:
+        - Compare the combined results against the current best results across all metrics to get a comprehensive view of performance.
+    3. Focus on the annualized return considering transaction costs:
+        - This metric is particularly important as it gives a clear picture of long-term profitability.
+    4. Recommendation for replacement:
+        - If the new factor demonstrates a significant improvement in the annualized return considering transaction costs, it should be recommended to replace the current best result, even if other metrics show minor variations.
+
+    Please provide detailed feedback and recommend whether to replace the best result if the new factor proves superior.
 
 model_feedback_generation:
   system: |-

From c51a6f08a2b73f547697940c44022b4b5567b769 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Tue, 23 Jul 2024 08:06:14 +0000
Subject: [PATCH 14/40] Generate new data

---
 rdagent/components/benchmark/analysis.py  | 63 +++++++++++++++++++++++
 rdagent/components/benchmark/example.json |  6 +--
 2 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 rdagent/components/benchmark/analysis.py

diff --git a/rdagent/components/benchmark/analysis.py b/rdagent/components/benchmark/analysis.py
new file mode 100644
index 00000000..1aedb7cc
--- /dev/null
+++ b/rdagent/components/benchmark/analysis.py
@@ -0,0 +1,63 @@
+import pickle
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Function to load and process each pickle file
+def process_pickle_file(file_path):
+    try:
+        with open(file_path, 'rb') as file:
+            data = pickle.load(file)
+        # Assuming data is a DataFrame or similar
+        print(f"Data from {file_path} processed successfully.")
+        return data
+    except Exception as e:
+        print(f"Error processing {file_path}: {e}")
+        return None
+    
+def analysis(folder_path):
+    success_count = 0
+    fail_count = 0
+
+    # Logging the errors
+    error_log = open("error_log.log", "w")
+
+    # List to store data for visualization
+    data_frames = []
+
+    # Processing each file in the directory
+    for file_name in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, file_name)
+        data = process_pickle_file(file_path)
+        if data is not None:
+            data_frames.append(data)
+
+    for df in data_frames:
+        if 'Execution succeeded' in df[0]:
+            success_count += 1
+        else:
+            fail_count += 1
+            error_log.write(f"{file_path}: \n{df[0]}\n")
+
+    # Writing summary
+    print(f"Number of successful files: {success_count}")
+    print(f"Number of failed files: {fail_count}")
+
+    # Closing the error log file
+    error_log.close()
+
+def view_pickle_file(folder_path):
+    for file_name in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, file_name)
+
+        print(f'the path of this file is: {file_path}\n')
+        with open(file_path, 'rb') as file:
+            data = pickle.load(file)
+            for i in range(len(data)):
+                print(data[i])
+
+
+if __name__ == '__main__':
+    folder_path = '/data/userdata/v-taozhiwang/RD-Agent/git_ignore_folder/factor_implementation_execution_cache'
+    
+    analysis(folder_path)
\ No newline at end of file
diff --git a/rdagent/components/benchmark/example.json b/rdagent/components/benchmark/example.json
index b69ffd8e..f1e5a85c 100644
--- a/rdagent/components/benchmark/example.json
+++ b/rdagent/components/benchmark/example.json
@@ -6,7 +6,7 @@
             "20-day turnover rate": "Average turnover rate over the past 20 days.",
             "Market Capitalization": "Total market value of a company's outstanding shares."
         },
-        "gt_code": "import pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\nwindow_size = 20\n\nnominator=data.groupby('instrument')[['30\u65e5\u6362\u624b\u7387']].rolling(window=window_size).mean().reset_index(0, drop=True)\n# transfer to series\nnew=nominator['30\u65e5\u6362\u624b\u7387']\ndata['Turnover_Rate_Factor']=new/data['\u6d41\u901aA\u80a1']\n\n# # set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['Turnover_Rate_Factor']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['Turnover_Rate_Factor']\nresult.to_hdf(\"result.h5\", key=\"data\")\n"
+        "gt_code": "import pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\nwindow_size = 20\n\nnominator=data.groupby('instrument')[['TurnoverRate_30D']].rolling(window=window_size).mean().reset_index(0, drop=True)\n# transfer to series\nnew=nominator['TurnoverRate_30D']\ndata['Turnover_Rate_Factor']=new/data['TradableACapital']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['Turnover_Rate_Factor']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['Turnover_Rate_Factor']\nresult.to_hdf(\"result.h5\", key=\"data\")" 
     },
     "PctTurn20": {
         "description": "A factor representing the percentage change in turnover rate over the past 20 trading days, market-value neutralized.",
@@ -16,7 +16,7 @@
             "Turnover_{i, t}": "Turnover of stock i at day t.",
             "Turnover_{i, t-20}": "Turnover of stock i at day t-20."
         },
-        "gt_code": "import pandas as pd\nfrom statsmodels import api as sm\n\n\ndef fill_mean(s: pd.Series) -> pd.Series:\n    return s.fillna(s.mean()).fillna(0.0)\n\n\ndef market_value_neutralize(s: pd.Series, mv: pd.Series) -> pd.Series:\n    s = s.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n    mv = mv.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n\n    df_f = mv.to_frame(\"\u5e02\u503c\")\n    df_f[\"const\"] = 1\n    X = df_f[[\"\u5e02\u503c\", \"const\"]]\n\n    # Perform the Ordinary Least Squares (OLS) regression\n    model = sm.OLS(s, X)\n    results = model.fit()\n\n    # Calculate the residuals\n    df_f[\"residual\"] = results.resid\n    df_f[\"norm_resi\"] = df_f.groupby(level=\"datetime\", group_keys=False)[\"residual\"].apply(\n        lambda x: (x - x.mean()) / x.std(),\n    )\n    return df_f[\"norm_resi\"]\n\n\n# get_turnover\ndf_pv = pd.read_hdf(\"daily_pv.h5\", key=\"data\")\ndf_f = pd.read_hdf(\"daily_f.h5\", key=\"data\")\nturnover = df_pv[\"$money\"] / df_f[\"\u6d41\u901a\u5e02\u503c\"]\n\nf = turnover.groupby(\"instrument\").pct_change(periods=20)\n\nf_neutralized = market_value_neutralize(f, df_f[\"\u6d41\u901a\u5e02\u503c\"])\n\nf_neutralized.to_hdf(\"result.h5\", key=\"data\")\n"
+        "gt_code": "import pandas as pd\nfrom statsmodels import api as sm\n\ndef fill_mean(s: pd.Series) -> pd.Series:\n    return s.fillna(s.mean()).fillna(0.0)\n\ndef market_value_neutralize(s: pd.Series, mv: pd.Series) -> pd.Series:\n    s = s.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n    mv = mv.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n\n    df_f = mv.to_frame(\"MarketValue\")\n    df_f[\"const\"] = 1\n    X = df_f[[\"MarketValue\", \"const\"]]\n\n    # Perform the Ordinary Least Squares (OLS) regression\n    model = sm.OLS(s, X)\n    results = model.fit()\n\n    # Calculate the residuals\n    df_f[\"residual\"] = results.resid\n    df_f[\"norm_resi\"] = df_f.groupby(level=\"datetime\", group_keys=False)[\"residual\"].apply(\n        lambda x: (x - x.mean()) / x.std(),\n    )\n    return df_f[\"norm_resi\"]\n\n\n# get_turnover\ndf_pv = pd.read_hdf(\"daily_pv.h5\", key=\"data\")\ndf_f = pd.read_hdf(\"daily_f.h5\", key=\"data\")\nturnover = df_pv[\"$money\"] / df_f[\"TradableMarketValue\"]\n\nf = turnover.groupby(\"instrument\").pct_change(periods=20)\n\nf_neutralized = market_value_neutralize(f, df_f[\"TradableMarketValue\"])\n\nf_neutralized.to_hdf(\"result.h5\", key=\"data\")"
     },
     "PB_ROE": {
         "description": "Constructed using the ranking difference between PB and ROE, with PB and ROE replacing original PB and ROE to obtain reconstructed factor values.",
@@ -25,6 +25,6 @@
             "\\text{rank}(PB_t)": "Ranking PB on cross-section at time t.",
             "\\text{rank}(ROE_t)": "Ranking single-quarter ROE on cross-section at time t."
         },
-        "gt_code": "#!/usr/bin/env python\n\nimport pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\n\n# Calculate the rank of PB and ROE\ndata['PB_rank'] = data.groupby('datetime')['B/P'].rank()\ndata['ROE_rank'] = data.groupby('datetime')['ROE'].rank()\n\n# Calculate the difference between the ranks\ndata['PB_ROE'] = data['PB_rank'] - data['ROE_rank']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['PB_ROE']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['PB_ROE']\nresult.to_hdf(\"result.h5\", key=\"data\")\n"
+        "gt_code": "#!/usr/bin/env python\n\nimport pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\n\n# Calculate the rank of PB and ROE\ndata['PB_rank'] = data.groupby('datetime')['B/P'].rank()\ndata['ROE_rank'] = data.groupby('datetime')['ROE'].rank()\n\n# Calculate the difference between the ranks\ndata['PB_ROE'] = data['PB_rank'] - data['ROE_rank']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['PB_ROE']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['PB_ROE']\nresult.to_hdf(\"result.h5\", key=\"data\")"
     }
 }
\ No newline at end of file

From 90bd7e3923d1ab103a98b737d4405ec4bacad1b6 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Wed, 24 Jul 2024 01:28:53 +0000
Subject: [PATCH 15/40] dataset generation

---
 .../components/benchmark/generate_dataset.py  | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 rdagent/components/benchmark/generate_dataset.py

diff --git a/rdagent/components/benchmark/generate_dataset.py b/rdagent/components/benchmark/generate_dataset.py
new file mode 100644
index 00000000..6e0520d1
--- /dev/null
+++ b/rdagent/components/benchmark/generate_dataset.py
@@ -0,0 +1,72 @@
+import pandas as pd
+import numpy as np
+import random
+import string
+
+def create_new_hdf5_file(file_path, new_path):
+    """ Create a new HDF5 file with random data. """
+    # Load the dataset
+    data = pd.read_hdf(file_path, key='data')
+
+    columns = [] # TODO select the column we want to keep
+    selected_data = data[columns]
+
+    # Generate new data for each column
+    new_data = pd.DataFrame(index=selected_data.index)
+
+    for column in selected_data.columns:
+        if column == 'B/P':
+            mean = selected_data[column].mean().values[0]
+            std = selected_data[column].std().values[0]
+        else:
+            mean = selected_data[column].mean()
+            std = selected_data[column].std()
+        new_data[column] = np.random.normal(mean, std, size=selected_data.shape[0])
+
+    # Save the new dataset
+    new_data.to_hdf(new_path, key='data', mode='w')
+
+    print("New dataset created and saved successfully!")
+
+def change_head(path):
+    data = pd.read_hdf(path, key='data')
+    columns = [
+        'B/P', 'ROE',
+        'TotalCapitalStock', 'TradableACapital', 'TotalMarketValue', 'TradableMarketValue', 'StockPrice', 
+        'E/P', 'ECut/P', 'EBIT/EV', 'EBITDA/EV', 'ROA_Q', 'MACrossover', 'QuarterlyUnrestrictedShareholdersRatioChange', 
+        'HSZZ_ALPHA_3M', 'ROE_Q', 'ROA_TTM', 'S_ROAG', 'ExternalFinancingScale_2Y', 'ConMarketConf_5D', 
+        'NetProfitSequentialQuarterlyChange', 'SemiannualUnrestrictedShareholdersRatioChange', 'STD_12M', 
+        'LarSmaDiffSellValue', 'OperatingCashFlowRatio', 'TurnoverRate_30D', 'HSZZ_R2_3M', 'Sales_Growth_3Y', 
+        'PricePosition_30D', 'NetProfitMargin', 'OperatingProfitYOY', 'SalesToCashRatio', 
+        'FutureUnrestrictedRatio_3M', 'HSZZ_ALPHA_12M', 'Idiosyncrasy', 'RatingChange', 'TSKEW', 
+        'WeeklyConsensusChangeJR_1W', 'HSZZ_BETA_3M', 'PricePosition_180D', 'MedSellValue', 
+        'UnlimitedShareholdersAverageAmount', 'T_ROEG', 'QuarterlyAverageShareholdersRatioChange', 
+        'FixedAssetTurnover', 'MonthlyRatingChange_1M', 'FutureUnrestrictedRatio_6M', 'TurnoverRate_30D_90D', 
+        'Sales2EV', 'ILLIQ_1M', 'Profit_Growth_TTM', 'HighLow_1M', 'OperationCash_TTM', 
+        'FutureUnrestrictedRatio_6MOver30DAvgTurnover', 'TurnoverRate_30D_180D', 'GrossProfitMargin', 
+        'AnalystMomentumScore', 'ShareExpansionRatio_2Y', 'ROIC', 'TurnoverRate_60D', 
+        'ExternalFinancingAdjustedGrowthRate', 'Weighted_Strength_3M', 'Weighted_Strength_1M', 
+        'FutureUnrestrictedRatio_1MOver30DAvgTurnover', 'OperatingCashFlowOverRevenue_TTM', 'ConMarketConf_10D', 
+        'HSZZ_ResidualStd_3M', 'RevenueSequentialYOY', 'RevenueYOY', 'EXTE'
+    ]
+
+    data.columns = columns
+    data.to_hdf(path, key='data', mode='w')
+    print("Head changed successfully!")
+
+def view_hdf5_file(filename):
+    with pd.HDFStore(filename, 'r') as store:
+        print("Keys in the file:", store.keys())
+        data = store['data']
+        print(data.head())
+        print("\nSummary statistics:\n", data.describe())  
+        print(data.index)
+
+
+
+if __name__ == '__main__':
+    path = ''
+    new_path = ''
+    create_new_hdf5_file(file_path=path, new_path=new_path)
+    change_head(new_path)
+    view_hdf5_file(new_path)
\ No newline at end of file

From 4fd9733b06142382194d32d91ec45ebed052a936 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Wed, 24 Jul 2024 02:59:02 +0000
Subject: [PATCH 16/40] Performed further optimizations on the factor loop and
 report extraction loop, added log handling for both processes, and
 implemented a screenshot feature for report extraction.

---
 rdagent/app/qlib_rd_loop/factor.py            |  5 ++-
 .../app/qlib_rd_loop/factor_from_report_sh.py | 45 ++++++++++++-------
 .../coder/factor_coder/CoSTEER/evaluators.py  | 44 +++++++++++-------
 .../document_reader/document_reader.py        | 10 +++++
 .../scenarios/qlib/experiment/workspace.py    |  6 ---
 .../factor_experiment_loader/pdf_loader.py    | 25 ++++++++---
 rdagent/scenarios/qlib/prompts.yaml           | 26 ++++++++++-
 7 files changed, 115 insertions(+), 46 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/factor.py b/rdagent/app/qlib_rd_loop/factor.py
index 4d23d7b7..a748dd8e 100644
--- a/rdagent/app/qlib_rd_loop/factor.py
+++ b/rdagent/app/qlib_rd_loop/factor.py
@@ -36,7 +36,7 @@
 trace = Trace(scen=scen)
 for _ in range(PROP_SETTING.evolving_n):
     try:
-        with logger.tag("r"):  # research
+        with logger.tag("r"):
             hypothesis = hypothesis_gen.gen(trace)
             logger.log_object(hypothesis, tag="hypothesis generation")
 
@@ -49,6 +49,9 @@
 
         with logger.tag("ef"):
             exp = qlib_factor_runner.develop(exp)
+            if exp is None:
+                logger.error(f"Factor extraction failed.")
+                continue
             logger.log_object(exp, tag="factor runner result")
             feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
             logger.log_object(feedback, tag="feedback")
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
index 1b1bddcc..94396ede 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -7,7 +7,7 @@
 import pandas as pd
 
 from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
-from rdagent.components.document_reader.document_reader import load_and_process_pdfs_by_langchain
+from rdagent.components.document_reader.document_reader import extract_first_page_screenshot_from_pdf, load_and_process_pdfs_by_langchain
 from rdagent.core.prompts import Prompts
 from rdagent.core.scenario import Scenario
 from rdagent.core.utils import import_class
@@ -88,7 +88,11 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
             exp = FactorExperimentLoaderFromPDFfiles().load(report_file_path)
             if exp is None or exp.sub_tasks == []:
                 return None, None
-            
+
+        with logger.tag("load_pdf_screenshot"):
+            pdf_screenshot = extract_first_page_screenshot_from_pdf(report_file_path)
+            logger.log_object(pdf_screenshot, tag="load_pdf_screenshot")
+
     docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))
 
     factor_result = {
@@ -118,19 +122,30 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
             report_file_path = Path(file_path.replace(PROP_SETTING.origin_report_path, PROP_SETTING.local_report_path))
             if report_file_path.exists():
                 logger.info(f"Processing {report_file_path}")
-                exp, hypothesis = extract_factors_and_implement(str(report_file_path))
-                if exp is None:
-                    continue
-                exp.based_experiments = [t[1] for t in trace.hist if t[2]]
-                if len(exp.based_experiments) == 0:
-                    exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
-                exp = qlib_factor_coder.develop(exp)
-                exp = qlib_factor_runner.develop(exp)
-                if exp is None:
-                    logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
-                    continue
-                feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
-
+                
+                with logger.tag("r"):
+                    exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+                    if exp is None:
+                        continue
+                    exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+                    if len(exp.based_experiments) == 0:
+                        exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+                    logger.log_object(hypothesis, tag="hypothesis generation")
+                    logger.log_object(exp.sub_tasks, tag="experiment generation")
+                
+                with logger.tag("d"):
+                    exp = qlib_factor_coder.develop(exp)
+                    logger.log_object(exp.sub_workspace_list)
+
+                with logger.tag("ef"):
+                    exp = qlib_factor_runner.develop(exp)
+                    if exp is None:
+                        logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+                        continue
+                    logger.log_object(exp, tag="factor runner result")
+                    feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
+                    logger.log_object(feedback, tag="feedback")
+                
                 trace.hist.append((hypothesis, exp, feedback))
                 logger.info(f"Processed {report_file_path}: Result: {exp}")
                 
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
index 00b59d8e..edd2cbd3 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -437,21 +437,35 @@ def evaluate(
             else:
                 break
 
-        final_evaluation_dict = json.loads(
-            APIBackend().build_messages_and_create_chat_completion(
-                user_prompt=user_prompt,
-                system_prompt=system_prompt,
-                json_mode=True,
-            ),
-        )
-        if isinstance(final_evaluation_dict["final_decision"], str) and final_evaluation_dict[
-            "final_decision"
-        ].lower() in ("true", "false"):
-            final_evaluation_dict["final_decision"] = bool(final_evaluation_dict["final_decision"])
-        return (
-            final_evaluation_dict["final_decision"],
-            final_evaluation_dict["final_feedback"],
-        )
+        final_evaluation_dict = None
+        attempts = 0
+        max_attempts = 3
+
+        while attempts < max_attempts:
+            try:
+                final_evaluation_dict = json.loads(
+                    APIBackend().build_messages_and_create_chat_completion(
+                        user_prompt=user_prompt,
+                        system_prompt=system_prompt,
+                        json_mode=True,
+                    ),
+                )
+                final_decision = final_evaluation_dict["final_decision"]
+                final_feedback = final_evaluation_dict["final_feedback"]
+
+                if isinstance(final_decision, str) and final_decision.lower() in ("true", "false"):
+                    final_decision = bool(final_decision)
+
+                return final_decision, final_feedback
+
+            except json.JSONDecodeError as e:
+                raise ValueError("Failed to decode JSON response from API.") from e
+            except KeyError as e:
+                attempts += 1
+                if attempts >= max_attempts:
+                    raise KeyError("Response from API is missing 'final_decision' or 'final_feedback' key after multiple attempts.") from e
+        
+        return None, None
 
 
 class FactorSingleFeedback:
diff --git a/rdagent/components/document_reader/document_reader.py b/rdagent/components/document_reader/document_reader.py
index db5cc570..9341066f 100644
--- a/rdagent/components/document_reader/document_reader.py
+++ b/rdagent/components/document_reader/document_reader.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from pathlib import Path
+import fitz
+from PIL import Image
 from typing import TYPE_CHECKING
 
 from azure.ai.formrecognizer import DocumentAnalysisClient
@@ -96,3 +98,11 @@ def load_and_process_pdfs_by_azure_document_intelligence(path: Path) -> dict[str
                     RD_AGENT_SETTINGS.azure_document_intelligence_endpoint,
                 )
     return content_dict
+
+def extract_first_page_screenshot_from_pdf(pdf_path: Path) -> Image:
+    doc = fitz.open(pdf_path)
+    page = doc.load_page(0)
+    pix = page.get_pixmap()
+    image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    
+    return image
\ No newline at end of file
diff --git a/rdagent/scenarios/qlib/experiment/workspace.py b/rdagent/scenarios/qlib/experiment/workspace.py
index b9b58038..6f549a7e 100644
--- a/rdagent/scenarios/qlib/experiment/workspace.py
+++ b/rdagent/scenarios/qlib/experiment/workspace.py
@@ -17,12 +17,6 @@ def execute(self, qlib_config_name: str = "conf.yaml", run_env: dict = {}, *args
         qtde = QTDockerEnv()
         qtde.prepare()
 
-        # Run the Docker command
-        execute_log = qtde.run(
-            local_path=str(self.workspace_path),
-            entry="rm -r mlruns",
-            env=run_env,
-        )
         # Run the Qlib backtest
         execute_log = qtde.run(
             local_path=str(self.workspace_path),
diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
index c9390fc7..5a265b4f 100644
--- a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
+++ b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
@@ -70,7 +70,7 @@ def classify_report_from_dict(
         if isinstance(value, str):
             content = value
         else:
-            logger.warning(f"输入格式不符合要求: {file_name}")
+            logger.warning(f"Input format does not meet the requirements: {file_name}")
             res_dict[file_name] = {"class": 0}
             continue
 
@@ -102,7 +102,7 @@ def classify_report_from_dict(
                 res = json.loads(res)
                 vote_list.append(int(res["class"]))
             except json.JSONDecodeError:
-                logger.warning(f"返回值无法解析: {file_name}")
+                logger.warning(f"Return value could not be parsed: {file_name}")
                 res_dict[file_name] = {"class": 0}
             count_0 = vote_list.count(0)
             count_1 = vote_list.count(1)
@@ -243,7 +243,7 @@ def extract_factors_from_report_dict(
     )
     for index, file_name in enumerate(file_name_list):
         final_report_factor_dict[file_name] = factor_dict_list[index]
-    logger.info(f"已经完成{len(final_report_factor_dict)}个报告的因子提取")
+    logger.info(f"Factor extraction completed for {len(final_report_factor_dict)} reports")
 
     return final_report_factor_dict
 
@@ -507,13 +507,24 @@ def deduplicate_factors_by_llm(  # noqa: C901, PLR0912
 
 class FactorExperimentLoaderFromPDFfiles(FactorExperimentLoader):
     def load(self, file_or_folder_path: Path) -> dict:
-        docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))
+        with logger.tag("docs"):
+            docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))
+            logger.log_object(docs_dict, tag="docs dict")
 
         selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1)
-        file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
-        factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)
+        
+        with logger.tag("file_to_factor_result"):
+            file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
+            logger.log_object(file_to_factor_result, tag="file_to_factor_result")
+        
+        with logger.tag("factor_dict"):
+            factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)
+            logger.log_object(factor_dict, tag="factor_dict")
+
+        with logger.tag("filtered_factor_dict"):
+            factor_viability, filtered_factor_dict = check_factor_viability(factor_dict)
+            logger.log_object(filtered_factor_dict, tag="filtered_factor_dict")
 
-        factor_viability, filtered_factor_dict = check_factor_viability(factor_dict)
         # factor_dict, duplication_names_list = deduplicate_factors_by_llm(factor_dict, factor_viability)
         
         return FactorExperimentLoaderFromDict().load(filtered_factor_dict)
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
index bc993f42..919cd66d 100644
--- a/rdagent/scenarios/qlib/prompts.yaml
+++ b/rdagent/scenarios/qlib/prompts.yaml
@@ -67,19 +67,22 @@ model_hypothesis_specification: |-
 factor_hypothesis_specification: |-
   Additional Specifications:
     - Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple.
-    - Gradually build up upon previous hypotheses and feedback.
+    - Gradually build upon previous hypotheses and feedback.
     - Ensure that the hypothesis focuses on the creation and selection of factors in quantitative finance.
     - Each hypothesis should address specific factor characteristics such as type (momentum, value, quality), calculation methods, or inclusion criteria.
     - Avoid hypotheses related to model architecture or optimization processes.
-  
+    - If a hypothesis can be improved further, refine it. If it achieves the desired results, explore a new direction. Previous factors exceeding SOTA (State of the Art) are preserved and combined with new factors for subsequent evaluations.
+
   Guiding Principles:
   1. Diversity and Depth:
     - Ensure a wide range of factor types, incorporating various financial dimensions (e.g., momentum, value, quality, volatility, sentiment).
     - Explore different calculation methods and inclusion criteria to understand their impact.
     - Consider combining multiple factors or filtering criteria for more sophisticated hypotheses.
+
   2. Iterative Improvement:
     - Build upon previous hypotheses, incorporating feedback and observed results.
     - Aim for continuous refinement and complexity over iterations, starting from basic factors to more advanced combinations and techniques.
+
   3. Contextual Relevance:
     - Tailor hypotheses to the specific financial context and current market conditions.
     - Leverage domain knowledge and recent financial research to inform hypothesis creation.
@@ -97,6 +100,25 @@ factor_hypothesis_specification: |-
   - "Investigate the impact of an earnings surprise factor calculated from recent earnings announcements."
   - "Develop a composite factor integrating ESG (Environmental, Social, Governance) scores with traditional financial metrics."
 
+  Detailed Workflow:
+  1. Initial Hypothesis:
+    - Begin with a simple factor, such as "Include a momentum factor based on the last 12 months' returns."
+
+  2. Refine Hypothesis:
+    - If the initial hypothesis is promising, refine it further, e.g., "The momentum factor should be calculated using a 6-month look-back period."
+
+  3. Combine Factors:
+    - As individual factors show potential, combine them, e.g., "Combine value and momentum factors using a weighted average approach."
+
+  4. Contextual Adjustments:
+    - Adjust factors based on market conditions or new financial insights, e.g., "Incorporate a quality factor derived from return on equity (ROE)."
+
+  5. Advanced Hypotheses:
+    - Explore sophisticated combinations or new types of factors, e.g., "Develop a composite factor integrating ESG scores with traditional financial metrics."
+
+  Remember: If a hypothesis achieves the desired results, start a new direction while preserving the effective factors from previous hypotheses. New evaluations should combine the newly proposed factors with previously successful factors that surpassed SOTA.
+
+
 factor_experiment_output_format: |-
   The output should follow JSON format. The schema is as follows:
   {

From 1d66f163cd43dca6c17521ba5d3420cdbea2717a Mon Sep 17 00:00:00 2001
From: you-n-g <you-n-g@users.noreply.github.com>
Date: Wed, 24 Jul 2024 12:06:42 +0800
Subject: [PATCH 17/40] Update
 rdagent/components/coder/factor_coder/CoSTEER/evaluators.py

---
 rdagent/components/coder/factor_coder/CoSTEER/evaluators.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
index edd2cbd3..a1ddf6f7 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -437,6 +437,7 @@ def evaluate(
             else:
                 break
 
+        # TODO:  with retry_context(retry_n=3, except_list=[KeyError]):
         final_evaluation_dict = None
         attempts = 0
         max_attempts = 3

From b1bdfdde63c4c3f89c713ba738de0197822f242e Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Wed, 24 Jul 2024 04:13:09 +0000
Subject: [PATCH 18/40] Update package.txt for fitz.

---
 requirements/package.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements/package.txt b/requirements/package.txt
index 0f2ee084..6ea91c33 100644
--- a/requirements/package.txt
+++ b/requirements/package.txt
@@ -19,6 +19,7 @@ langchain
 tiktoken
 scikit-learn
 docker
+fitz  # Extract screenshots from pdf
 
 # azure identity related
 azure.identity

From 864f5a0c2cff0d83765f0dd67a90597bc3bb0754 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Wed, 24 Jul 2024 06:04:24 +0000
Subject: [PATCH 19/40] add the result

---
 rdagent/app/quant_factor_benchmark/eval.py    | 24 +++++++
 rdagent/components/benchmark/conf.py          |  2 +-
 .../components/benchmark/generate_dataset.py  | 72 -------------------
 .../factor_coder/CoSTEER/evolving_agent.py    |  2 +-
 .../components/coder/factor_coder/config.py   |  2 +-
 5 files changed, 27 insertions(+), 75 deletions(-)
 delete mode 100644 rdagent/components/benchmark/generate_dataset.py

diff --git a/rdagent/app/quant_factor_benchmark/eval.py b/rdagent/app/quant_factor_benchmark/eval.py
index d9cb4948..5924401c 100644
--- a/rdagent/app/quant_factor_benchmark/eval.py
+++ b/rdagent/app/quant_factor_benchmark/eval.py
@@ -1,3 +1,7 @@
+import os
+from pathlib import Path
+import pickle
+import time
 from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
 from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
     FactorTestCaseLoaderFromJsonFile,
@@ -11,6 +15,7 @@
 from rdagent.core.scenario import Scenario
 from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario
 
+from pprint import pprint
 
 # 1.read the settings
 bs = BenchmarkSettings()
@@ -35,6 +40,25 @@
 # 5.run the eval
 res = eval_method.eval()
 
+# 6.save the result
+pprint(res)
+
+res_workspace = (Path().cwd() / "git_ignore_folder" / "eval_results").absolute()
+print(str(res_workspace))
+
+# Save results
+timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
+
+if not os.path.exists(str(res_workspace)):
+    os.makedirs(str(res_workspace))
+
+df_file_path = res_workspace / ("result_" + timestamp + ".csv")
+res_pkl_path = res_workspace / ("res_promptV2" + timestamp + ".pkl")
+res_pkl_path = res_workspace / ("res_promptV2" + timestamp + ".pkl")
+with open(str(res_pkl_path), "wb") as file:
+    # file.write(str(res))
+    pickle.dump(res, file)
+
 # TODO:
 # - Run it:
 # - factor input data generator;
diff --git a/rdagent/components/benchmark/conf.py b/rdagent/components/benchmark/conf.py
index 7ebebc5d..cefd6391 100644
--- a/rdagent/components/benchmark/conf.py
+++ b/rdagent/components/benchmark/conf.py
@@ -15,7 +15,7 @@ class BenchmarkSettings(BaseSettings):
 
     bench_data_path: Path = DIRNAME / "example.json"
 
-    bench_test_round: int = 10
+    bench_test_round: int = 1
     bench_test_case_n: Optional[int] = None  # how many test cases to run; If not given, all test cases will be run
 
     bench_method_cls: str = "rdagent.components.coder.factor_coder.CoSTEER.FactorCoSTEER"
diff --git a/rdagent/components/benchmark/generate_dataset.py b/rdagent/components/benchmark/generate_dataset.py
deleted file mode 100644
index 6e0520d1..00000000
--- a/rdagent/components/benchmark/generate_dataset.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import pandas as pd
-import numpy as np
-import random
-import string
-
-def create_new_hdf5_file(file_path, new_path):
-    """ Create a new HDF5 file with random data. """
-    # Load the dataset
-    data = pd.read_hdf(file_path, key='data')
-
-    columns = [] # TODO select the column we want to keep
-    selected_data = data[columns]
-
-    # Generate new data for each column
-    new_data = pd.DataFrame(index=selected_data.index)
-
-    for column in selected_data.columns:
-        if column == 'B/P':
-            mean = selected_data[column].mean().values[0]
-            std = selected_data[column].std().values[0]
-        else:
-            mean = selected_data[column].mean()
-            std = selected_data[column].std()
-        new_data[column] = np.random.normal(mean, std, size=selected_data.shape[0])
-
-    # Save the new dataset
-    new_data.to_hdf(new_path, key='data', mode='w')
-
-    print("New dataset created and saved successfully!")
-
-def change_head(path):
-    data = pd.read_hdf(path, key='data')
-    columns = [
-        'B/P', 'ROE',
-        'TotalCapitalStock', 'TradableACapital', 'TotalMarketValue', 'TradableMarketValue', 'StockPrice', 
-        'E/P', 'ECut/P', 'EBIT/EV', 'EBITDA/EV', 'ROA_Q', 'MACrossover', 'QuarterlyUnrestrictedShareholdersRatioChange', 
-        'HSZZ_ALPHA_3M', 'ROE_Q', 'ROA_TTM', 'S_ROAG', 'ExternalFinancingScale_2Y', 'ConMarketConf_5D', 
-        'NetProfitSequentialQuarterlyChange', 'SemiannualUnrestrictedShareholdersRatioChange', 'STD_12M', 
-        'LarSmaDiffSellValue', 'OperatingCashFlowRatio', 'TurnoverRate_30D', 'HSZZ_R2_3M', 'Sales_Growth_3Y', 
-        'PricePosition_30D', 'NetProfitMargin', 'OperatingProfitYOY', 'SalesToCashRatio', 
-        'FutureUnrestrictedRatio_3M', 'HSZZ_ALPHA_12M', 'Idiosyncrasy', 'RatingChange', 'TSKEW', 
-        'WeeklyConsensusChangeJR_1W', 'HSZZ_BETA_3M', 'PricePosition_180D', 'MedSellValue', 
-        'UnlimitedShareholdersAverageAmount', 'T_ROEG', 'QuarterlyAverageShareholdersRatioChange', 
-        'FixedAssetTurnover', 'MonthlyRatingChange_1M', 'FutureUnrestrictedRatio_6M', 'TurnoverRate_30D_90D', 
-        'Sales2EV', 'ILLIQ_1M', 'Profit_Growth_TTM', 'HighLow_1M', 'OperationCash_TTM', 
-        'FutureUnrestrictedRatio_6MOver30DAvgTurnover', 'TurnoverRate_30D_180D', 'GrossProfitMargin', 
-        'AnalystMomentumScore', 'ShareExpansionRatio_2Y', 'ROIC', 'TurnoverRate_60D', 
-        'ExternalFinancingAdjustedGrowthRate', 'Weighted_Strength_3M', 'Weighted_Strength_1M', 
-        'FutureUnrestrictedRatio_1MOver30DAvgTurnover', 'OperatingCashFlowOverRevenue_TTM', 'ConMarketConf_10D', 
-        'HSZZ_ResidualStd_3M', 'RevenueSequentialYOY', 'RevenueYOY', 'EXTE'
-    ]
-
-    data.columns = columns
-    data.to_hdf(path, key='data', mode='w')
-    print("Head changed successfully!")
-
-def view_hdf5_file(filename):
-    with pd.HDFStore(filename, 'r') as store:
-        print("Keys in the file:", store.keys())
-        data = store['data']
-        print(data.head())
-        print("\nSummary statistics:\n", data.describe())  
-        print(data.index)
-
-
-
-if __name__ == '__main__':
-    path = ''
-    new_path = ''
-    create_new_hdf5_file(file_path=path, new_path=new_path)
-    change_head(new_path)
-    view_hdf5_file(new_path)
\ No newline at end of file
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
index feee0ef7..75661805 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
@@ -14,6 +14,6 @@ def filter_evolvable_subjects_by_feedback(self, evo: EvolvableSubjects, feedback
         assert len(evo.sub_workspace_list) == len(feedback)
 
         for index in range(len(evo.sub_workspace_list)):
-            if not feedback[index].final_decision:
+            if feedback[index] and not feedback[index].final_decision:
                 evo.sub_workspace_list[index].clear()
         return evo
diff --git a/rdagent/components/coder/factor_coder/config.py b/rdagent/components/coder/factor_coder/config.py
index cc3c301c..9a62b4e6 100644
--- a/rdagent/components/coder/factor_coder/config.py
+++ b/rdagent/components/coder/factor_coder/config.py
@@ -10,7 +10,7 @@ class FactorImplementSettings(BaseSettings):
     class Config:
         env_prefix = "FACTOR_CODER_"  # Use FACTOR_CODER_ as prefix for environment variables
 
-    coder_use_cache: bool = False
+    coder_use_cache: bool = True
     data_folder: str = str(
         (Path().cwd() / "git_ignore_folder" / "factor_implementation_source_data").absolute(),
     )

From 048c6fe2356f8b5788404933dbcaaf205ae743ab Mon Sep 17 00:00:00 2001
From: WinstonLiyt <104308117+WinstonLiyt@users.noreply.github.com>
Date: Wed, 24 Jul 2024 15:00:21 +0800
Subject: [PATCH 20/40] Performed further optimizations on the factor loop and
 report extraction loop, added log handling for both processes, and
 implemented a screenshot feature for report extraction. (#100) (#102)

- Performed further optimizations on the factor loop and report extraction loop.
- Added log handling for both processes.
- Implemented a screenshot feature for report extraction.

From f9b57b96cf041c5a7c271e91c5ab3c0b711e4b57 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Wed, 24 Jul 2024 08:01:51 +0000
Subject: [PATCH 21/40] Analysis

---
 .../app/quant_factor_benchmark/analysis.py    | 290 ++++++++++++++++++
 .../quant_factor_benchmark/design/__init__.py |   0
 .../design/benchmark.py                       |  72 +++++
 3 files changed, 362 insertions(+)
 create mode 100644 rdagent/app/quant_factor_benchmark/analysis.py
 create mode 100644 rdagent/app/quant_factor_benchmark/design/__init__.py
 create mode 100644 rdagent/app/quant_factor_benchmark/design/benchmark.py

diff --git a/rdagent/app/quant_factor_benchmark/analysis.py b/rdagent/app/quant_factor_benchmark/analysis.py
new file mode 100644
index 00000000..7dd94cc9
--- /dev/null
+++ b/rdagent/app/quant_factor_benchmark/analysis.py
@@ -0,0 +1,290 @@
+
+from pathlib import Path
+import pickle
+import pandas as pd
+
+from rdagent.app.quant_factor_benchmark.design.benchmark import summarize_res as summarize_res
+
+
+results = {
+    "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
+}
+
+
+def load_data(file_path):
+    file_path = Path(file_path)
+
+    if not (file_path.is_file() and file_path.suffix == ".pkl" and file_path.name.startswith("res_")):
+        raise ValueError("You may get a invalid file path")
+
+    sum_df = []
+    with file_path.open("rb") as f:
+        res = pickle.load(f)
+
+    sum_df.append(summarize_res(res))
+    sum_df = pd.concat(sum_df, axis=1)
+    if sum_df.shape[0] == 0:
+        raise ValueError("No data in the file")
+    print(file_path, sum_df.shape)
+
+    index = [
+        "FactorSingleColumnEvaluator",
+        "FactorOutputFormatEvaluator",
+        "FactorRowCountEvaluator",
+        "FactorIndexEvaluator",
+        "FactorMissingValuesEvaluator",
+        "FactorEqualValueCountEvaluator",
+        "FactorCorrelationEvaluator",
+        "run factor error",
+    ]
+
+    # reindex in case of failing to run evaluator.
+    # If all implemented factors fail to run, some evaluators may not appear in the result.
+    sum_df = sum_df.reindex(index, axis=0)
+
+    sum_df_clean = sum_df.T.groupby(level=0).apply(lambda x: x.reset_index(drop=True))
+
+    # sum_df.columns
+
+    def get_run_error(sum_df_clean):
+        run_error = sum_df_clean["run factor error"]
+
+        run_error = run_error.unstack()
+
+        run_error = run_error.T.fillna(False).astype(bool)  # null indicate no exception
+
+        succ_rate = ~run_error
+        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
+        # make it display in a percentage rate
+        # succ_rate["success rate"] = succ_rate["success rate"].map(lambda x: f"{x:.2%}")
+        return succ_rate
+
+    succ_rate = get_run_error(sum_df_clean)
+
+    def reformat_succ_rate(display_df):
+        """You may get dataframe like this:
+
+                                    success rate
+        250-day_high_distance             80.00%
+        Corr_Close_Turnover               20.00%
+        EP_TTM                            20.00%
+        High_Frequency_Skewness           60.00%
+        Momentum                          50.00%
+        Morning_30-min_Return             30.00%
+        UID                                0.00%
+        Weighted_Earnings_Frequency       10.00%
+        """
+        index_map = {
+            # =========================research benchmark===========================
+            "One_Month_Volatility": ("One_Month_Volatility", "Volume&Price", "Easy"),
+            "Vol20": ("Vol20", "Volume&Price", "Medium"),
+            "Alpha#70": ("Alpha#70", "Volume&Price", "Hard"),
+            "DailyRDvar": ("DailyRDvar", "High-Frequency", "Easy"),
+            "AdjRDvar": ("AdjRDvar", "High-Frequency", "Medium"),
+            "AdjRDskew": ("AdjRDskew", "High-Frequency", "Hard"),
+            "PEG": ("PEG", "Fundamentals", "Easy"),
+            "Turnover_STD_1M": ("Turnover_STD_1M", "Fundamentals", "Medium"),
+            "turnover_correlation_with_price": (
+                "turnover_correlation_with_price",
+                "Fundamentals",
+                "Hard",
+            ),
+            "minute_pv_corr": ("minute_pv_corr", "High-Frequency", "New Discovery"),
+            "Liquidity_Factor": ("Liquidity_Factor", "Fundamentals", "New Discovery"),
+            # =========================project benchmark===========================
+            "250-day_high_distance": ("250-day_high_distance", "Volume&Price", "Medium"),
+            "Corr_Close_Turnover": ("Corr_Close_Turnover", "Fundamentals", "Easy"),
+            "EP_TTM": ("EP_TTM", "Fundamentals", "Medium"),
+            "High_Frequency_Skewness": ("High_Frequency_Skewness", "High-Frequency", "Easy"),
+            "Momentum": ("Momentum", "Volume&Price", "Easy"),
+            "Morning_30-min_Return": ("Morning_30-min_Return", "High-Frequency", "Medium"),
+            "UID": ("UID", "High-Frequency", "Hard"),
+            "Weighted_Earnings_Frequency": ("Weighted_Earnings_Frequency", "Volume&Price", "Hard"),
+            "Turnover_Rate_Factor": ("Turnover_Rate_Factor", "Fundamentals", "New Discovery"),
+            "PctTurn20": ("PctTurn20", "Volume&Price", "New Discovery"),
+            "PB_ROE": ("PB_ROE", "Fundamentals", "New Discovery"),
+        }
+
+        new_idx = []
+        display_df = display_df[display_df.index.isin(index_map.keys())]
+        # display_df = display_df.reindex(index_map.keys())
+        for idx in display_df.index:
+            new_idx.append(index_map[idx])
+        display_df.index = pd.MultiIndex.from_tuples(
+            new_idx,
+            names=["Factor", "Category", "Difficulty"],
+        )
+
+        display_df = display_df.swaplevel(0, 2).swaplevel(0, 1).sort_index(axis=0)
+
+        def sort_key_func(x):
+            order_v = []
+            for i in x:
+                order_v.append({"Easy": 0, "Medium": 1, "Hard": 2, "New Discovery": 3}.get(i, i))
+            return order_v
+
+        return display_df.sort_index(key=sort_key_func)
+
+    succ_rate_f = reformat_succ_rate(succ_rate)
+    succ_rate_f
+
+    sum_df_clean["FactorRowCountEvaluator"]
+
+    def get_run_error(eval_series):
+        eval_series = eval_series.unstack()
+
+        succ_rate = eval_series.T.fillna(False).astype(bool)  # false indicate failure
+
+        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
+        # make it display in a percentage rate
+        # succ_rate["success rate"] = succ_rate["success rate"].map(lambda x: f"{x:.2%}")
+        return succ_rate
+
+    format_issue = (
+        sum_df_clean["FactorRowCountEvaluator"] & sum_df_clean["FactorIndexEvaluator"]
+    )
+
+    format_succ_rate = get_run_error(format_issue)
+    format_succ_rate_f = reformat_succ_rate(format_succ_rate)
+
+    corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+
+    corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
+    corr_res = reformat_succ_rate(corr)
+
+    corr_max = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+
+    corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
+    corr_max_res = reformat_succ_rate(corr_max)
+
+    value_max = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
+    value_max = value_max.unstack().T.max(axis=0).to_frame("max_value")
+    value_max_res = reformat_succ_rate(value_max)
+
+    value_avg = (
+        (sum_df_clean["FactorMissingValuesEvaluator"] * format_issue)
+        .unstack()
+        .T.mean(axis=0)
+        .to_frame("avg_value")
+    )
+    value_avg_res = reformat_succ_rate(value_avg)
+
+    result_all = pd.concat(
+        {
+            "avg. Correlation (value only)": corr_res.iloc[:, 0],
+            "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
+            "avg. Run successful rate": succ_rate_f.iloc[:, 0],
+            "max. Correlation": corr_max_res.iloc[:, 0],
+            "max. accuracy": value_max_res.iloc[:, 0],
+            "avg. accuracy": value_avg_res.iloc[:, 0],
+        },
+        axis=1,
+    )
+
+    def result_all_key_order(x):
+        order_v = []
+        for i in x:
+            order_v.append(
+                {
+                    "avg. Run successful rate": 0,
+                    "avg. Format successful rate": 1,
+                    "avg. Correlation (value only)": 2,
+                    "max. Correlation": 3,
+                    "max. accuracy": 4,
+                    "avg. accuracy": 5,
+                }.get(i, i),
+            )
+        return order_v
+
+    df = result_all.sort_index(axis=1, key=result_all_key_order)
+    print(df)
+
+    df["Selected Correlation"] = [0, 0, 0] # @TODO
+
+    # Calculate the mean of each column
+    mean_values = df.fillna(0.0).mean()
+    mean_df = pd.DataFrame(mean_values).T
+
+    # TODO: set it as multi-index
+    # Assign the MultiIndex to the DataFrame
+    mean_df.index = pd.MultiIndex.from_tuples([("-", "-", "Average")], names=["Factor", "Category", "Difficulty"])
+
+    # Append the mean values to the end of the dataframe
+    df_w_mean = pd.concat([df, mean_df]).astype("float")
+
+    return df_w_mean
+
+
+def display_df(df):
+    # This depends on jupyter
+    def _single_formatter(column):
+        if column.endswith("rate"):
+
+            def _f(x):
+                return "{:.2%}".format(x) if pd.notnull(x) else "-"
+
+        else:
+
+            def _f(x):
+                return "{:.4}".format(x) if pd.notnull(x) else "-"
+
+        return _f
+
+    def get_formatters():
+        # Show NaN or None as '-'.  Don't convert the value to string
+        fmts = {column: _single_formatter(column) for column in df.columns}
+        return fmts
+
+    # TODO display
+    df_w_mean.drop(["max. accuracy", "avg. accuracy"], axis=1).style.format(get_formatters()).background_gradient(
+        axis=0, vmax=1, vmin=0, cmap=__import__("seaborn").light_palette("green", as_cmap=True)
+    )
+
+
+final_res = {}
+for k, p in results.items():
+    df = load_data(p)
+    print(df)
+    final_res[k] = df.iloc[-1, :]
+
+final_res = pd.DataFrame(final_res)
+
+
+# TODO plot it with seaborn and save it as a file
+final_res.drop(["max. accuracy", "avg. accuracy"], axis=0).T
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+plt.rcParams["axes.unicode_minus"] = False
+
+
+def change_fs(font_size):
+    font_size = font_size
+    plt.rc("font", size=font_size)  # controls default text sizes
+    plt.rc("axes", titlesize=font_size)  # fontsize of the axes title
+    plt.rc("axes", labelsize=font_size)  # fontsize of the x and y labels
+    plt.rc("xtick", labelsize=font_size)  # fontsize of the tick labels
+    plt.rc("ytick", labelsize=font_size)  # fontsize of the tick labels
+    plt.rc("legend", fontsize=font_size)  # legend fontsize
+    plt.rc("figure", titlesize=font_size)  # fontsize of the figure title
+
+
+change_fs(20)
+
+
+# Prepare the data for plotting
+plot_data = final_res.drop(["max. accuracy", "avg. accuracy"], axis=0).T
+plot_data = plot_data.reset_index().melt("index", var_name="a", value_name="b")
+
+# Create the plot
+plt.figure(figsize=(10, 6))
+sns.barplot(x="index", y="b", hue="a", data=plot_data)
+
+# Set the labels and title
+plt.xlabel("Method")
+plt.ylabel("Value")
+plt.title("Comparison of Different Methods")
+
+# Save the plot as a file
+plt.savefig("comparison_plot.png")
diff --git a/rdagent/app/quant_factor_benchmark/design/__init__.py b/rdagent/app/quant_factor_benchmark/design/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/rdagent/app/quant_factor_benchmark/design/benchmark.py b/rdagent/app/quant_factor_benchmark/design/benchmark.py
new file mode 100644
index 00000000..084a52e0
--- /dev/null
+++ b/rdagent/app/quant_factor_benchmark/design/benchmark.py
@@ -0,0 +1,72 @@
+from collections import defaultdict
+from copy import deepcopy
+from pathlib import Path
+from typing import Dict, List, Tuple, Union
+
+import pandas as pd
+from tqdm import tqdm
+
+from rdagent.core.exception import RunnerException
+from rdagent.components.coder.factor_coder.CoSTEER.evaluators import (
+    FactorCorrelationEvaluator,
+    FactorEqualValueCountEvaluator,
+    FactorEvaluator,
+    FactorIndexEvaluator,
+    FactorMissingValuesEvaluator,
+    FactorOutputFormatEvaluator,
+    FactorRowCountEvaluator,
+    FactorSingleColumnEvaluator,
+)
+
+
+"""
+Define EVAL_RES_ONLINE with an example
+{
+    <factor_name: str>: [
+        (
+            <FactorEvaluator: object>,
+            if successfully run(call `.execute()`):
+                [
+                    (Evaluator,
+                        if successfully evaluate it:
+                            (feedback, metric),
+                        else:
+                            EvaluationException
+                    ),
+                    ... other evaluators ...
+                ]
+            else:
+                <Run Exception>
+        )
+        ... more similar tuples ...
+    ]
+}
+"""
+EVAL_RES_ONLINE = Dict[
+    str,
+    List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
+]
+
+def summarize_res(res: EVAL_RES_ONLINE) -> pd.DataFrame:
+    # None: indicate that it raises exception and get no results
+    sum_res = {}
+    for factor_name, runs in res.items():
+        for fi, err_or_res_l in runs:
+            # NOTE:  str(fi) may not be unique!!  Because the workspace can be skipped when hitting the cache.
+            uniq_key = f"{str(fi)},{id(fi)}"
+
+            key = (factor_name, uniq_key)
+            val = {}
+            if isinstance(err_or_res_l, Exception):
+                val["run factor error"] = str(err_or_res_l.__class__)
+            else:
+                val["run factor error"] = None
+                for ev_obj, err_or_res in err_or_res_l:
+                    if isinstance(err_or_res, Exception):
+                        val[str(ev_obj)] = None
+                    else:
+                        feedback, metric = err_or_res
+                        val[str(ev_obj)] = metric
+            sum_res[key] = val
+
+    return pd.DataFrame(sum_res)

From b9d9194725c8040d8aa2cede9c8335b3a5619195 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Wed, 24 Jul 2024 08:10:09 +0000
Subject: [PATCH 22/40] Optimized log output.

---
 rdagent/app/qlib_rd_loop/factor_from_report_sh.py         | 2 +-
 .../scenarios/qlib/factor_experiment_loader/pdf_loader.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
index 94396ede..c8cc57e4 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -91,7 +91,7 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
 
         with logger.tag("load_pdf_screenshot"):
             pdf_screenshot = extract_first_page_screenshot_from_pdf(report_file_path)
-            logger.log_object(pdf_screenshot, tag="load_pdf_screenshot")
+            logger.log_object(pdf_screenshot)
 
     docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))
 
diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
index 5a265b4f..d3d4f471 100644
--- a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
+++ b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
@@ -509,21 +509,21 @@ class FactorExperimentLoaderFromPDFfiles(FactorExperimentLoader):
     def load(self, file_or_folder_path: Path) -> dict:
         with logger.tag("docs"):
             docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))
-            logger.log_object(docs_dict, tag="docs dict")
+            logger.log_object(docs_dict)
 
         selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1)
         
         with logger.tag("file_to_factor_result"):
             file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
-            logger.log_object(file_to_factor_result, tag="file_to_factor_result")
+            logger.log_object(file_to_factor_result)
         
         with logger.tag("factor_dict"):
             factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)
-            logger.log_object(factor_dict, tag="factor_dict")
+            logger.log_object(factor_dict)
 
         with logger.tag("filtered_factor_dict"):
             factor_viability, filtered_factor_dict = check_factor_viability(factor_dict)
-            logger.log_object(filtered_factor_dict, tag="filtered_factor_dict")
+            logger.log_object(filtered_factor_dict)
 
         # factor_dict, duplication_names_list = deduplicate_factors_by_llm(factor_dict, factor_viability)
         

From db82b67bebafa9fcfad466a12703308381ff44d2 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Wed, 24 Jul 2024 08:27:06 +0000
Subject: [PATCH 23/40] Factor update

---
 .../app/quant_factor_benchmark/analysis.py    | 44 ++++---------
 rdagent/components/benchmark/analysis.py      | 63 -------------------
 rdagent/components/benchmark/example.json     |  6 ++
 3 files changed, 18 insertions(+), 95 deletions(-)
 delete mode 100644 rdagent/components/benchmark/analysis.py

diff --git a/rdagent/app/quant_factor_benchmark/analysis.py b/rdagent/app/quant_factor_benchmark/analysis.py
index 7dd94cc9..81a7c4c9 100644
--- a/rdagent/app/quant_factor_benchmark/analysis.py
+++ b/rdagent/app/quant_factor_benchmark/analysis.py
@@ -1,15 +1,27 @@
 
+import json
 from pathlib import Path
 import pickle
 import pandas as pd
 
 from rdagent.app.quant_factor_benchmark.design.benchmark import summarize_res as summarize_res
+from rdagent.components.benchmark.conf import BenchmarkSettings
 
 
 results = {
     "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
 }
 
+# Get index map from the json file
+index_map = {}
+bs = BenchmarkSettings()
+def load(json_file_path: Path) -> None:
+    with open(json_file_path, "r") as file:
+        factor_dict = json.load(file)
+    for factor_name, factor_data in factor_dict.items():
+        index_map[factor_name] = (factor_name, factor_data["Category"], factor_data["Difficulty"])
+
+load(bs.bench_data_path)
 
 def load_data(file_path):
     file_path = Path(file_path)
@@ -74,36 +86,6 @@ def reformat_succ_rate(display_df):
         UID                                0.00%
         Weighted_Earnings_Frequency       10.00%
         """
-        index_map = {
-            # =========================research benchmark===========================
-            "One_Month_Volatility": ("One_Month_Volatility", "Volume&Price", "Easy"),
-            "Vol20": ("Vol20", "Volume&Price", "Medium"),
-            "Alpha#70": ("Alpha#70", "Volume&Price", "Hard"),
-            "DailyRDvar": ("DailyRDvar", "High-Frequency", "Easy"),
-            "AdjRDvar": ("AdjRDvar", "High-Frequency", "Medium"),
-            "AdjRDskew": ("AdjRDskew", "High-Frequency", "Hard"),
-            "PEG": ("PEG", "Fundamentals", "Easy"),
-            "Turnover_STD_1M": ("Turnover_STD_1M", "Fundamentals", "Medium"),
-            "turnover_correlation_with_price": (
-                "turnover_correlation_with_price",
-                "Fundamentals",
-                "Hard",
-            ),
-            "minute_pv_corr": ("minute_pv_corr", "High-Frequency", "New Discovery"),
-            "Liquidity_Factor": ("Liquidity_Factor", "Fundamentals", "New Discovery"),
-            # =========================project benchmark===========================
-            "250-day_high_distance": ("250-day_high_distance", "Volume&Price", "Medium"),
-            "Corr_Close_Turnover": ("Corr_Close_Turnover", "Fundamentals", "Easy"),
-            "EP_TTM": ("EP_TTM", "Fundamentals", "Medium"),
-            "High_Frequency_Skewness": ("High_Frequency_Skewness", "High-Frequency", "Easy"),
-            "Momentum": ("Momentum", "Volume&Price", "Easy"),
-            "Morning_30-min_Return": ("Morning_30-min_Return", "High-Frequency", "Medium"),
-            "UID": ("UID", "High-Frequency", "Hard"),
-            "Weighted_Earnings_Frequency": ("Weighted_Earnings_Frequency", "Volume&Price", "Hard"),
-            "Turnover_Rate_Factor": ("Turnover_Rate_Factor", "Fundamentals", "New Discovery"),
-            "PctTurn20": ("PctTurn20", "Volume&Price", "New Discovery"),
-            "PB_ROE": ("PB_ROE", "Fundamentals", "New Discovery"),
-        }
 
         new_idx = []
         display_df = display_df[display_df.index.isin(index_map.keys())]
@@ -199,8 +181,6 @@ def result_all_key_order(x):
     df = result_all.sort_index(axis=1, key=result_all_key_order)
     print(df)
 
-    df["Selected Correlation"] = [0, 0, 0] # @TODO
-
     # Calculate the mean of each column
     mean_values = df.fillna(0.0).mean()
     mean_df = pd.DataFrame(mean_values).T
diff --git a/rdagent/components/benchmark/analysis.py b/rdagent/components/benchmark/analysis.py
deleted file mode 100644
index 1aedb7cc..00000000
--- a/rdagent/components/benchmark/analysis.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import pickle
-import os
-import pandas as pd
-import matplotlib.pyplot as plt
-
-# Function to load and process each pickle file
-def process_pickle_file(file_path):
-    try:
-        with open(file_path, 'rb') as file:
-            data = pickle.load(file)
-        # Assuming data is a DataFrame or similar
-        print(f"Data from {file_path} processed successfully.")
-        return data
-    except Exception as e:
-        print(f"Error processing {file_path}: {e}")
-        return None
-    
-def analysis(folder_path):
-    success_count = 0
-    fail_count = 0
-
-    # Logging the errors
-    error_log = open("error_log.log", "w")
-
-    # List to store data for visualization
-    data_frames = []
-
-    # Processing each file in the directory
-    for file_name in os.listdir(folder_path):
-        file_path = os.path.join(folder_path, file_name)
-        data = process_pickle_file(file_path)
-        if data is not None:
-            data_frames.append(data)
-
-    for df in data_frames:
-        if 'Execution succeeded' in df[0]:
-            success_count += 1
-        else:
-            fail_count += 1
-            error_log.write(f"{file_path}: \n{df[0]}\n")
-
-    # Writing summary
-    print(f"Number of successful files: {success_count}")
-    print(f"Number of failed files: {fail_count}")
-
-    # Closing the error log file
-    error_log.close()
-
-def view_pickle_file(folder_path):
-    for file_name in os.listdir(folder_path):
-        file_path = os.path.join(folder_path, file_name)
-
-        print(f'the path of this file is: {file_path}\n')
-        with open(file_path, 'rb') as file:
-            data = pickle.load(file)
-            for i in range(len(data)):
-                print(data[i])
-
-
-if __name__ == '__main__':
-    folder_path = '/data/userdata/v-taozhiwang/RD-Agent/git_ignore_folder/factor_implementation_execution_cache'
-    
-    analysis(folder_path)
\ No newline at end of file
diff --git a/rdagent/components/benchmark/example.json b/rdagent/components/benchmark/example.json
index f1e5a85c..742927da 100644
--- a/rdagent/components/benchmark/example.json
+++ b/rdagent/components/benchmark/example.json
@@ -6,6 +6,8 @@
             "20-day turnover rate": "Average turnover rate over the past 20 days.",
             "Market Capitalization": "Total market value of a company's outstanding shares."
         },
+        "Category": "Fundamentals",
+        "Difficulty": "Easy",
         "gt_code": "import pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\nwindow_size = 20\n\nnominator=data.groupby('instrument')[['TurnoverRate_30D']].rolling(window=window_size).mean().reset_index(0, drop=True)\n# transfer to series\nnew=nominator['TurnoverRate_30D']\ndata['Turnover_Rate_Factor']=new/data['TradableACapital']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['Turnover_Rate_Factor']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['Turnover_Rate_Factor']\nresult.to_hdf(\"result.h5\", key=\"data\")" 
     },
     "PctTurn20": {
@@ -16,6 +18,8 @@
             "Turnover_{i, t}": "Turnover of stock i at day t.",
             "Turnover_{i, t-20}": "Turnover of stock i at day t-20."
         },
+        "Category": "Volume&Price",
+        "Difficulty": "Medium",
         "gt_code": "import pandas as pd\nfrom statsmodels import api as sm\n\ndef fill_mean(s: pd.Series) -> pd.Series:\n    return s.fillna(s.mean()).fillna(0.0)\n\ndef market_value_neutralize(s: pd.Series, mv: pd.Series) -> pd.Series:\n    s = s.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n    mv = mv.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n\n    df_f = mv.to_frame(\"MarketValue\")\n    df_f[\"const\"] = 1\n    X = df_f[[\"MarketValue\", \"const\"]]\n\n    # Perform the Ordinary Least Squares (OLS) regression\n    model = sm.OLS(s, X)\n    results = model.fit()\n\n    # Calculate the residuals\n    df_f[\"residual\"] = results.resid\n    df_f[\"norm_resi\"] = df_f.groupby(level=\"datetime\", group_keys=False)[\"residual\"].apply(\n        lambda x: (x - x.mean()) / x.std(),\n    )\n    return df_f[\"norm_resi\"]\n\n\n# get_turnover\ndf_pv = pd.read_hdf(\"daily_pv.h5\", key=\"data\")\ndf_f = pd.read_hdf(\"daily_f.h5\", key=\"data\")\nturnover = df_pv[\"$money\"] / df_f[\"TradableMarketValue\"]\n\nf = turnover.groupby(\"instrument\").pct_change(periods=20)\n\nf_neutralized = market_value_neutralize(f, df_f[\"TradableMarketValue\"])\n\nf_neutralized.to_hdf(\"result.h5\", key=\"data\")"
     },
     "PB_ROE": {
@@ -25,6 +29,8 @@
             "\\text{rank}(PB_t)": "Ranking PB on cross-section at time t.",
             "\\text{rank}(ROE_t)": "Ranking single-quarter ROE on cross-section at time t."
         },
+        "Category": "High-Frequency",
+        "Difficulty": "Hard",
         "gt_code": "#!/usr/bin/env python\n\nimport pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\n\n# Calculate the rank of PB and ROE\ndata['PB_rank'] = data.groupby('datetime')['B/P'].rank()\ndata['ROE_rank'] = data.groupby('datetime')['ROE'].rank()\n\n# Calculate the difference between the ranks\ndata['PB_ROE'] = data['PB_rank'] - data['ROE_rank']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['PB_ROE']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['PB_ROE']\nresult.to_hdf(\"result.h5\", key=\"data\")"
     }
 }
\ No newline at end of file

From 265b6b3b56658ac72b926f2fcf2e7832d6f0d7af Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Wed, 24 Jul 2024 09:10:36 +0000
Subject: [PATCH 24/40] A draft of the "Quick Start" section for README

---
 README.md | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f007e96d..0f353556 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 TODO: Add badges.
 
-# News
+# 📰 News
 | 🗞️News        | 📝Description                 |
 | --            | ------                        |
 | First release | RDAgent are release on Github |
 
 
-# Introduction
+# 🌟 Introduction
 
 ![](docs/_static/scen.jpg)
 
@@ -31,6 +31,65 @@ We have a quick 🎥demo for one use case of RDAgent.
 # ⚡Quick start
 You can try our demo by running the following command:
 
+### 🐍 Create a Conda Environment
+- Create a new conda environment with Python 3.10:
+  ```sh
+  conda create -n rdagent python=3.10
+  ```
+- Activate the environment:
+  ```sh
+  conda activate rdagent
+  ```
+
+### 🛠️ Run Make Files
+- **Navigate to the directory containing the MakeFile** and set up the development environment:
+  ```sh
+  make dev
+  ```
+
+### 📦 Install Pytorch
+- Install Pytorch and related libraries:
+  ```sh
+  pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+  pip3 install torch_geometric
+  ```
+
+### ⚙️ Environment Configuration
+- Place the `.env` file in the same directory as the `.env.example` file.
+- Export each variable in the `.env` file:
+  ```sh
+  export $(grep -v '^#' .env | xargs)
+  ```
+
+### ☁️ Azure Configuration
+- Install Azure CLI:
+  ```sh
+  curl -L https://aka.ms/InstallAzureCli | bash
+  ```
+- Log in to Azure:
+  ```sh
+  az login --use-device-code
+  ```
+
+- `exit` and re-login to your environment (this step may not be necessary).
+
+### 🚀 Run the Application
+- Run the factor extraction and implementation application based on financial reports:
+  ```sh
+  python rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+  ```
+
+- Run the self-loop factor extraction and implementation application:
+  ```sh
+  python rdagent/app/qlib_rd_loop/factor.py
+  ```
+
+- Run the self-loop model extraction and implementation application:
+  ```sh
+  python rdagent/app/qlib_rd_loop/model.py
+  ```
+
+
 ```bash
 # TODO:
 # prepare environment

From 68f0a75b53dffbfe0192fa3df29991fd047df27e Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Wed, 24 Jul 2024 10:16:46 +0000
Subject: [PATCH 25/40] Add scenario descriptions.

---
 README.md                         |  6 +--
 docs/scenarios_and_quickstart.rst | 82 ++++++++++++++++++++++++++++++-
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 0f353556..ee9c3513 100644
--- a/README.md
+++ b/README.md
@@ -119,9 +119,9 @@ Here is our supported scenarios
 
 | Scenario/Target | Model Implementation                   | Data Building                                                                      |
 | --              | --                                     | --                                                                                 |
-| 💹Finance       | Iteratively Proposing Ideas & Evolving | Auto reports reading & implementation <br/> Iteratively Proposing Ideas & Evolving |
-| 🩺Medical       | Iteratively Proposing Ideas & Evolving | -                                                                                  |
-| General         | Auto paper reading & implementation    | -                                                                                  |
+| 💹 Finance       | Iteratively Proposing Ideas & Evolving | Auto reports reading & implementation <br/> Iteratively Proposing Ideas & Evolving |
+| 🩺 Medical       | Iteratively Proposing Ideas & Evolving | -                                                                                  |
+| 🏭 General         | Auto paper reading & implementation    | -                                                                                  |
 
 Different scenarios vary in entrance and configuration. Please check the detailed setup tutorial in the scenarios documents.
 
diff --git a/docs/scenarios_and_quickstart.rst b/docs/scenarios_and_quickstart.rst
index cdc613b6..e264cb24 100644
--- a/docs/scenarios_and_quickstart.rst
+++ b/docs/scenarios_and_quickstart.rst
@@ -5,29 +5,107 @@ Scenarios and Quick Start
 Scenario lists
 =========================
 
-TODO: Copy the content in the README.md
+.. list-table:: 
+   :header-rows: 1
 
+   * - Scenario/Target
+     - Model Implementation
+     - Data Building
+   * - 💹 Finance
+     - Iteratively Proposing Ideas & Evolving
+     - | Auto reports reading & implementation
+       | Iteratively Proposing Ideas & Evolving
+   * - 🩺 Medical
+     - Iteratively Proposing Ideas & Evolving
+     - 
+   * - 🏭 General
+     - Auto paper reading & implementation
+     - 
 
 Scnarios' demo & quick start
 =========================
 
 Scen1
 -----
+🤖 Knowledge-Based Hypothesis Generation and Iteration
 
 Scen1 Intro
 ~~~~~~~~~~~
+In this scenario, our model autonomously generates and tests hypotheses using a knowledge base. The process involves:
+
+- **🔍 Hypothesis Generation**: The model proposes new hypotheses.
+- **📝 Factor Creation**: Write and define new factors.
+- **✅ Factor Validation**: Validate the factors quantitatively.
+- **📈 Backtesting with Qlib**: 
+
+  - **Dataset**: CSI300
+  - **Model**: LGBModel
+  - **Factors**: Alpha158 +
+  - **Data Split**:
+
+    - **Train**: 2008-01-01 to 2014-12-31
+    - **Valid**: 2015-01-01 to 2016-12-31
+    - **Test**: 2017-01-01 to 2020-08-01
+- **🔄 Feedback Analysis**: Analyze backtest results.
+- **🔧 Hypothesis Refinement**: Refine hypotheses based on feedback and repeat.
 
 Scen1 Demo
 ~~~~~~~~~~
+.. TODO
 
 Scen1 Quick Start
 ~~~~~~~~~~~~~~~~~
 
+To quickly start the factor extraction process, run the following command in your terminal within the 'rdagent' virtual environment:
 
+.. code-block:: sh
 
+    python rdagent/app/qlib_rd_loop/factor.py
 
 
 Usage of modules
-================
+~~~~~~~~~~~~~~~~~
 TODO: Show some examples:
 
+
+Scen2
+-----
+📄 Research Report-Based Factor Extraction
+
+Scen2 Intro
+~~~~~~~~~~~
+In this scenario, factors and hypotheses are extracted from research reports. The process includes:
+
+- **🔍 Factor Extraction**: Extract relevant factors from research reports.
+- **📝 Factor Creation**: Define these extracted factors.
+- **✅ Factor Validation**: Validate the extracted factors.
+- **📈 Backtesting with Qlib**: 
+
+  - **Dataset**: CSI300
+  - **Model**: LGBModel
+  - **Factors**: Alpha158 +
+  - **Data Split**:
+
+    - **Train**: 2008-01-01 to 2014-12-31
+    - **Valid**: 2015-01-01 to 2016-12-31
+    - **Test**: 2017-01-01 to 2020-08-01
+- **🔄 Feedback Analysis**: Analyze backtest results.
+- **🔧 Hypothesis Refinement**: Refine hypotheses based on feedback and continue the cycle.
+
+Scen2 Demo
+~~~~~~~~~~
+.. TODO
+
+Scen2 Quick Start
+~~~~~~~~~~~~~~~~~
+
+To quickly start the factor extraction process, run the following command in your terminal within the 'rdagent' virtual environment:
+
+.. code-block:: sh
+
+    python rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+
+
+Usage of modules
+~~~~~~~~~~~~~~~~~
+TODO: Show some examples:
\ No newline at end of file

From 52dc9385dab781492b4b4e2513011d21e93c6d33 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Thu, 25 Jul 2024 02:52:39 +0000
Subject: [PATCH 26/40] Updates

---
 .../app/quant_factor_benchmark/analysis.py    |  4 +-
 .../quant_factor_benchmark/design/__init__.py |  0
 .../design/benchmark.py                       | 72 -------------------
 rdagent/app/quant_factor_benchmark/eval.py    | 28 +-------
 rdagent/components/benchmark/conf.py          |  5 +-
 rdagent/components/benchmark/eval_method.py   | 36 +++++++++-
 .../components/coder/factor_coder/config.py   |  2 +-
 7 files changed, 43 insertions(+), 104 deletions(-)
 delete mode 100644 rdagent/app/quant_factor_benchmark/design/__init__.py
 delete mode 100644 rdagent/app/quant_factor_benchmark/design/benchmark.py

diff --git a/rdagent/app/quant_factor_benchmark/analysis.py b/rdagent/app/quant_factor_benchmark/analysis.py
index 81a7c4c9..6e6a20ff 100644
--- a/rdagent/app/quant_factor_benchmark/analysis.py
+++ b/rdagent/app/quant_factor_benchmark/analysis.py
@@ -4,12 +4,12 @@
 import pickle
 import pandas as pd
 
-from rdagent.app.quant_factor_benchmark.design.benchmark import summarize_res as summarize_res
+from rdagent.components.benchmark.eval_method import summarize_res as summarize_res
 from rdagent.components.benchmark.conf import BenchmarkSettings
 
 
 results = {
-    "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
+    "1 round experiment": "log/2024-07-25_02-32-24-552766/1445941/2024-07-25_02-48-07-756153.pkl",
 }
 
 # Get index map from the json file
diff --git a/rdagent/app/quant_factor_benchmark/design/__init__.py b/rdagent/app/quant_factor_benchmark/design/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/rdagent/app/quant_factor_benchmark/design/benchmark.py b/rdagent/app/quant_factor_benchmark/design/benchmark.py
deleted file mode 100644
index 084a52e0..00000000
--- a/rdagent/app/quant_factor_benchmark/design/benchmark.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from collections import defaultdict
-from copy import deepcopy
-from pathlib import Path
-from typing import Dict, List, Tuple, Union
-
-import pandas as pd
-from tqdm import tqdm
-
-from rdagent.core.exception import RunnerException
-from rdagent.components.coder.factor_coder.CoSTEER.evaluators import (
-    FactorCorrelationEvaluator,
-    FactorEqualValueCountEvaluator,
-    FactorEvaluator,
-    FactorIndexEvaluator,
-    FactorMissingValuesEvaluator,
-    FactorOutputFormatEvaluator,
-    FactorRowCountEvaluator,
-    FactorSingleColumnEvaluator,
-)
-
-
-"""
-Define EVAL_RES_ONLINE with an example
-{
-    <factor_name: str>: [
-        (
-            <FactorEvaluator: object>,
-            if successfully run(call `.execute()`):
-                [
-                    (Evaluator,
-                        if successfully evaluate it:
-                            (feedback, metric),
-                        else:
-                            EvaluationException
-                    ),
-                    ... other evaluators ...
-                ]
-            else:
-                <Run Exception>
-        )
-        ... more similar tuples ...
-    ]
-}
-"""
-EVAL_RES_ONLINE = Dict[
-    str,
-    List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
-]
-
-def summarize_res(res: EVAL_RES_ONLINE) -> pd.DataFrame:
-    # None: indicate that it raises exception and get no results
-    sum_res = {}
-    for factor_name, runs in res.items():
-        for fi, err_or_res_l in runs:
-            # NOTE:  str(fi) may not be unique!!  Because the workspace can be skipped when hitting the cache.
-            uniq_key = f"{str(fi)},{id(fi)}"
-
-            key = (factor_name, uniq_key)
-            val = {}
-            if isinstance(err_or_res_l, Exception):
-                val["run factor error"] = str(err_or_res_l.__class__)
-            else:
-                val["run factor error"] = None
-                for ev_obj, err_or_res in err_or_res_l:
-                    if isinstance(err_or_res, Exception):
-                        val[str(ev_obj)] = None
-                    else:
-                        feedback, metric = err_or_res
-                        val[str(ev_obj)] = metric
-            sum_res[key] = val
-
-    return pd.DataFrame(sum_res)
diff --git a/rdagent/app/quant_factor_benchmark/eval.py b/rdagent/app/quant_factor_benchmark/eval.py
index 5924401c..a22f928f 100644
--- a/rdagent/app/quant_factor_benchmark/eval.py
+++ b/rdagent/app/quant_factor_benchmark/eval.py
@@ -3,6 +3,7 @@
 import pickle
 import time
 from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.log import rdagent_logger as logger
 from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
     FactorTestCaseLoaderFromJsonFile,
 )
@@ -41,29 +42,4 @@
 res = eval_method.eval()
 
 # 6.save the result
-pprint(res)
-
-res_workspace = (Path().cwd() / "git_ignore_folder" / "eval_results").absolute()
-print(str(res_workspace))
-
-# Save results
-timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
-
-if not os.path.exists(str(res_workspace)):
-    os.makedirs(str(res_workspace))
-
-df_file_path = res_workspace / ("result_" + timestamp + ".csv")
-res_pkl_path = res_workspace / ("res_promptV2" + timestamp + ".pkl")
-res_pkl_path = res_workspace / ("res_promptV2" + timestamp + ".pkl")
-with open(str(res_pkl_path), "wb") as file:
-    # file.write(str(res))
-    pickle.dump(res, file)
-
-# TODO:
-# - Run it:
-# - factor input data generator;
-#   - f_{gt}(input) => value_{gt}
-#   - f_{llm}(input) => value_{llm}
-#   - we have legal issue to release Input
-# - Eval result:
-#   -  check https://github.com/peteryang1/fincov2/blob/master/src/scripts/benchmark/analysis.py
+logger.log_object(res)
diff --git a/rdagent/components/benchmark/conf.py b/rdagent/components/benchmark/conf.py
index cefd6391..a05bbe39 100644
--- a/rdagent/components/benchmark/conf.py
+++ b/rdagent/components/benchmark/conf.py
@@ -11,11 +11,14 @@
 
 
 class BenchmarkSettings(BaseSettings):
+    class Config:
+        env_prefix = "BENCHMARK_"  # Use BENCHMARK_ as prefix for environment variables
+
     ground_truth_dir: Path = DIRNAME / "ground_truth"
 
     bench_data_path: Path = DIRNAME / "example.json"
 
-    bench_test_round: int = 1
+    bench_test_round: int = 10
     bench_test_case_n: Optional[int] = None  # how many test cases to run; If not given, all test cases will be run
 
     bench_method_cls: str = "rdagent.components.coder.factor_coder.CoSTEER.FactorCoSTEER"
diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py
index b45451eb..4d96afd3 100644
--- a/rdagent/components/benchmark/eval_method.py
+++ b/rdagent/components/benchmark/eval_method.py
@@ -1,7 +1,8 @@
 from collections import defaultdict
 from pathlib import Path
-from typing import List, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
+import pandas as pd
 from tqdm import tqdm
 
 from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
@@ -18,12 +19,17 @@
 from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.developer import Developer
-from rdagent.core.exception import CoderException
+from rdagent.core.exception import CoderException, RunnerException
 from rdagent.core.experiment import Task, Workspace
 from rdagent.core.scenario import Scenario
 from rdagent.core.utils import multiprocessing_wrapper
 
 
+EVAL_RES = Dict[
+    str,
+    List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
+]
+
 class TestCase:
     def __init__(
         self,
@@ -165,3 +171,29 @@ def eval(self):
             res[gt_case.target_task.factor_name].append((gen_factor, eval_res))
 
         return res
+
+    @staticmethod
+    def summarize_res(res: EVAL_RES) -> pd.DataFrame:
+        # None: indicate that it raises exception and get no results
+        sum_res = {}
+        for factor_name, runs in res.items():
+            for fi, err_or_res_l in runs:
+                # NOTE:  str(fi) may not be unique!!  Because the workspace can be skipped when hitting the cache.
+                uniq_key = f"{str(fi)},{id(fi)}"
+
+                key = (factor_name, uniq_key)
+                val = {}
+                if isinstance(err_or_res_l, Exception):
+                    val["run factor error"] = str(err_or_res_l.__class__)
+                else:
+                    val["run factor error"] = None
+                    for ev_obj, err_or_res in err_or_res_l:
+                        if isinstance(err_or_res, Exception):
+                            val[str(ev_obj)] = None
+                        else:
+                            feedback, metric = err_or_res
+                            val[str(ev_obj)] = metric
+                sum_res[key] = val
+
+        return pd.DataFrame(sum_res)
+
diff --git a/rdagent/components/coder/factor_coder/config.py b/rdagent/components/coder/factor_coder/config.py
index 9a62b4e6..cc3c301c 100644
--- a/rdagent/components/coder/factor_coder/config.py
+++ b/rdagent/components/coder/factor_coder/config.py
@@ -10,7 +10,7 @@ class FactorImplementSettings(BaseSettings):
     class Config:
         env_prefix = "FACTOR_CODER_"  # Use FACTOR_CODER_ as prefix for environment variables
 
-    coder_use_cache: bool = True
+    coder_use_cache: bool = False
     data_folder: str = str(
         (Path().cwd() / "git_ignore_folder" / "factor_implementation_source_data").absolute(),
     )

From 11980dc5059544a21e3faef688549bb20825b440 Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Thu, 25 Jul 2024 03:00:07 +0000
Subject: [PATCH 27/40] Adjust content

---
 README.md             | 32 ++++++++++----------------------
 docs/installation.rst | 24 ++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index ee9c3513..ef11a3a0 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ We have a quick 🎥demo for one use case of RDAgent.
 You can try our demo by running the following command:
 
 ### 🐍 Create a Conda Environment
-- Create a new conda environment with Python 3.10:
+- Create a new conda environment with Python (3.10 and 3.11 are well tested in our CI):
   ```sh
   conda create -n rdagent python=3.10
   ```
@@ -42,12 +42,16 @@ You can try our demo by running the following command:
   ```
 
 ### 🛠️ Run Make Files
+TODO: `pip install rdagent` in the future.
+
 - **Navigate to the directory containing the MakeFile** and set up the development environment:
   ```sh
   make dev
   ```
 
 ### 📦 Install Pytorch
+TODO: use docker in quick start instead.
+
 - Install Pytorch and related libraries:
   ```sh
   pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
@@ -56,24 +60,17 @@ You can try our demo by running the following command:
 
 ### ⚙️ Environment Configuration
 - Place the `.env` file in the same directory as the `.env.example` file.
+  - TODO: please refer to ... for the detailed explanation of the `.env`
+  - TODO: simplify `.env.example`; only keep OpenAI or Azure OpenAI
 - Export each variable in the `.env` file:
   ```sh
   export $(grep -v '^#' .env | xargs)
   ```
+### 🚀 Run the Application
+TODO: run the front-page demo.
 
-### ☁️ Azure Configuration
-- Install Azure CLI:
-  ```sh
-  curl -L https://aka.ms/InstallAzureCli | bash
-  ```
-- Log in to Azure:
-  ```sh
-  az login --use-device-code
-  ```
-
-- `exit` and re-login to your environment (this step may not be necessary).
+The [🎥demo]() is implemented by the above commands.
 
-### 🚀 Run the Application
 - Run the factor extraction and implementation application based on financial reports:
   ```sh
   python rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -90,15 +87,6 @@ You can try our demo by running the following command:
   ```
 
 
-```bash
-# TODO:
-# prepare environment
-# installation
-# App entrance
-```
-
-The [🎥demo]() is implemented by the above commands.
-
 # Scenarios
 
 We have applied RD-Agent to multiple valuable data-driven industrial scenarios..
diff --git a/docs/installation.rst b/docs/installation.rst
index 90a3f1a0..b4852be3 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -13,3 +13,27 @@ Configuration
 =============
 
 Quick configuration
+
+
+
+
+Azure OpenAI
+------------
+
+
+
+USE_AZURE_TOKEN_PROVIDER
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+**☁️ Azure Configuration**
+
+- Install Azure CLI::
+
+    curl -L https://aka.ms/InstallAzureCli | bash
+
+- Log in to Azure::
+
+    az login --use-device-code
+
+- ``exit`` and re-login to your environment (this step may not be necessary).
+

From 98906af3fea7e9d26ce5483fae36f89acdd452c4 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Thu, 25 Jul 2024 06:37:04 +0000
Subject: [PATCH 28/40] Enable logging of backtesting in Qlib and store
 rich-text descriptions in Trace. Support one-step debugging for factor
 extraction.

---
 rdagent/app/qlib_rd_loop/factor_w_sc.py       | 91 +++++++++++++++++++
 .../qlib/experiment/factor_experiment.py      |  4 +
 .../factor_template/read_exp_res.py           |  3 +
 .../scenarios/qlib/experiment/workspace.py    |  2 +-
 rdagent/scenarios/qlib/prompts.yaml           | 11 ++-
 5 files changed, 106 insertions(+), 5 deletions(-)
 create mode 100644 rdagent/app/qlib_rd_loop/factor_w_sc.py

diff --git a/rdagent/app/qlib_rd_loop/factor_w_sc.py b/rdagent/app/qlib_rd_loop/factor_w_sc.py
new file mode 100644
index 00000000..fe06e2d8
--- /dev/null
+++ b/rdagent/app/qlib_rd_loop/factor_w_sc.py
@@ -0,0 +1,91 @@
+"""
+Factor workflow with session control
+It is derived from `rdagent/app/qlib_rd_loop/factor.py` and is intended to replace `rdagent/app/qlib_rd_loop/RDAgent.py`
+"""
+
+from typing import Any
+from dotenv import load_dotenv
+import fire
+
+from rdagent.core.exception import FactorEmptyError
+from rdagent.core.scenario import Scenario
+from rdagent.log import rdagent_logger as logger
+
+load_dotenv(override=True)
+
+from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.core.developer import Developer
+from rdagent.core.proposal import (
+    Hypothesis2Experiment,
+    HypothesisExperiment2Feedback,
+    HypothesisGen,
+    Trace,
+)
+from rdagent.core.utils import import_class
+
+from rdagent.utils.workflow import LoopMeta, LoopBase
+
+class FactorLoop(LoopBase, metaclass=LoopMeta):
+
+    def __init__(self):
+        scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+
+        self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.factor_hypothesis_gen)(scen)
+
+        self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.factor_hypothesis2experiment)()
+
+        self.qlib_factor_coder: Developer = import_class(PROP_SETTING.factor_coder)(scen)
+        self.qlib_factor_runner: Developer = import_class(PROP_SETTING.factor_runner)(scen)
+
+        self.qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
+        self.trace = Trace(scen=scen)
+        super().__init__()
+    
+    def propose(self, prev_out: dict[str, Any]):
+        with logger.tag("r"):
+            hypothesis = self.hypothesis_gen.gen(self.trace)
+            logger.log_object(hypothesis, tag="hypothesis generation")
+        return hypothesis
+
+    def exp_gen(self, prev_out: dict[str, Any]):
+        with logger.tag("r"):
+            exp = self.hypothesis2experiment.convert(prev_out["propose"], self.trace)
+            logger.log_object(exp.sub_tasks, tag="experiment generation")
+        return exp
+
+    def coding(self, prev_out: dict[str, Any]):
+        with logger.tag("d"):
+            exp = self.qlib_factor_coder.develop(prev_out["exp_gen"])
+            logger.log_object(exp.sub_workspace_list, tag="factor coder result")
+        return exp
+
+    def running(self, prev_out: dict[str, Any]):
+        with logger.tag("ef"):
+            exp = self.qlib_factor_runner.develop(prev_out["coding"])
+            logger.log_object(exp, tag="factor runner result")
+        return exp
+
+    def feedback(self, prev_out: dict[str, Any]):
+        feedback = self.qlib_factor_summarizer.generate_feedback(prev_out["running"], prev_out["propose"], self.trace)
+        logger.log_object(feedback, tag="feedback")
+        self.trace.hist.append((prev_out["propose"], prev_out["running"], feedback))
+
+
+def main(path=None, step_n=None):
+    """
+    You can continue running a session by
+
+    .. code-block:: sh
+
+        dotenv run -- python rdagent/app/qlib_rd_loop/factor_w_sc.py $LOG_PATH/__session__/1/0_propose  --step_n 1   # `step_n` is an optional parameter
+
+    """
+    if path is None:
+        model_loop = FactorLoop()
+    else:
+        model_loop = FactorLoop.load(path)
+    model_loop.run(step_n=step_n)
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
\ No newline at end of file
diff --git a/rdagent/scenarios/qlib/experiment/factor_experiment.py b/rdagent/scenarios/qlib/experiment/factor_experiment.py
index b4821eb4..09e166d5 100644
--- a/rdagent/scenarios/qlib/experiment/factor_experiment.py
+++ b/rdagent/scenarios/qlib/experiment/factor_experiment.py
@@ -39,6 +39,10 @@ def interface(self) -> str:
     @property
     def simulator(self) -> str:
         return prompt_dict["qlib_factor_simulator"]
+    
+    @property
+    def rich_style_description(self)->str:
+        return "Below is QlibFactor Evolving Automatic R&D Demo."
 
     def get_scenario_all_desc(self) -> str:
         return f"""Background of the scenario:
diff --git a/rdagent/scenarios/qlib/experiment/factor_template/read_exp_res.py b/rdagent/scenarios/qlib/experiment/factor_template/read_exp_res.py
index 19930248..d08e3b4c 100644
--- a/rdagent/scenarios/qlib/experiment/factor_template/read_exp_res.py
+++ b/rdagent/scenarios/qlib/experiment/factor_template/read_exp_res.py
@@ -43,3 +43,6 @@
     metrics.to_csv(output_path)
 
     print(f"Output has been saved to {output_path}")
+
+    ret_data_frame = latest_recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
+    ret_data_frame.to_pickle("ret.pkl")
\ No newline at end of file
diff --git a/rdagent/scenarios/qlib/experiment/workspace.py b/rdagent/scenarios/qlib/experiment/workspace.py
index 00871440..f3e16e11 100644
--- a/rdagent/scenarios/qlib/experiment/workspace.py
+++ b/rdagent/scenarios/qlib/experiment/workspace.py
@@ -31,7 +31,7 @@ def execute(self, qlib_config_name: str = "conf.yaml", run_env: dict = {}, *args
         )
 
         ret_df = pd.read_pickle(self.workspace_path / "ret.pkl")
-        logger.log_object(ret_df, tag="returns")  # TODO: tag
+        logger.log_object(ret_df, tag="Quantitative Backtesting Chart")
 
         csv_path = self.workspace_path / "qlib_res.csv"
 
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
index 19814df6..139ebd57 100644
--- a/rdagent/scenarios/qlib/prompts.yaml
+++ b/rdagent/scenarios/qlib/prompts.yaml
@@ -59,7 +59,7 @@ factor_hypothesis_specification: |-
     - "Filter stocks by market capitalization before calculating the factors."
 
 factor_hypothesis_specification: |-
-  Additional Specifications:
+  Specifications:
     - Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple.
     - Gradually build upon previous hypotheses and feedback.
     - Ensure that the hypothesis focuses on the creation and selection of factors in quantitative finance.
@@ -94,7 +94,7 @@ factor_hypothesis_specification: |-
   - "Investigate the impact of an earnings surprise factor calculated from recent earnings announcements."
   - "Develop a composite factor integrating ESG (Environmental, Social, Governance) scores with traditional financial metrics."
 
-  Detailed Workflow:
+  Detailed Workflow: (These are just examples for the format, do not be constrained by these examples)
   1. Initial Hypothesis:
     - Begin with a simple factor, such as "Include a momentum factor based on the last 12 months' returns."
 
@@ -110,8 +110,11 @@ factor_hypothesis_specification: |-
   5. Advanced Hypotheses:
     - Explore sophisticated combinations or new types of factors, e.g., "Develop a composite factor integrating ESG scores with traditional financial metrics."
 
-  Remember: If a hypothesis achieves the desired results, start a new direction while preserving the effective factors from previous hypotheses. New evaluations should combine the newly proposed factors with previously successful factors that surpassed SOTA.
-
+  Important Note:
+  - If a hypothesis achieves the desired results, start a new direction while preserving the effective factors from previous hypotheses.
+  - New evaluations should combine the newly proposed factors with previously successful factors that surpassed SOTA.
+  - If the previous hypothesis factor exceeds SOTA, you can write another factor in the same direction, otherwise, explore new directions.
+  - The final maintained SOTA is the continuous accumulation of factors that surpass each iteration.
 
 factor_experiment_output_format: |-
   The output should follow JSON format. The schema is as follows:

From 702c830b8759cc6613618e9abbaed6de8d7ad9e6 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Thu, 25 Jul 2024 07:10:51 +0000
Subject: [PATCH 29/40] Reformat analysis.py

---
 .../app/quant_factor_benchmark/analysis.py    | 390 +++++++-----------
 1 file changed, 152 insertions(+), 238 deletions(-)

diff --git a/rdagent/app/quant_factor_benchmark/analysis.py b/rdagent/app/quant_factor_benchmark/analysis.py
index 6e6a20ff..75aea00d 100644
--- a/rdagent/app/quant_factor_benchmark/analysis.py
+++ b/rdagent/app/quant_factor_benchmark/analysis.py
@@ -1,169 +1,60 @@
-
 import json
-from pathlib import Path
 import pickle
 import pandas as pd
+from pathlib import Path
+import matplotlib.pyplot as plt
+import seaborn as sns
 
-from rdagent.components.benchmark.eval_method import summarize_res as summarize_res
+from rdagent.components.benchmark.eval_method import FactorImplementEval
 from rdagent.components.benchmark.conf import BenchmarkSettings
 
-
-results = {
-    "1 round experiment": "log/2024-07-25_02-32-24-552766/1445941/2024-07-25_02-48-07-756153.pkl",
-}
-
-# Get index map from the json file
-index_map = {}
-bs = BenchmarkSettings()
-def load(json_file_path: Path) -> None:
-    with open(json_file_path, "r") as file:
-        factor_dict = json.load(file)
-    for factor_name, factor_data in factor_dict.items():
-        index_map[factor_name] = (factor_name, factor_data["Category"], factor_data["Difficulty"])
-
-load(bs.bench_data_path)
-
-def load_data(file_path):
-    file_path = Path(file_path)
-
-    if not (file_path.is_file() and file_path.suffix == ".pkl" and file_path.name.startswith("res_")):
-        raise ValueError("You may get a invalid file path")
-
-    sum_df = []
-    with file_path.open("rb") as f:
-        res = pickle.load(f)
-
-    sum_df.append(summarize_res(res))
-    sum_df = pd.concat(sum_df, axis=1)
-    if sum_df.shape[0] == 0:
-        raise ValueError("No data in the file")
-    print(file_path, sum_df.shape)
-
-    index = [
-        "FactorSingleColumnEvaluator",
-        "FactorOutputFormatEvaluator",
-        "FactorRowCountEvaluator",
-        "FactorIndexEvaluator",
-        "FactorMissingValuesEvaluator",
-        "FactorEqualValueCountEvaluator",
-        "FactorCorrelationEvaluator",
-        "run factor error",
-    ]
-
-    # reindex in case of failing to run evaluator.
-    # If all implemented factors fail to run, some evaluators may not appear in the result.
-    sum_df = sum_df.reindex(index, axis=0)
-
-    sum_df_clean = sum_df.T.groupby(level=0).apply(lambda x: x.reset_index(drop=True))
-
-    # sum_df.columns
-
-    def get_run_error(sum_df_clean):
-        run_error = sum_df_clean["run factor error"]
-
-        run_error = run_error.unstack()
-
-        run_error = run_error.T.fillna(False).astype(bool)  # null indicate no exception
-
-        succ_rate = ~run_error
-        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
-        # make it display in a percentage rate
-        # succ_rate["success rate"] = succ_rate["success rate"].map(lambda x: f"{x:.2%}")
-        return succ_rate
-
-    succ_rate = get_run_error(sum_df_clean)
-
-    def reformat_succ_rate(display_df):
-        """You may get dataframe like this:
-
-                                    success rate
-        250-day_high_distance             80.00%
-        Corr_Close_Turnover               20.00%
-        EP_TTM                            20.00%
-        High_Frequency_Skewness           60.00%
-        Momentum                          50.00%
-        Morning_30-min_Return             30.00%
-        UID                                0.00%
-        Weighted_Earnings_Frequency       10.00%
-        """
-
+class BenchmarkAnalyzer:
+    def __init__(self, settings):
+        self.settings = settings
+        self.index_map = self.load_index_map()
+
+    def load_index_map(self):
+        index_map = {}
+        with open(self.settings.bench_data_path, "r") as file:
+            factor_dict = json.load(file)
+        for factor_name, data in factor_dict.items():
+            index_map[factor_name] = (factor_name, data["Category"], data["Difficulty"])
+        return index_map
+
+    def load_data(self, file_path):
+        file_path = Path(file_path)
+        if not (file_path.is_file() and file_path.suffix == ".pkl"):
+            raise ValueError("Invalid file path")
+        
+        with file_path.open("rb") as f:
+            res = pickle.load(f)
+        
+        return res
+
+    def process_results(self, results):
+        final_res = {}
+        for experiment, path in results.items():
+            data = self.load_data(path)
+            summarized_data = FactorImplementEval.summarize_res(data)
+            processed_data = self.analyze_data(summarized_data)
+            final_res[experiment] = processed_data.iloc[-1, :]
+        return final_res
+    
+    def reformat_succ_rate(self, display_df):
         new_idx = []
-        display_df = display_df[display_df.index.isin(index_map.keys())]
-        # display_df = display_df.reindex(index_map.keys())
+        display_df = display_df[display_df.index.isin(self.index_map.keys())]
         for idx in display_df.index:
-            new_idx.append(index_map[idx])
+            new_idx.append(self.index_map[idx])
+
         display_df.index = pd.MultiIndex.from_tuples(
             new_idx,
             names=["Factor", "Category", "Difficulty"],
         )
-
         display_df = display_df.swaplevel(0, 2).swaplevel(0, 1).sort_index(axis=0)
 
-        def sort_key_func(x):
-            order_v = []
-            for i in x:
-                order_v.append({"Easy": 0, "Medium": 1, "Hard": 2, "New Discovery": 3}.get(i, i))
-            return order_v
-
-        return display_df.sort_index(key=sort_key_func)
-
-    succ_rate_f = reformat_succ_rate(succ_rate)
-    succ_rate_f
-
-    sum_df_clean["FactorRowCountEvaluator"]
-
-    def get_run_error(eval_series):
-        eval_series = eval_series.unstack()
-
-        succ_rate = eval_series.T.fillna(False).astype(bool)  # false indicate failure
-
-        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
-        # make it display in a percentage rate
-        # succ_rate["success rate"] = succ_rate["success rate"].map(lambda x: f"{x:.2%}")
-        return succ_rate
-
-    format_issue = (
-        sum_df_clean["FactorRowCountEvaluator"] & sum_df_clean["FactorIndexEvaluator"]
-    )
-
-    format_succ_rate = get_run_error(format_issue)
-    format_succ_rate_f = reformat_succ_rate(format_succ_rate)
-
-    corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
-
-    corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
-    corr_res = reformat_succ_rate(corr)
-
-    corr_max = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
-
-    corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
-    corr_max_res = reformat_succ_rate(corr_max)
-
-    value_max = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
-    value_max = value_max.unstack().T.max(axis=0).to_frame("max_value")
-    value_max_res = reformat_succ_rate(value_max)
-
-    value_avg = (
-        (sum_df_clean["FactorMissingValuesEvaluator"] * format_issue)
-        .unstack()
-        .T.mean(axis=0)
-        .to_frame("avg_value")
-    )
-    value_avg_res = reformat_succ_rate(value_avg)
-
-    result_all = pd.concat(
-        {
-            "avg. Correlation (value only)": corr_res.iloc[:, 0],
-            "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
-            "avg. Run successful rate": succ_rate_f.iloc[:, 0],
-            "max. Correlation": corr_max_res.iloc[:, 0],
-            "max. accuracy": value_max_res.iloc[:, 0],
-            "avg. accuracy": value_avg_res.iloc[:, 0],
-        },
-        axis=1,
-    )
-
-    def result_all_key_order(x):
+        return display_df.sort_index(key=lambda x: [{"Easy": 0, "Medium": 1, "Hard": 2, "New Discovery": 3}.get(i, i) for i in x])
+    
+    def result_all_key_order(self, x):
         order_v = []
         for i in x:
             order_v.append(
@@ -178,93 +69,116 @@ def result_all_key_order(x):
             )
         return order_v
 
-    df = result_all.sort_index(axis=1, key=result_all_key_order)
-    print(df)
-
-    # Calculate the mean of each column
-    mean_values = df.fillna(0.0).mean()
-    mean_df = pd.DataFrame(mean_values).T
-
-    # TODO: set it as multi-index
-    # Assign the MultiIndex to the DataFrame
-    mean_df.index = pd.MultiIndex.from_tuples([("-", "-", "Average")], names=["Factor", "Category", "Difficulty"])
-
-    # Append the mean values to the end of the dataframe
-    df_w_mean = pd.concat([df, mean_df]).astype("float")
-
-    return df_w_mean
-
-
-def display_df(df):
-    # This depends on jupyter
-    def _single_formatter(column):
-        if column.endswith("rate"):
-
-            def _f(x):
-                return "{:.2%}".format(x) if pd.notnull(x) else "-"
-
-        else:
-
-            def _f(x):
-                return "{:.4}".format(x) if pd.notnull(x) else "-"
-
-        return _f
-
-    def get_formatters():
-        # Show NaN or None as '-'.  Don't convert the value to string
-        fmts = {column: _single_formatter(column) for column in df.columns}
-        return fmts
-
-    # TODO display
-    df_w_mean.drop(["max. accuracy", "avg. accuracy"], axis=1).style.format(get_formatters()).background_gradient(
-        axis=0, vmax=1, vmin=0, cmap=__import__("seaborn").light_palette("green", as_cmap=True)
-    )
-
-
-final_res = {}
-for k, p in results.items():
-    df = load_data(p)
-    print(df)
-    final_res[k] = df.iloc[-1, :]
-
-final_res = pd.DataFrame(final_res)
-
-
-# TODO plot it with seaborn and save it as a file
-final_res.drop(["max. accuracy", "avg. accuracy"], axis=0).T
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-plt.rcParams["axes.unicode_minus"] = False
-
-
-def change_fs(font_size):
-    font_size = font_size
-    plt.rc("font", size=font_size)  # controls default text sizes
-    plt.rc("axes", titlesize=font_size)  # fontsize of the axes title
-    plt.rc("axes", labelsize=font_size)  # fontsize of the x and y labels
-    plt.rc("xtick", labelsize=font_size)  # fontsize of the tick labels
-    plt.rc("ytick", labelsize=font_size)  # fontsize of the tick labels
-    plt.rc("legend", fontsize=font_size)  # legend fontsize
-    plt.rc("figure", titlesize=font_size)  # fontsize of the figure title
-
-
-change_fs(20)
-
+    def analyze_data(self, sum_df):
+        index = [
+            "FactorSingleColumnEvaluator",
+            "FactorOutputFormatEvaluator",
+            "FactorRowCountEvaluator",
+            "FactorIndexEvaluator",
+            "FactorMissingValuesEvaluator",
+            "FactorEqualValueCountEvaluator",
+            "FactorCorrelationEvaluator",
+            "run factor error",
+        ]
+        sum_df = sum_df.reindex(index, axis=0)
+        sum_df_clean = sum_df.T.groupby(level=0).apply(lambda x: x.reset_index(drop=True))
+
+        run_error = sum_df_clean["run factor error"].unstack().T.fillna(False).astype(bool)
+        succ_rate = ~run_error
+        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
 
-# Prepare the data for plotting
-plot_data = final_res.drop(["max. accuracy", "avg. accuracy"], axis=0).T
-plot_data = plot_data.reset_index().melt("index", var_name="a", value_name="b")
+        succ_rate_f = self.reformat_succ_rate(succ_rate)
+        succ_rate_f
 
-# Create the plot
-plt.figure(figsize=(10, 6))
-sns.barplot(x="index", y="b", hue="a", data=plot_data)
+        sum_df_clean["FactorRowCountEvaluator"]
 
-# Set the labels and title
-plt.xlabel("Method")
-plt.ylabel("Value")
-plt.title("Comparison of Different Methods")
+        format_issue = (
+            sum_df_clean["FactorRowCountEvaluator"] & sum_df_clean["FactorIndexEvaluator"]
+        )
+        eval_series = format_issue.unstack()
+        succ_rate = eval_series.T.fillna(False).astype(bool)  # false indicate failure
+        format_succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
+        format_succ_rate_f = self.reformat_succ_rate(format_succ_rate)
+
+        corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+        corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
+        corr_res = self.reformat_succ_rate(corr)
+        corr_max = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+
+        corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
+        corr_max_res = self.reformat_succ_rate(corr_max)
+
+        value_max = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
+        value_max = value_max.unstack().T.max(axis=0).to_frame("max_value")
+        value_max_res = self.reformat_succ_rate(value_max)
+
+        value_avg = (
+            (sum_df_clean["FactorMissingValuesEvaluator"] * format_issue)
+            .unstack()
+            .T.mean(axis=0)
+            .to_frame("avg_value")
+        )
+        value_avg_res = self.reformat_succ_rate(value_avg)
+
+        result_all = pd.concat(
+            {
+                "avg. Correlation (value only)": corr_res.iloc[:, 0],
+                "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
+                "avg. Run successful rate": succ_rate_f.iloc[:, 0],
+                "max. Correlation": corr_max_res.iloc[:, 0],
+                "max. accuracy": value_max_res.iloc[:, 0],
+                "avg. accuracy": value_avg_res.iloc[:, 0],
+            },
+            axis=1,
+        )
 
-# Save the plot as a file
-plt.savefig("comparison_plot.png")
+        df = result_all.sort_index(axis=1, key=self.result_all_key_order)
+        print(df)
+
+        # Calculate the mean of each column
+        mean_values = df.fillna(0.0).mean()
+        mean_df = pd.DataFrame(mean_values).T
+
+        # Assign the MultiIndex to the DataFrame
+        mean_df.index = pd.MultiIndex.from_tuples([("-", "-", "Average")], names=["Factor", "Category", "Difficulty"])
+
+        # Append the mean values to the end of the dataframe
+        df_w_mean = pd.concat([df, mean_df]).astype("float")
+
+        return df_w_mean
+
+
+
+class Plotter:
+    @staticmethod
+    def change_fs(font_size):
+        plt.rc("font", size=font_size)
+        plt.rc("axes", titlesize=font_size)
+        plt.rc("axes", labelsize=font_size)
+        plt.rc("xtick", labelsize=font_size)
+        plt.rc("ytick", labelsize=font_size)
+        plt.rc("legend", fontsize=font_size)
+        plt.rc("figure", titlesize=font_size)
+
+    @staticmethod
+    def plot_data(data, file_name):
+        plt.figure(figsize=(10, 6))
+        sns.barplot(x="index", y="b", hue="a", data=data)
+        plt.xlabel("Method")
+        plt.ylabel("Value")
+        plt.title("Comparison of Different Methods")
+        plt.savefig(file_name)
+
+if __name__ == "__main__":
+    settings = BenchmarkSettings()
+    benchmark = BenchmarkAnalyzer(settings)
+    results = {
+        "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
+    }
+    final_results = benchmark.process_results(results)
+    final_results_df = pd.DataFrame(final_results)
+
+    Plotter.change_fs(20)
+    plot_data = final_results_df.drop(["max. accuracy", "avg. accuracy"], axis=0).T
+    plot_data = plot_data.reset_index().melt("index", var_name="a", value_name="b")
+    Plotter.plot_data(plot_data, "rdagent/app/quant_factor_benchmark/comparison_plot.png")

From ac80c93172759412d5fe603965ffe797a9406a32 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Thu, 25 Jul 2024 07:46:50 +0000
Subject: [PATCH 30/40] CI fix

---
 RD-Agent | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 RD-Agent

diff --git a/RD-Agent b/RD-Agent
deleted file mode 160000
index 61d67d85..00000000
--- a/RD-Agent
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 61d67d8518072d69d5169853d49dd6ff88e6055c

From eb1c04e82761074ea7277758737f8af02556b634 Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Thu, 25 Jul 2024 07:57:29 +0000
Subject: [PATCH 31/40] Refactor

---
 rdagent/app/qlib_rd_loop/conf.py        | 48 ++++++++-------
 rdagent/app/qlib_rd_loop/factor_w_sc.py | 77 ++++---------------------
 rdagent/app/qlib_rd_loop/model_w_sc.py  | 68 +++-------------------
 rdagent/components/workflow/rd_loop.py  |  3 +-
 4 files changed, 49 insertions(+), 147 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py
index 60d63dbb..e9a07dd7 100644
--- a/rdagent/app/qlib_rd_loop/conf.py
+++ b/rdagent/app/qlib_rd_loop/conf.py
@@ -2,38 +2,46 @@
 
 from pydantic_settings import BaseSettings
 
+from rdagent.components.workflow.conf import BasePropSetting
 
-class PropSetting(BaseSettings):
+
+class ModelBasePropSetting(BasePropSetting):
     class Config:
-        env_prefix = "QLIB_"  # Use MODEL_CODER_ as prefix for environment variables
+        env_prefix = "QLIB_MODEL_"  # Use QLIB_MODEL_ as prefix for environment variables
         protected_namespaces = ()  # Add 'model_' to the protected namespaces
 
-    factor_scen: str = "rdagent.scenarios.qlib.experiment.factor_experiment.QlibFactorScenario"
-    factor_hypothesis_gen: str = "rdagent.scenarios.qlib.proposal.factor_proposal.QlibFactorHypothesisGen"
-    factor_hypothesis2experiment: str = (
-        "rdagent.scenarios.qlib.proposal.factor_proposal.QlibFactorHypothesis2Experiment"
-    )
-    factor_coder: str = "rdagent.scenarios.qlib.developer.factor_coder.QlibFactorCoSTEER"
-    factor_runner: str = "rdagent.scenarios.qlib.developer.factor_runner.QlibFactorRunner"
-    factor_summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibFactorHypothesisExperiment2Feedback"
+    scen: str = "rdagent.scenarios.qlib.experiment.model_experiment.QlibModelScenario"
+    hypothesis_gen: str = "rdagent.scenarios.qlib.proposal.model_proposal.QlibModelHypothesisGen"
+    hypothesis2experiment: str = "rdagent.scenarios.qlib.proposal.model_proposal.QlibModelHypothesis2Experiment"
+    coder: str = "rdagent.scenarios.qlib.developer.model_coder.QlibModelCoSTEER"
+    runner: str = "rdagent.scenarios.qlib.developer.model_runner.QlibModelRunner"
+    summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibModelHypothesisExperiment2Feedback"
+
+    evolving_n: int = 10
+
 
+class FactorBasePropSetting(BasePropSetting):
+    class Config:
+        env_prefix = "QLIB_FACTOR_"  # Use QLIB_FACTOR_ as prefix for environment variables
+        protected_namespaces = ()  # Empty tuple: no protected namespaces (presumably lifts pydantic's default 'model_' guard)
+
+    # 1) override base settings
     # TODO: model part is not finished yet
-    model_scen: str = "rdagent.scenarios.qlib.experiment.model_experiment.QlibModelScenario"
-    model_hypothesis_gen: str = "rdagent.scenarios.qlib.proposal.model_proposal.QlibModelHypothesisGen"
-    model_hypothesis2experiment: str = "rdagent.scenarios.qlib.proposal.model_proposal.QlibModelHypothesis2Experiment"
-    model_coder: str = "rdagent.scenarios.qlib.developer.model_coder.QlibModelCoSTEER"
-    model_runner: str = "rdagent.scenarios.qlib.developer.model_runner.QlibModelRunner"
-    model_summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibModelHypothesisExperiment2Feedback"
+    scen: str = "rdagent.scenarios.qlib.experiment.factor_experiment.QlibFactorScenario"
+    hypothesis_gen: str = "rdagent.scenarios.qlib.proposal.factor_proposal.QlibFactorHypothesisGen"
+    hypothesis2experiment: str = "rdagent.scenarios.qlib.proposal.factor_proposal.QlibFactorHypothesis2Experiment"
+    coder: str = "rdagent.scenarios.qlib.developer.factor_coder.QlibFactorCoSTEER"
+    runner: str = "rdagent.scenarios.qlib.developer.factor_runner.QlibFactorRunner"
+    summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibFactorHypothesisExperiment2Feedback"
 
     evolving_n: int = 10
 
-    py_bin: str = "/usr/bin/python"
-    local_qlib_folder: Path = Path("/home/rdagent/qlib")
-
+    # 2) sub task specific:
     origin_report_path: str = "data/report_origin"
     local_report_path: str = "data/report"
     report_result_json_file_path: str = "git_ignore_folder/res_dict.json"
     progress_file_path: str = "git_ignore_folder/progress.pkl"
 
 
-PROP_SETTING = PropSetting()
+FACTOR_PROP_SETTING = FactorBasePropSetting()
+MODEL_PROP_SETTING = ModelBasePropSetting()
diff --git a/rdagent/app/qlib_rd_loop/factor_w_sc.py b/rdagent/app/qlib_rd_loop/factor_w_sc.py
index fe06e2d8..1e6f8220 100644
--- a/rdagent/app/qlib_rd_loop/factor_w_sc.py
+++ b/rdagent/app/qlib_rd_loop/factor_w_sc.py
@@ -1,74 +1,16 @@
 """
 Factor workflow with session control
-It is from `rdagent/app/qlib_rd_loop/factor.py` and try to replace `rdagent/app/qlib_rd_loop/RDAgent.py`
 """
 
-from typing import Any
-from dotenv import load_dotenv
 import fire
 
+from rdagent.app.qlib_rd_loop.conf import FACTOR_PROP_SETTING
+from rdagent.components.workflow.rd_loop import RDLoop
 from rdagent.core.exception import FactorEmptyError
-from rdagent.core.scenario import Scenario
-from rdagent.log import rdagent_logger as logger
 
-load_dotenv(override=True)
 
-from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
-from rdagent.core.developer import Developer
-from rdagent.core.proposal import (
-    Hypothesis2Experiment,
-    HypothesisExperiment2Feedback,
-    HypothesisGen,
-    Trace,
-)
-from rdagent.core.utils import import_class
-
-from rdagent.utils.workflow import LoopMeta, LoopBase
-
-class FactorLoop(LoopBase, metaclass=LoopMeta):
-
-    def __init__(self):
-        scen: Scenario = import_class(PROP_SETTING.factor_scen)()
-
-        self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.factor_hypothesis_gen)(scen)
-
-        self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.factor_hypothesis2experiment)()
-
-        self.qlib_factor_coder: Developer = import_class(PROP_SETTING.factor_coder)(scen)
-        self.qlib_factor_runner: Developer = import_class(PROP_SETTING.factor_runner)(scen)
-
-        self.qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
-        self.trace = Trace(scen=scen)
-        super().__init__()
-    
-    def propose(self, prev_out: dict[str, Any]):
-        with logger.tag("r"):
-            hypothesis = self.hypothesis_gen.gen(self.trace)
-            logger.log_object(hypothesis, tag="hypothesis generation")
-        return hypothesis
-
-    def exp_gen(self, prev_out: dict[str, Any]):
-        with logger.tag("r"):
-            exp = self.hypothesis2experiment.convert(prev_out["propose"], self.trace)
-            logger.log_object(exp.sub_tasks, tag="experiment generation")
-        return exp
-
-    def coding(self, prev_out: dict[str, Any]):
-        with logger.tag("d"):
-            exp = self.qlib_factor_coder.develop(prev_out["exp_gen"])
-            logger.log_object(exp.sub_workspace_list, tag="factor coder result")
-        return exp
-
-    def running(self, prev_out: dict[str, Any]):
-        with logger.tag("ef"):
-            exp = self.qlib_factor_runner.develop(prev_out["coding"])
-            logger.log_object(exp, tag="factor runner result")
-        return exp
-
-    def feedback(self, prev_out: dict[str, Any]):
-        feedback = self.qlib_factor_summarizer.generate_feedback(prev_out["running"], prev_out["propose"], self.trace)
-        logger.log_object(feedback, tag="feedback")
-        self.trace.hist.append((prev_out["propose"], prev_out["running"], feedback))
+class FactorRDLoop(RDLoop):
+    skip_loop_error = (FactorEmptyError,)
 
 
 def main(path=None, step_n=None):
@@ -81,11 +23,16 @@ def main(path=None, step_n=None):
 
     """
     if path is None:
-        model_loop = FactorLoop()
+        model_loop = FactorRDLoop(FACTOR_PROP_SETTING)
     else:
-        model_loop = FactorLoop.load(path)
+        model_loop = FactorRDLoop.load(path)
     model_loop.run(step_n=step_n)
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
\ No newline at end of file
+    fire.Fire(main)
+
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
diff --git a/rdagent/app/qlib_rd_loop/model_w_sc.py b/rdagent/app/qlib_rd_loop/model_w_sc.py
index c5fca4c3..59e89b67 100644
--- a/rdagent/app/qlib_rd_loop/model_w_sc.py
+++ b/rdagent/app/qlib_rd_loop/model_w_sc.py
@@ -1,70 +1,16 @@
 """
 Model workflow with session control
-It is from `rdagent/app/qlib_rd_loop/model.py` and try to replace `rdagent/app/qlib_rd_loop/RDAgent.py`
 """
 
 import fire
-from typing import Any
-from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
-from rdagent.core.developer import Developer
-from rdagent.core.exception import ModelEmptyError
-from rdagent.core.proposal import (
-    Hypothesis2Experiment,
-    HypothesisExperiment2Feedback,
-    HypothesisGen,
-    Trace,
-)
-from rdagent.core.scenario import Scenario
-from rdagent.core.utils import import_class
-from rdagent.log import rdagent_logger as logger
-
-from rdagent.utils.workflow import LoopMeta, LoopBase
-
-class ModelLoop(LoopBase, metaclass=LoopMeta):
-    # TODO: supporting customized loop control like catching `ModelEmptyError`
-
-    def __init__(self):
-        scen: Scenario = import_class(PROP_SETTING.model_scen)()
-
-        self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.model_hypothesis_gen)(scen)
-
-        self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.model_hypothesis2experiment)()
 
-        self.qlib_model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen)
-        self.qlib_model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen)
-
-        self.qlib_model_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.model_summarizer)(scen)
-        self.trace = Trace(scen=scen)
-        super().__init__()
-
-    def propose(self, prev_out: dict[str, Any]):
-        with logger.tag("r"):  # research
-            hypothesis = self.hypothesis_gen.gen(self.trace)
-            logger.log_object(hypothesis, tag="hypothesis generation")
-        return hypothesis
-
-    def exp_gen(self, prev_out: dict[str, Any]):
-        with logger.tag("r"):  # research
-            exp = self.hypothesis2experiment.convert(prev_out["propose"], self.trace)
-            logger.log_object(exp.sub_tasks, tag="experiment generation")
-        return exp
-
-    def coding(self, prev_out: dict[str, Any]):
-        with logger.tag("d"):  # develop
-            exp = self.qlib_model_coder.develop(prev_out["exp_gen"])
-            logger.log_object(exp.sub_workspace_list, tag="model coder result")
-        return exp
+from rdagent.app.qlib_rd_loop.conf import MODEL_PROP_SETTING
+from rdagent.components.workflow.rd_loop import RDLoop
+from rdagent.core.exception import ModelEmptyError
 
-    def running(self, prev_out: dict[str, Any]):
-        with logger.tag("ef"):  # evaluate and feedback
-            exp = self.qlib_model_runner.develop(prev_out["coding"])
-            logger.log_object(exp, tag="model runner result")
-        return exp
 
-    def feedback(self, prev_out: dict[str, Any]):
-        feedback = self.qlib_model_summarizer.generate_feedback(prev_out["running"], prev_out["propose"], self.trace)
-        logger.log_object(feedback, tag="feedback")
-        self.trace.hist.append((prev_out["propose"],prev_out["running"] , feedback))
+class ModelRDLoop(RDLoop):
+    skip_loop_error = (ModelEmptyError,)
 
 
 def main(path=None, step_n=None):
@@ -77,9 +23,9 @@ def main(path=None, step_n=None):
 
     """
     if path is None:
-        model_loop = ModelLoop()
+        model_loop = ModelRDLoop(MODEL_PROP_SETTING)
     else:
-        model_loop = ModelLoop.load(path)
+        model_loop = ModelRDLoop.load(path)
     model_loop.run(step_n=step_n)
 
 
diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py
index 69a73d5d..d1a40017 100644
--- a/rdagent/components/workflow/rd_loop.py
+++ b/rdagent/components/workflow/rd_loop.py
@@ -60,5 +60,6 @@ def running(self, prev_out: dict[str, Any]):
 
     def feedback(self, prev_out: dict[str, Any]):
         feedback = self.summarizer.generate_feedback(prev_out["running"], prev_out["propose"], self.trace)
-        logger.log_object(feedback, tag="feedback")
+        with logger.tag("ef"):  # evaluate and feedback
+            logger.log_object(feedback, tag="feedback")
         self.trace.hist.append((prev_out["propose"],prev_out["running"] , feedback))

From f9295e087971e98aded78eb902928ebcf98ea520 Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Thu, 25 Jul 2024 07:59:47 +0000
Subject: [PATCH 32/40] remove useless code

---
 rdagent/app/qlib_rd_loop/conf.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py
index e9a07dd7..cf618866 100644
--- a/rdagent/app/qlib_rd_loop/conf.py
+++ b/rdagent/app/qlib_rd_loop/conf.py
@@ -1,7 +1,3 @@
-from pathlib import Path
-
-from pydantic_settings import BaseSettings
-
 from rdagent.components.workflow.conf import BasePropSetting
 
 

From d2770c6ae401efebb1a336ba2d5fc2d1b5b87b83 Mon Sep 17 00:00:00 2001
From: Suhan Cui <51844791+SH-Src@users.noreply.github.com>
Date: Thu, 25 Jul 2024 16:12:52 +0800
Subject: [PATCH 33/40] fix bugs (#111)

---
 rdagent/scenarios/data_mining/experiment/model_experiment.py | 4 ++++
 rdagent/scenarios/data_mining/proposal/model_proposal.py     | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/rdagent/scenarios/data_mining/experiment/model_experiment.py b/rdagent/scenarios/data_mining/experiment/model_experiment.py
index ff90bf8f..f737f83d 100644
--- a/rdagent/scenarios/data_mining/experiment/model_experiment.py
+++ b/rdagent/scenarios/data_mining/experiment/model_experiment.py
@@ -38,6 +38,10 @@ def interface(self) -> str:
     @property
     def simulator(self) -> str:
         return prompt_dict["dm_model_simulator"]
+    
+    @property
+    def rich_style_description(self)->str:
+        return "Below is MIMIC Model Evolving Automatic R&D Demo."
 
     def get_scenario_all_desc(self) -> str:
         return f"""Background of the scenario:
diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py
index 2f420944..32ec8896 100644
--- a/rdagent/scenarios/data_mining/proposal/model_proposal.py
+++ b/rdagent/scenarios/data_mining/proposal/model_proposal.py
@@ -84,10 +84,12 @@ def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
         tasks = []
         for model_name in response_dict:
             description = response_dict[model_name]["description"]
+            formulation = response_dict[model_name]["formulation"]
             architecture = response_dict[model_name]["architecture"]
+            variables = response_dict[model_name]["variables"]
             hyperparameters = response_dict[model_name]["hyperparameters"]
             model_type = response_dict[model_name]["model_type"]
-            tasks.append(ModelTask(model_name, description, architecture, hyperparameters, model_type))
+            tasks.append(ModelTask(model_name, description, formulation, architecture, variables, hyperparameters, model_type))
         exp = DMModelExperiment(tasks)
         exp.based_experiments = [t[1] for t in trace.hist if t[2]]
         return exp

From 22b176b9f1efd8f5ec1ffc7b005b2be1725dc8e1 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Thu, 25 Jul 2024 13:55:29 +0000
Subject: [PATCH 34/40] Fix two small bugs.

---
 rdagent/app/qlib_rd_loop/factor_w_sc.py       |  7 +------
 rdagent/scenarios/qlib/prompts.yaml           | 18 ++++++++++++++----
 .../qlib/proposal/factor_proposal.py          | 19 +++++++++++++++++++
 3 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/factor_w_sc.py b/rdagent/app/qlib_rd_loop/factor_w_sc.py
index 1e6f8220..79087162 100644
--- a/rdagent/app/qlib_rd_loop/factor_w_sc.py
+++ b/rdagent/app/qlib_rd_loop/factor_w_sc.py
@@ -30,9 +30,4 @@ def main(path=None, step_n=None):
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
-
-
-
-if __name__ == "__main__":
-    fire.Fire(main)
+    fire.Fire(main)
\ No newline at end of file
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
index 139ebd57..5e3239d9 100644
--- a/rdagent/scenarios/qlib/prompts.yaml
+++ b/rdagent/scenarios/qlib/prompts.yaml
@@ -111,10 +111,20 @@ factor_hypothesis_specification: |-
     - Explore sophisticated combinations or new types of factors, e.g., "Develop a composite factor integrating ESG scores with traditional financial metrics."
 
   Important Note:
-  - If a hypothesis achieves the desired results, start a new direction while preserving the effective factors from previous hypotheses.
-  - New evaluations should combine the newly proposed factors with previously successful factors that surpassed SOTA.
-  - If the previous hypothesis factor exceeds SOTA, you can write another factor in the same direction, otherwise, explore new directions.
-  - The final maintained SOTA is the continuous accumulation of factors that surpass each iteration.
+    Logic Explanation:
+      - If the previous hypothesis factor exceeds SOTA, the SOTA factor library will include this factor.
+      - The new experiment will generate new factors, which will be combined with the factors in the SOTA library.
+      - These combined factors will be backtested and compared against the current SOTA to iterate continuously.
+    Development Directions:
+      - New Direction:
+          - Propose a new factor direction for exploration and construction.
+      - Optimization of Existing Direction:
+          - If the previous experiment's factor replaced SOTA, you can further improve upon that factor.
+          - Clearly specify the differences in name and improvements compared to the previous factor.
+      - Continued Research:
+          - If the previous experiment's factor did not replace SOTA, continue researching how to optimize and construct factors in this direction.
+    Final Goal:
+      - The final maintained SOTA should be the continuous accumulation of factors that surpass each iteration.
 
 factor_experiment_output_format: |-
   The output should follow JSON format. The schema is as follows:
diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py
index b1dadc66..28e63516 100644
--- a/rdagent/scenarios/qlib/proposal/factor_proposal.py
+++ b/rdagent/scenarios/qlib/proposal/factor_proposal.py
@@ -72,13 +72,32 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict |
     def convert_response(self, response: str, trace: Trace) -> FactorExperiment:
         response_dict = json.loads(response)
         tasks = []
+
         for factor_name in response_dict:
             description = response_dict[factor_name]["description"]
             formulation = response_dict[factor_name]["formulation"]
             variables = response_dict[factor_name]["variables"]
             tasks.append(FactorTask(factor_name, description, formulation, variables))
+        
         exp = QlibFactorExperiment(tasks)
         exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+        
         if len(exp.based_experiments) == 0:
             exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+
+        # Names of every factor already present in the experiments this one is based on.
+        known_factor_names = {
+            sub_task.factor_name
+            for based_exp in exp.based_experiments
+            for sub_task in based_exp.sub_tasks
+        }
+
+        # Keep only newly proposed factors; a factor name seen before counts as a duplicate.
+        unique_tasks = [task for task in tasks if task.factor_name not in known_factor_names]
+
+        # NOTE(review): experiments in this method are built and read through `sub_tasks`;
+        # confirm that downstream code really consumes `exp.tasks`, otherwise this
+        # de-duplication has no effect. TODO confirm.
+
+        exp.tasks = unique_tasks
         return exp

From fb1478e64abecc83bf0129b59a22851baff7bae6 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Thu, 25 Jul 2024 14:38:28 +0000
Subject: [PATCH 35/40] Fix a merge bug.

---
 rdagent/components/benchmark/eval_method.py | 36 +--------------------
 rdagent/scenarios/qlib/prompts.yaml         |  1 -
 2 files changed, 1 insertion(+), 36 deletions(-)

diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py
index f71e127e..193d1ce2 100644
--- a/rdagent/components/benchmark/eval_method.py
+++ b/rdagent/components/benchmark/eval_method.py
@@ -19,6 +19,7 @@
 from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.developer import Developer
+
 from rdagent.core.exception import CoderException, RunnerException
 from rdagent.core.experiment import Task, Workspace
 from rdagent.core.scenario import Scenario
@@ -122,7 +123,6 @@ def __init__(
         method: Developer,
         *args,
         scen: Scenario,
-        scen: Scenario,
         test_round: int = 10,
         **kwargs,
     ):
@@ -134,13 +134,6 @@ def __init__(
             FactorMissingValuesEvaluator(scen),
             FactorEqualValueCountEvaluator(scen),
             FactorCorrelationEvaluator(hard_check=False, scen=scen),
-            FactorSingleColumnEvaluator(scen),
-            FactorOutputFormatEvaluator(scen),
-            FactorRowCountEvaluator(scen),
-            FactorIndexEvaluator(scen),
-            FactorMissingValuesEvaluator(scen),
-            FactorEqualValueCountEvaluator(scen),
-            FactorCorrelationEvaluator(hard_check=False, scen=scen),
         ]
         super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs)
         self.test_round = test_round
@@ -204,30 +197,3 @@ def summarize_res(res: EVAL_RES) -> pd.DataFrame:
                 sum_res[key] = val
 
         return pd.DataFrame(sum_res)
-
-
-    @staticmethod
-    def summarize_res(res: EVAL_RES) -> pd.DataFrame:
-        # None: indicate that it raises exception and get no results
-        sum_res = {}
-        for factor_name, runs in res.items():
-            for fi, err_or_res_l in runs:
-                # NOTE:  str(fi) may not be unique!!  Because the workspace can be skipped when hitting the cache.
-                uniq_key = f"{str(fi)},{id(fi)}"
-
-                key = (factor_name, uniq_key)
-                val = {}
-                if isinstance(err_or_res_l, Exception):
-                    val["run factor error"] = str(err_or_res_l.__class__)
-                else:
-                    val["run factor error"] = None
-                    for ev_obj, err_or_res in err_or_res_l:
-                        if isinstance(err_or_res, Exception):
-                            val[str(ev_obj)] = None
-                        else:
-                            feedback, metric = err_or_res
-                            val[str(ev_obj)] = metric
-                sum_res[key] = val
-
-        return pd.DataFrame(sum_res)
-
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
index f2a1158c..5e3239d9 100644
--- a/rdagent/scenarios/qlib/prompts.yaml
+++ b/rdagent/scenarios/qlib/prompts.yaml
@@ -59,7 +59,6 @@ factor_hypothesis_specification: |-
     - "Filter stocks by market capitalization before calculating the factors."
 
 factor_hypothesis_specification: |-
-  Specifications:
   Specifications:
     - Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple.
     - Gradually build upon previous hypotheses and feedback.

From 09e2d88d6bfb16bd5a850ad9039d76ab3453c5ea Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Fri, 26 Jul 2024 02:56:50 +0000
Subject: [PATCH 36/40] Fix two small bugs.

---
 rdagent/core/utils.py                              | 3 ++-
 rdagent/scenarios/qlib/proposal/factor_proposal.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/rdagent/core/utils.py b/rdagent/core/utils.py
index a01f2a64..949d4a37 100644
--- a/rdagent/core/utils.py
+++ b/rdagent/core/utils.py
@@ -26,7 +26,8 @@ def __new__(cls, *args: Any, **kwargs: Any) -> Any:
             # TODO: this restriction can be solved.
             exception_message = "Please only use kwargs in Singleton to avoid misunderstanding."
             raise RDAgentException(exception_message)
-        kwargs_hash = hash(tuple(sorted(kwargs.items())))
+        all_args = [(-1, f"{cls.__module__}.{cls.__name__}"), *enumerate(args), *sorted(kwargs.items())]
+        kwargs_hash = hash(tuple(all_args))  # class name is part of the key, so each class gets its own instances
         if kwargs_hash not in cls._instance_dict:
             cls._instance_dict[kwargs_hash] = super().__new__(cls)  # Corrected call
             cls._instance_dict[kwargs_hash].__init__(**kwargs)  # Ensure __init__ is called
diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py
index 28e63516..eea5fa5e 100644
--- a/rdagent/scenarios/qlib/proposal/factor_proposal.py
+++ b/rdagent/scenarios/qlib/proposal/factor_proposal.py
@@ -39,7 +39,7 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
 
     def convert_response(self, response: str) -> FactorHypothesis:
         response_dict = json.loads(response)
-        hypothesis = QlibFactorHypothesis(hypothesis=response_dict["hypothesis"], reason=response_dict["reason"])
+        hypothesis = QlibFactorHypothesis(hypothesis=response_dict["hypothesis"], reason=response_dict["reason"], concise_reason=response_dict["concise_reason"])
         return hypothesis
 
 

From 787450c359689fe63e46d8348c849c1b3bfc7cce Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Mon, 29 Jul 2024 03:52:33 +0000
Subject: [PATCH 37/40] fix some bugs.

---
 .../app/qlib_rd_loop/factor_from_report_sh.py | 115 +++++++++---------
 rdagent/app/qlib_rd_loop/factor_w_sc.py       |  14 ++-
 rdagent/app/qlib_rd_loop/prompts.yaml         |   3 +-
 rdagent/components/benchmark/eval_method.py   |   4 +-
 rdagent/scenarios/qlib/prompts.yaml           |  15 ---
 rdagent/utils/workflow.py                     |   7 +-
 6 files changed, 80 insertions(+), 78 deletions(-)
 mode change 100644 => 100755 rdagent/app/qlib_rd_loop/factor_w_sc.py

diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
index f0d0a961..899f2165 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -7,7 +7,7 @@
 from dotenv import load_dotenv
 from jinja2 import Environment, StrictUndefined
 
-from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.app.qlib_rd_loop.conf import FACTOR_PROP_SETTING
 from rdagent.components.document_reader.document_reader import (
     extract_first_page_screenshot_from_pdf,
     load_and_process_pdfs_by_langchain,
@@ -37,19 +37,19 @@
 
 assert load_dotenv()
 
-scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+scen: Scenario = import_class(FACTOR_PROP_SETTING.scen)()
 
-hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.factor_hypothesis_gen)(scen)
+hypothesis_gen: HypothesisGen = import_class(FACTOR_PROP_SETTING.hypothesis_gen)(scen)
 
-hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.factor_hypothesis2experiment)()
+hypothesis2experiment: Hypothesis2Experiment = import_class(FACTOR_PROP_SETTING.hypothesis2experiment)()
 
-qlib_factor_coder: Developer = import_class(PROP_SETTING.factor_coder)(scen)
+qlib_factor_coder: Developer = import_class(FACTOR_PROP_SETTING.coder)(scen)
 
-qlib_factor_runner: Developer = import_class(PROP_SETTING.factor_runner)(scen)
+qlib_factor_runner: Developer = import_class(FACTOR_PROP_SETTING.runner)(scen)
 
-qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
+qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(FACTOR_PROP_SETTING.summarizer)(scen)
 
-with open(PROP_SETTING.report_result_json_file_path, "r") as f:
+with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as f:
     judge_pdf_data = json.load(f)
 
 prompts_path = Path(__file__).parent / "prompts.yaml"
@@ -57,13 +57,13 @@
 
 
 def save_progress(trace, current_index):
-    with open(PROP_SETTING.progress_file_path, "wb") as f:
+    with open(FACTOR_PROP_SETTING.progress_file_path, "wb") as f:
         pickle.dump((trace, current_index), f)
 
 
 def load_progress():
-    if Path(PROP_SETTING.progress_file_path).exists():
-        with open(PROP_SETTING.progress_file_path, "rb") as f:
+    if Path(FACTOR_PROP_SETTING.progress_file_path).exists():
+        with open(FACTOR_PROP_SETTING.progress_file_path, "rb") as f:
             return pickle.load(f)
     return Trace(scen=scen), 0
 
@@ -87,8 +87,9 @@ def generate_hypothesis(factor_result: dict, report_content: str) -> str:
     response_json = json.loads(response)
     hypothesis_text = response_json.get("hypothesis", "No hypothesis generated.")
     reason_text = response_json.get("reason", "No reason provided.")
+    concise_reason_text = response_json.get("concise_reason", "No concise reason provided.")
 
-    return Hypothesis(hypothesis=hypothesis_text, reason=reason_text)
+    return Hypothesis(hypothesis=hypothesis_text, reason=reason_text, concise_reason=concise_reason_text)
 
 
 def extract_factors_and_implement(report_file_path: str) -> tuple:
@@ -124,48 +125,48 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
 
 trace, start_index = load_progress()
 
-try:
-    judge_pdf_data_items = list(judge_pdf_data.items())
-    for index in range(start_index, len(judge_pdf_data_items)):
-        if index > 1000:
-            break
-        file_path, attributes = judge_pdf_data_items[index]
-        if attributes["class"] == 1:
-            report_file_path = Path(file_path.replace(PROP_SETTING.origin_report_path, PROP_SETTING.local_report_path))
-            if report_file_path.exists():
-                logger.info(f"Processing {report_file_path}")
-
-                with logger.tag("r"):
-                    exp, hypothesis = extract_factors_and_implement(str(report_file_path))
-                    if exp is None:
-                        continue
-                    exp.based_experiments = [t[1] for t in trace.hist if t[2]]
-                    if len(exp.based_experiments) == 0:
-                        exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
-                    logger.log_object(hypothesis, tag="hypothesis generation")
-                    logger.log_object(exp.sub_tasks, tag="experiment generation")
-
-                with logger.tag("d"):
-                    exp = qlib_factor_coder.develop(exp)
-                    logger.log_object(exp.sub_workspace_list)
-
-                with logger.tag("ef"):
-                    exp = qlib_factor_runner.develop(exp)
-                    if exp is None:
-                        logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
-                        continue
-                    logger.log_object(exp, tag="factor runner result")
-                    feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
-                    logger.log_object(feedback, tag="feedback")
-
-                trace.hist.append((hypothesis, exp, feedback))
-                logger.info(f"Processed {report_file_path}: Result: {exp}")
-
-                # Save progress after processing each report
-                save_progress(trace, index + 1)
-            else:
-                logger.error(f"File not found: {report_file_path}")
-except Exception as e:
-    logger.error(f"An error occurred: {e}")
-    save_progress(trace, index)
-    raise
+# try:
+judge_pdf_data_items = list(judge_pdf_data.items())
+for index in range(start_index, len(judge_pdf_data_items)):
+    if index > 1000:
+        break
+    file_path, attributes = judge_pdf_data_items[index]
+    if attributes["class"] == 1:
+        report_file_path = Path(file_path.replace(FACTOR_PROP_SETTING.origin_report_path, FACTOR_PROP_SETTING.local_report_path))
+        if report_file_path.exists():
+            logger.info(f"Processing {report_file_path}")
+
+            with logger.tag("r"):
+                exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+                if exp is None:
+                    continue
+                exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+                if len(exp.based_experiments) == 0:
+                    exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+                logger.log_object(hypothesis, tag="hypothesis generation")
+                logger.log_object(exp.sub_tasks, tag="experiment generation")
+
+            with logger.tag("d"):
+                exp = qlib_factor_coder.develop(exp)
+                logger.log_object(exp.sub_workspace_list)
+
+            with logger.tag("ef"):
+                exp = qlib_factor_runner.develop(exp)
+                if exp is None:
+                    logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+                    continue
+                logger.log_object(exp, tag="factor runner result")
+                feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
+                logger.log_object(feedback, tag="feedback")
+
+            trace.hist.append((hypothesis, exp, feedback))
+            logger.info(f"Processed {report_file_path}: Result: {exp}")
+
+            # Save progress after processing each report
+            save_progress(trace, index + 1)
+        else:
+            logger.error(f"File not found: {report_file_path}")
+# except Exception as e:
+#     logger.error(f"An error occurred: {e}")
+#     save_progress(trace, index)
+#     raise
diff --git a/rdagent/app/qlib_rd_loop/factor_w_sc.py b/rdagent/app/qlib_rd_loop/factor_w_sc.py
old mode 100644
new mode 100755
index 10c9d19e..59c21195
--- a/rdagent/app/qlib_rd_loop/factor_w_sc.py
+++ b/rdagent/app/qlib_rd_loop/factor_w_sc.py
@@ -2,15 +2,27 @@
 Factor workflow with session control
 """
 
+from typing import Any
+
 import fire
 
 from rdagent.app.qlib_rd_loop.conf import FACTOR_PROP_SETTING
 from rdagent.components.workflow.rd_loop import RDLoop
 from rdagent.core.exception import FactorEmptyError
+from rdagent.log import rdagent_logger as logger
 
 
 class FactorRDLoop(RDLoop):
     skip_loop_error = (FactorEmptyError,)
+    
+    def running(self, prev_out: dict[str, Any]):
+        with logger.tag("ef"):  # evaluate and feedback
+            exp = self.runner.develop(prev_out["coding"])
+            if exp is None:
+                logger.error("Factor extraction failed.")
+                raise FactorEmptyError("Factor extraction failed.")
+            logger.log_object(exp, tag="runner result")
+        return exp
 
 
 def main(path=None, step_n=None):
@@ -30,4 +42,4 @@ def main(path=None, step_n=None):
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
+    fire.Fire(main)
\ No newline at end of file
diff --git a/rdagent/app/qlib_rd_loop/prompts.yaml b/rdagent/app/qlib_rd_loop/prompts.yaml
index 1909087c..2861ce1b 100644
--- a/rdagent/app/qlib_rd_loop/prompts.yaml
+++ b/rdagent/app/qlib_rd_loop/prompts.yaml
@@ -4,7 +4,8 @@ hypothesis_generation:
     Please ensure your response is in JSON format as shown below:
     {
       "hypothesis": "A clear and concise hypothesis based on the provided information.",
-      "reason": "A detailed explanation supporting the generated hypothesis."
+      "reason": "A detailed explanation supporting the generated hypothesis.",
+      "concise_reason": "One-line summary that focuses on the justification for the change that leads to the hypothesis (like a piece of the knowledge that we are building)."
     }
 
   user: |-
diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py
index b1b4632d..c94a0e3e 100644
--- a/rdagent/components/benchmark/eval_method.py
+++ b/rdagent/components/benchmark/eval_method.py
@@ -19,14 +19,14 @@
 from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.developer import Developer
-from rdagent.core.exception import CoderException, RunnerException
+from rdagent.core.exception import CoderError
 from rdagent.core.experiment import Task, Workspace
 from rdagent.core.scenario import Scenario
 from rdagent.core.utils import multiprocessing_wrapper
 
 EVAL_RES = Dict[
     str,
-    List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
+    List[Tuple[FactorEvaluator, Union[object, CoderError]]],
 ]
 
 
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
index 5e3239d9..eec73402 100644
--- a/rdagent/scenarios/qlib/prompts.yaml
+++ b/rdagent/scenarios/qlib/prompts.yaml
@@ -43,21 +43,6 @@ model_hypothesis_specification: |-
 
     6th Round Hypothesis (If fourth round didn't work):  The model should be a CNN. The CNN should have 5 convolutional layers. Use Leaky ReLU activation for all layers. Use dropout regularization with a rate of 0.3. (Reasoning: As regularisation rate of 0.5 didn't work, we only change a new regularisation and keep the other elements that worked. This means making changes in the current level.)    
 
-factor_hypothesis_specification: |-
-  Additional Specifications:
-    Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple. Gradually build up upon previous hypotheses and feedback.
-    Ensure that the hypothesis focuses on the creation and selection of factors in quantitative finance. Each hypothesis should address specific factor characteristics such as type (momentum, value, quality), calculation methods, or inclusion criteria. Avoid hypotheses related to model architecture or optimization processes.
-
-  Sample Hypotheses (Only learn from the format as these are not the knowledge):
-    - "Include a momentum factor based on the last 12 months' returns."
-    - "Add a value factor calculated as the book-to-market ratio."
-    - "Incorporate a quality factor derived from return on equity (ROE)."
-    - "Use a volatility factor based on the standard deviation of returns over the past 6 months."
-    - "Include a sentiment factor derived from news sentiment scores."
-    - "The momentum factor should be calculated using a 6-month look-back period."
-    - "Combine value and momentum factors using a weighted average approach."
-    - "Filter stocks by market capitalization before calculating the factors."
-
 factor_hypothesis_specification: |-
   Specifications:
     - Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple.
diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py
index f8975904..5ec54fc1 100644
--- a/rdagent/utils/workflow.py
+++ b/rdagent/utils/workflow.py
@@ -35,7 +35,10 @@ def __new__(cls, clsname, bases, attrs):
         steps = LoopMeta._get_steps(bases)  # all the base classes of parents
         for name, attr in attrs.items():
             if not name.startswith("__") and isinstance(attr, Callable):
-                steps.append(name)
+                if name not in steps:
+                    # NOTE: a subclass method that overrides an inherited step is not a new step,
+                    # so it is skipped here and keeps the position it already has in `steps`.
+                    steps.append(name)
         attrs["steps"] = steps
         return super().__new__(cls, clsname, bases, attrs)
 
@@ -125,4 +128,4 @@ def load(cls, path: str | Path):
 
         max_loop = max(session.loop_trace.keys())
         logger.storage.truncate(time=session.loop_trace[max_loop][-1].end)
-        return session
+        return session
\ No newline at end of file

From b36e1cfc0bb6fe91d26535107f8f612f51c820ef Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Mon, 29 Jul 2024 09:42:41 +0000
Subject: [PATCH 38/40] Fix some format bugs.

---
 rdagent/app/qlib_rd_loop/factor_from_report_sh.py | 4 +++-
 rdagent/app/qlib_rd_loop/factor_w_sc.py           | 4 ++--
 rdagent/utils/workflow.py                         | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
index 899f2165..76752167 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -132,7 +132,9 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
         break
     file_path, attributes = judge_pdf_data_items[index]
     if attributes["class"] == 1:
-        report_file_path = Path(file_path.replace(FACTOR_PROP_SETTING.origin_report_path, FACTOR_PROP_SETTING.local_report_path))
+        report_file_path = Path(
+            file_path.replace(FACTOR_PROP_SETTING.origin_report_path, FACTOR_PROP_SETTING.local_report_path)
+        )
         if report_file_path.exists():
             logger.info(f"Processing {report_file_path}")
 
diff --git a/rdagent/app/qlib_rd_loop/factor_w_sc.py b/rdagent/app/qlib_rd_loop/factor_w_sc.py
index 59c21195..d6fde5fe 100755
--- a/rdagent/app/qlib_rd_loop/factor_w_sc.py
+++ b/rdagent/app/qlib_rd_loop/factor_w_sc.py
@@ -14,7 +14,7 @@
 
 class FactorRDLoop(RDLoop):
     skip_loop_error = (FactorEmptyError,)
-    
+
     def running(self, prev_out: dict[str, Any]):
         with logger.tag("ef"):  # evaluate and feedback
             exp = self.runner.develop(prev_out["coding"])
@@ -42,4 +42,4 @@ def main(path=None, step_n=None):
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
\ No newline at end of file
+    fire.Fire(main)
diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py
index 5ec54fc1..f4e568cd 100644
--- a/rdagent/utils/workflow.py
+++ b/rdagent/utils/workflow.py
@@ -128,4 +128,4 @@ def load(cls, path: str | Path):
 
         max_loop = max(session.loop_trace.keys())
         logger.storage.truncate(time=session.loop_trace[max_loop][-1].end)
-        return session
\ No newline at end of file
+        return session

From 3e42a7bd68250218a43f4a7eedb3d4929650e836 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Mon, 29 Jul 2024 09:44:32 +0000
Subject: [PATCH 39/40] Restore a file.

---
 .../app/qlib_rd_loop/factor_from_report_sh.py | 94 +++++++++----------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
index 76752167..47e08688 100644
--- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
+++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -125,50 +125,50 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
 
 trace, start_index = load_progress()
 
-# try:
-judge_pdf_data_items = list(judge_pdf_data.items())
-for index in range(start_index, len(judge_pdf_data_items)):
-    if index > 1000:
-        break
-    file_path, attributes = judge_pdf_data_items[index]
-    if attributes["class"] == 1:
-        report_file_path = Path(
-            file_path.replace(FACTOR_PROP_SETTING.origin_report_path, FACTOR_PROP_SETTING.local_report_path)
-        )
-        if report_file_path.exists():
-            logger.info(f"Processing {report_file_path}")
-
-            with logger.tag("r"):
-                exp, hypothesis = extract_factors_and_implement(str(report_file_path))
-                if exp is None:
-                    continue
-                exp.based_experiments = [t[1] for t in trace.hist if t[2]]
-                if len(exp.based_experiments) == 0:
-                    exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
-                logger.log_object(hypothesis, tag="hypothesis generation")
-                logger.log_object(exp.sub_tasks, tag="experiment generation")
-
-            with logger.tag("d"):
-                exp = qlib_factor_coder.develop(exp)
-                logger.log_object(exp.sub_workspace_list)
-
-            with logger.tag("ef"):
-                exp = qlib_factor_runner.develop(exp)
-                if exp is None:
-                    logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
-                    continue
-                logger.log_object(exp, tag="factor runner result")
-                feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
-                logger.log_object(feedback, tag="feedback")
-
-            trace.hist.append((hypothesis, exp, feedback))
-            logger.info(f"Processed {report_file_path}: Result: {exp}")
-
-            # Save progress after processing each report
-            save_progress(trace, index + 1)
-        else:
-            logger.error(f"File not found: {report_file_path}")
-# except Exception as e:
-#     logger.error(f"An error occurred: {e}")
-#     save_progress(trace, index)
-#     raise
+try:
+    judge_pdf_data_items = list(judge_pdf_data.items())
+    index = start_index  # bind `index` up front so the except handler can always save progress
+    # Only reports with index <= 1000 are processed (same cap as the former in-loop `break`).
+    for index in range(start_index, min(len(judge_pdf_data_items), 1001)):
+        file_path, attributes = judge_pdf_data_items[index]
+        if attributes["class"] == 1:
+            report_file_path = Path(
+                file_path.replace(FACTOR_PROP_SETTING.origin_report_path, FACTOR_PROP_SETTING.local_report_path)
+            )
+            if report_file_path.exists():
+                logger.info(f"Processing {report_file_path}")
+
+                with logger.tag("r"):
+                    exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+                    if exp is None:
+                        continue
+                    exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+                    if len(exp.based_experiments) == 0:
+                        exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+                    logger.log_object(hypothesis, tag="hypothesis generation")
+                    logger.log_object(exp.sub_tasks, tag="experiment generation")
+
+                with logger.tag("d"):
+                    exp = qlib_factor_coder.develop(exp)
+                    logger.log_object(exp.sub_workspace_list)
+
+                with logger.tag("ef"):
+                    exp = qlib_factor_runner.develop(exp)
+                    if exp is None:
+                        logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+                        continue
+                    logger.log_object(exp, tag="factor runner result")
+                    feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
+                    logger.log_object(feedback, tag="feedback")
+
+                trace.hist.append((hypothesis, exp, feedback))
+                logger.info(f"Processed {report_file_path}: Result: {exp}")
+
+                # Save progress after processing each report
+                save_progress(trace, index + 1)
+            else:
+                logger.error(f"File not found: {report_file_path}")
+except Exception as e:
+    logger.error(f"An error occurred: {e}")
+    save_progress(trace, index)
+    raise

From ad7d18da1e1d3faae29015afedb7b36afa75bed8 Mon Sep 17 00:00:00 2001
From: WinstonLiyt <1957922024@qq.com>
Date: Mon, 29 Jul 2024 09:55:55 +0000
Subject: [PATCH 40/40] Fix a format bug.

---
 rdagent/scenarios/data_mining/proposal/model_proposal.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py
index 8594255b..bd51bba5 100644
--- a/rdagent/scenarios/data_mining/proposal/model_proposal.py
+++ b/rdagent/scenarios/data_mining/proposal/model_proposal.py
@@ -50,7 +50,11 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
 
     def convert_response(self, response: str) -> ModelHypothesis:
         response_dict = json.loads(response)
-        hypothesis = DMModelHypothesis(hypothesis=response_dict["hypothesis"], reason=response_dict["reason"], concise_reason=response_dict["concise_reason"])
+        hypothesis = DMModelHypothesis(
+            hypothesis=response_dict["hypothesis"],
+            reason=response_dict["reason"],
+            concise_reason=response_dict["concise_reason"],
+        )
         return hypothesis