cancel some comments and fix some bugs

microsoft · Aug 2, 2024 · a883a78 · a883a78
1 parent 6fd15a7
commit a883a78
Show file tree

Hide file tree

Showing 6 changed files with 30 additions and 72 deletions.
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -1,6 +1,6 @@
 # TODO: we should have more advanced mechanism to handle such requirements for saving sessions.
-import json
 import csv
+import json
 import pickle
 from pathlib import Path
 from typing import Any
@@ -42,13 +42,14 @@
 )
 from rdagent.utils.workflow import LoopBase, LoopMeta
 
-with open(FACTOR_PROP_SETTING.report_result_json_file_path, 'r') as input_file:
+with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as input_file:
     csv_reader = csv.reader(input_file)
     judge_pdf_data = [row[0] for row in csv_reader]
 
 prompts_path = Path(__file__).parent / "prompts.yaml"
 prompts = Prompts(file_path=prompts_path)
 
+
 def generate_hypothesis(factor_result: dict, report_content: str) -> str:
     system_prompt = (
         Environment(undefined=StrictUndefined).from_string(prompts["hypothesis_generation"]["system"]).render()
@@ -72,6 +73,7 @@ def generate_hypothesis(factor_result: dict, report_content: str) -> str:
 
     return Hypothesis(hypothesis=hypothesis_text, reason=reason_text, concise_reason=concise_reason_text)
 
+
 def extract_factors_and_implement(report_file_path: str) -> tuple:
     scenario = QlibFactorScenario()
 
@@ -105,6 +107,7 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
 
 class FactorReportLoop(LoopBase, metaclass=LoopMeta):
     skip_loop_error = (FactorEmptyError,)
+
     def __init__(self, PROP_SETTING: BasePropSetting):
         scen: Scenario = import_class(PROP_SETTING.scen)()
 
@@ -116,7 +119,11 @@ def __init__(self, PROP_SETTING: BasePropSetting):
 
         self.judge_pdf_data_items = judge_pdf_data
         self.index = 0
-        self.hypo_exp_cache = pickle.load(open(FACTOR_PROP_SETTING.report_extract_result, "rb")) if Path(FACTOR_PROP_SETTING.report_extract_result).exists() else {}
+        self.hypo_exp_cache = (
+            pickle.load(open(FACTOR_PROP_SETTING.report_extract_result, "rb"))
+            if Path(FACTOR_PROP_SETTING.report_extract_result).exists()
+            else {}
+        )
         super().__init__()
 
     def propose_hypo_exp(self, prev_out: dict[str, Any]):
@@ -128,7 +135,9 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]):
                 self.index += 1
                 if report_file_path in self.hypo_exp_cache:
                     hypothesis, exp = self.hypo_exp_cache[report_file_path]
-                    exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in self.trace.hist if t[2]]
+                    exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [
+                        t[1] for t in self.trace.hist if t[2]
+                    ]
                 else:
                     continue
                 # else:
@@ -142,8 +151,8 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]):
                     with logger.tag("load_pdf_screenshot"):
                         pdf_screenshot = extract_first_page_screenshot_from_pdf(report_file_path)
                         logger.log_object(pdf_screenshot)
-                exp.sub_workspace_list = exp.sub_workspace_list[:FACTOR_PROP_SETTING.max_factor_per_report]
-                exp.sub_tasks = exp.sub_tasks[:FACTOR_PROP_SETTING.max_factor_per_report]
+                exp.sub_workspace_list = exp.sub_workspace_list[: FACTOR_PROP_SETTING.max_factor_per_report]
+                exp.sub_tasks = exp.sub_tasks[: FACTOR_PROP_SETTING.max_factor_per_report]
                 logger.log_object(hypothesis, tag="hypothesis generation")
                 logger.log_object(exp.sub_tasks, tag="experiment generation")
                 return hypothesis, exp
@@ -169,6 +178,7 @@ def feedback(self, prev_out: dict[str, Any]):
             logger.log_object(feedback, tag="feedback")
         self.trace.hist.append((prev_out["propose_hypo_exp"][0], prev_out["running"], feedback))
 
+
 def main(path=None, step_n=None):
     """
     You can continue running session by

diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -170,33 +170,37 @@ def evaluate(
         max_attempts = 3
         attempts = 0
         final_evaluation_dict = None
-        
+
         while attempts < max_attempts:
             try:
                 resp = APIBackend().build_messages_and_create_chat_completion(
                     user_prompt=gen_df_info_str, system_prompt=system_prompt, json_mode=True
                 )
                 resp_dict = json.loads(resp)
-
-                if isinstance(resp_dict["output_format_decision"], str) and resp_dict["output_format_decision"].lower() in (
+
+                if isinstance(resp_dict["output_format_decision"], str) and resp_dict[
+                    "output_format_decision"
+                ].lower() in (
                     "true",
                     "false",
                 ):
                     resp_dict["output_format_decision"] = bool(resp_dict["output_format_decision"])
-                
+
                 return (
                     resp_dict["output_format_feedback"],
                     resp_dict["output_format_decision"],
                 )
-            
+
             except json.JSONDecodeError as e:
                 raise ValueError("Failed to decode JSON response from API.") from e
-            
+
             except KeyError as e:
                 attempts += 1
                 if attempts >= max_attempts:
-                    raise KeyError("Response from API is missing 'output_format_decision' or 'output_format_feedback' key after multiple attempts.") from e
-
+                    raise KeyError(
+                        "Response from API is missing 'output_format_decision' or 'output_format_feedback' key after multiple attempts."
+                    ) from e
+
         return "Failed to evaluate output format after multiple attempts.", False
 
 

diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
@@ -88,30 +88,6 @@ def evolve(
                 self.scen,
             )
 
-        # if FACTOR_IMPLEMENT_SETTINGS.select_ratio < 1:
-        #     # if the number of loops is equal to the select_loop, we need to select some of them
-        #     implementation_factors_per_round = round(
-        #         FACTOR_IMPLEMENT_SETTINGS.select_ratio * len(to_be_finished_task_index) + 0.5
-        #     )  # ceilling
-        #     implementation_factors_per_round = min(
-        #         implementation_factors_per_round, len(to_be_finished_task_index)
-        #     )  # but not exceed the total number of tasks
-
-        #     if FACTOR_IMPLEMENT_SETTINGS.select_method == "random":
-        #         to_be_finished_task_index = RandomSelect(
-        #             to_be_finished_task_index,
-        #             implementation_factors_per_round,
-        #         )
-
-        #     if FACTOR_IMPLEMENT_SETTINGS.select_method == "scheduler":
-        #         to_be_finished_task_index = LLMSelect(
-        #             to_be_finished_task_index,
-        #             implementation_factors_per_round,
-        #             evo,
-        #             queried_knowledge.former_traces,
-        #             self.scen,
-        #         )
-
         result = multiprocessing_wrapper(
             [
                 (self.implement_one_factor, (evo.sub_tasks[target_index], queried_knowledge))

diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
@@ -70,7 +70,6 @@ def __init__(
     ) -> None:
         super().__init__(*args, **kwargs)
         self.executed_factor_value_dataframe = executed_factor_value_dataframe
-        # self.logger = logger
         self.raise_exception = raise_exception
 
     @staticmethod

diff --git a/rdagent/core/utils.py b/rdagent/core/utils.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import pickle
 import importlib
 import json
 import multiprocessing as mp
@@ -103,4 +102,4 @@ def multiprocessing_wrapper(func_calls: list[tuple[Callable, tuple]], n: int) ->
         return [f(*args) for f, args in func_calls]
     with mp.Pool(processes=n) as pool:
         results = [pool.apply_async(f, args) for f, args in func_calls]
-        return [result.get() for result in results]
+        return [result.get() for result in results]
diff --git a/rdagent/scenarios/qlib/developer/factor_runner.py b/rdagent/scenarios/qlib/developer/factor_runner.py
@@ -37,32 +37,6 @@ class QlibFactorRunner(CachedRunner[QlibFactorExperiment]):
     - results in `mlflow`
     """
 
-    def calculate_information_coefficient(
-        self, concat_feature: pd.DataFrame, SOTA_feature_column_size: int, new_feature_columns_size: int
-    ) -> pd.DataFrame:
-        res = pd.Series(index=range(SOTA_feature_column_size * new_feature_columns_size))
-        for col1 in range(SOTA_feature_column_size):
-            for col2 in range(SOTA_feature_column_size, SOTA_feature_column_size + new_feature_columns_size):
-                res.loc[col1 * new_feature_columns_size + col2 - SOTA_feature_column_size] = concat_feature.iloc[
-                    :, col1
-                ].corr(concat_feature.iloc[:, col2])
-        return res
-
-    def deduplicate_new_factors(self, SOTA_feature: pd.DataFrame, new_feature: pd.DataFrame) -> pd.DataFrame:
-        # calculate the IC between each column of SOTA_feature and new_feature
-        # if the IC is larger than a threshold, remove the new_feature column
-        # return the new_feature
-
-        concat_feature = pd.concat([SOTA_feature, new_feature], axis=1)
-        IC_max = (
-            concat_feature.groupby("datetime")
-            .apply(lambda x: self.calculate_information_coefficient(x, SOTA_feature.shape[1], new_feature.shape[1]))
-            .mean()
-        )
-        IC_max.index = pd.MultiIndex.from_product([range(SOTA_feature.shape[1]), range(new_feature.shape[1])])
-        IC_max = IC_max.unstack().max(axis=0)
-        return new_feature.iloc[:, IC_max[IC_max < 0.99].index]
-
     def develop(self, exp: QlibFactorExperiment) -> QlibFactorExperiment:
         """
         Generate the experiment by processing and combining factor data,
@@ -92,17 +66,13 @@ def develop(self, exp: QlibFactorExperiment) -> QlibFactorExperiment:
 
             # Combine the SOTA factor and new factors if SOTA factor exists
             if SOTA_factor is not None and not SOTA_factor.empty:
-                new_factors = self.deduplicate_new_factors(SOTA_factor, new_factors)
-                if new_factors.empty:
-                    raise FactorEmptyError("No valid factor data found to merge.")
                 combined_factors = pd.concat([SOTA_factor, new_factors], axis=1).dropna()
             else:
                 combined_factors = new_factors
 
             # Sort and nest the combined factors under 'feature'
-            # print(combined_factors)
             combined_factors = combined_factors.sort_index()
-            combined_factors = combined_factors.loc[:, ~combined_factors.columns.duplicated(keep='last')]
+            combined_factors = combined_factors.loc[:, ~combined_factors.columns.duplicated(keep="last")]
             new_columns = pd.MultiIndex.from_product([["feature"], combined_factors.columns])
             combined_factors.columns = new_columns