fix some bugs.

microsoft · Jul 29, 2024 · 787450c · 787450c
1 parent 9c64f14
commit 787450c
Show file tree

Hide file tree

Showing 6 changed files with 80 additions and 78 deletions.
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -7,7 +7,7 @@
 from dotenv import load_dotenv
 from jinja2 import Environment, StrictUndefined
 
-from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.app.qlib_rd_loop.conf import FACTOR_PROP_SETTING
 from rdagent.components.document_reader.document_reader import (
     extract_first_page_screenshot_from_pdf,
     load_and_process_pdfs_by_langchain,
@@ -37,33 +37,33 @@
 
 assert load_dotenv()
 
-scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+scen: Scenario = import_class(FACTOR_PROP_SETTING.scen)()
 
-hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.factor_hypothesis_gen)(scen)
+hypothesis_gen: HypothesisGen = import_class(FACTOR_PROP_SETTING.hypothesis_gen)(scen)
 
-hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.factor_hypothesis2experiment)()
+hypothesis2experiment: Hypothesis2Experiment = import_class(FACTOR_PROP_SETTING.hypothesis2experiment)()
 
-qlib_factor_coder: Developer = import_class(PROP_SETTING.factor_coder)(scen)
+qlib_factor_coder: Developer = import_class(FACTOR_PROP_SETTING.coder)(scen)
 
-qlib_factor_runner: Developer = import_class(PROP_SETTING.factor_runner)(scen)
+qlib_factor_runner: Developer = import_class(FACTOR_PROP_SETTING.runner)(scen)
 
-qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
+qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(FACTOR_PROP_SETTING.summarizer)(scen)
 
-with open(PROP_SETTING.report_result_json_file_path, "r") as f:
+with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as f:
     judge_pdf_data = json.load(f)
 
 prompts_path = Path(__file__).parent / "prompts.yaml"
 prompts = Prompts(file_path=prompts_path)
 
 
 def save_progress(trace, current_index):
-    with open(PROP_SETTING.progress_file_path, "wb") as f:
+    with open(FACTOR_PROP_SETTING.progress_file_path, "wb") as f:
         pickle.dump((trace, current_index), f)
 
 
 def load_progress():
-    if Path(PROP_SETTING.progress_file_path).exists():
-        with open(PROP_SETTING.progress_file_path, "rb") as f:
+    if Path(FACTOR_PROP_SETTING.progress_file_path).exists():
+        with open(FACTOR_PROP_SETTING.progress_file_path, "rb") as f:
             return pickle.load(f)
     return Trace(scen=scen), 0
 
@@ -87,8 +87,9 @@ def generate_hypothesis(factor_result: dict, report_content: str) -> str:
     response_json = json.loads(response)
     hypothesis_text = response_json.get("hypothesis", "No hypothesis generated.")
     reason_text = response_json.get("reason", "No reason provided.")
+    concise_reason_text = response_json.get("concise_reason", "No concise reason provided.")
 
-    return Hypothesis(hypothesis=hypothesis_text, reason=reason_text)
+    return Hypothesis(hypothesis=hypothesis_text, reason=reason_text, concise_reason=concise_reason_text)
 
 
 def extract_factors_and_implement(report_file_path: str) -> tuple:
@@ -124,48 +125,48 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
 
 trace, start_index = load_progress()
 
-try:
-    judge_pdf_data_items = list(judge_pdf_data.items())
-    for index in range(start_index, len(judge_pdf_data_items)):
-        if index > 1000:
-            break
-        file_path, attributes = judge_pdf_data_items[index]
-        if attributes["class"] == 1:
-            report_file_path = Path(file_path.replace(PROP_SETTING.origin_report_path, PROP_SETTING.local_report_path))
-            if report_file_path.exists():
-                logger.info(f"Processing {report_file_path}")
-
-                with logger.tag("r"):
-                    exp, hypothesis = extract_factors_and_implement(str(report_file_path))
-                    if exp is None:
-                        continue
-                    exp.based_experiments = [t[1] for t in trace.hist if t[2]]
-                    if len(exp.based_experiments) == 0:
-                        exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
-                    logger.log_object(hypothesis, tag="hypothesis generation")
-                    logger.log_object(exp.sub_tasks, tag="experiment generation")
-
-                with logger.tag("d"):
-                    exp = qlib_factor_coder.develop(exp)
-                    logger.log_object(exp.sub_workspace_list)
-
-                with logger.tag("ef"):
-                    exp = qlib_factor_runner.develop(exp)
-                    if exp is None:
-                        logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
-                        continue
-                    logger.log_object(exp, tag="factor runner result")
-                    feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
-                    logger.log_object(feedback, tag="feedback")
-
-                trace.hist.append((hypothesis, exp, feedback))
-                logger.info(f"Processed {report_file_path}: Result: {exp}")
-
-                # Save progress after processing each report
-                save_progress(trace, index + 1)
-            else:
-                logger.error(f"File not found: {report_file_path}")
-except Exception as e:
-    logger.error(f"An error occurred: {e}")
-    save_progress(trace, index)
-    raise
+# try:
+judge_pdf_data_items = list(judge_pdf_data.items())
+for index in range(start_index, len(judge_pdf_data_items)):
+    if index > 1000:
+        break
+    file_path, attributes = judge_pdf_data_items[index]
+    if attributes["class"] == 1:
+        report_file_path = Path(file_path.replace(FACTOR_PROP_SETTING.origin_report_path, FACTOR_PROP_SETTING.local_report_path))
+        if report_file_path.exists():
+            logger.info(f"Processing {report_file_path}")
+
+            with logger.tag("r"):
+                exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+                if exp is None:
+                    continue
+                exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+                if len(exp.based_experiments) == 0:
+                    exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+                logger.log_object(hypothesis, tag="hypothesis generation")
+                logger.log_object(exp.sub_tasks, tag="experiment generation")
+
+            with logger.tag("d"):
+                exp = qlib_factor_coder.develop(exp)
+                logger.log_object(exp.sub_workspace_list)
+
+            with logger.tag("ef"):
+                exp = qlib_factor_runner.develop(exp)
+                if exp is None:
+                    logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+                    continue
+                logger.log_object(exp, tag="factor runner result")
+                feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
+                logger.log_object(feedback, tag="feedback")
+
+            trace.hist.append((hypothesis, exp, feedback))
+            logger.info(f"Processed {report_file_path}: Result: {exp}")
+
+            # Save progress after processing each report
+            save_progress(trace, index + 1)
+        else:
+            logger.error(f"File not found: {report_file_path}")
+# except Exception as e:
+#     logger.error(f"An error occurred: {e}")
+#     save_progress(trace, index)
+#     raise
diff --git a/rdagent/app/qlib_rd_loop/factor_w_sc.py b/rdagent/app/qlib_rd_loop/factor_w_sc.py
@@ -2,15 +2,27 @@
 Factor workflow with session control
 """
 
+from typing import Any
+
 import fire
 
 from rdagent.app.qlib_rd_loop.conf import FACTOR_PROP_SETTING
 from rdagent.components.workflow.rd_loop import RDLoop
 from rdagent.core.exception import FactorEmptyError
+from rdagent.log import rdagent_logger as logger
 
 
 class FactorRDLoop(RDLoop):
     skip_loop_error = (FactorEmptyError,)
+
+    def running(self, prev_out: dict[str, Any]):
+        with logger.tag("ef"):  # evaluate and feedback
+            exp = self.runner.develop(prev_out["coding"])
+            if exp is None:
+                logger.error(f"Factor extraction failed.")
+                raise FactorEmptyError("Factor extraction failed.")
+            logger.log_object(exp, tag="runner result")
+        return exp
 
 
 def main(path=None, step_n=None):
@@ -30,4 +42,4 @@ def main(path=None, step_n=None):
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
+    fire.Fire(main)
diff --git a/rdagent/app/qlib_rd_loop/prompts.yaml b/rdagent/app/qlib_rd_loop/prompts.yaml
@@ -4,7 +4,8 @@ hypothesis_generation:
     Please ensure your response is in JSON format as shown below:
     {
       "hypothesis": "A clear and concise hypothesis based on the provided information.",
-      "reason": "A detailed explanation supporting the generated hypothesis."
+      "reason": "A detailed explanation supporting the generated hypothesis.",
+      "concise_reason": "A one-line summary that focuses on the justification for the change that leads to the hypothesis (like a piece of the knowledge that we are building)."
     }
 
   user: |-

diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py
@@ -19,14 +19,14 @@
 from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.developer import Developer
-from rdagent.core.exception import CoderException, RunnerException
+from rdagent.core.exception import CoderError
 from rdagent.core.experiment import Task, Workspace
 from rdagent.core.scenario import Scenario
 from rdagent.core.utils import multiprocessing_wrapper
 
 EVAL_RES = Dict[
     str,
-    List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
+    List[Tuple[FactorEvaluator, Union[object, CoderError]]],
 ]
 
 

diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
@@ -43,21 +43,6 @@ model_hypothesis_specification: |-
 
     6th Round Hypothesis (If fourth round didn't work):  The model should be a CNN. The CNN should have 5 convolutional layers. Use Leaky ReLU activation for all layers. Use dropout regularization with a rate of 0.3. (Reasoning: As regularisation rate of 0.5 didn't work, we only change a new regularisation and keep the other elements that worked. This means making changes in the current level.)    
 
-factor_hypothesis_specification: |-
-  Additional Specifications:
-    Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple. Gradually build up upon previous hypotheses and feedback.
-    Ensure that the hypothesis focuses on the creation and selection of factors in quantitative finance. Each hypothesis should address specific factor characteristics such as type (momentum, value, quality), calculation methods, or inclusion criteria. Avoid hypotheses related to model architecture or optimization processes.
-
-  Sample Hypotheses (Only learn from the format as these are not the knowledge):
-    - "Include a momentum factor based on the last 12 months' returns."
-    - "Add a value factor calculated as the book-to-market ratio."
-    - "Incorporate a quality factor derived from return on equity (ROE)."
-    - "Use a volatility factor based on the standard deviation of returns over the past 6 months."
-    - "Include a sentiment factor derived from news sentiment scores."
-    - "The momentum factor should be calculated using a 6-month look-back period."
-    - "Combine value and momentum factors using a weighted average approach."
-    - "Filter stocks by market capitalization before calculating the factors."
-
 factor_hypothesis_specification: |-
   Specifications:
     - Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple.

diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py
@@ -35,7 +35,10 @@ def __new__(cls, clsname, bases, attrs):
         steps = LoopMeta._get_steps(bases)  # all the base classes of parents
         for name, attr in attrs.items():
             if not name.startswith("__") and isinstance(attr, Callable):
-                steps.append(name)
+                if name not in steps:
+                    # NOTE: a step that a subclass overrides is already registered by a base class,
+                    # so it is not a new step; skip it to keep its original position and avoid duplicates.
+                    steps.append(name)
         attrs["steps"] = steps
         return super().__new__(cls, clsname, bases, attrs)
 
@@ -125,4 +128,4 @@ def load(cls, path: str | Path):
 
         max_loop = max(session.loop_trace.keys())
         logger.storage.truncate(time=session.loop_trace[max_loop][-1].end)
-        return session
+        return session