diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py index 19ac64c64..f09564225 100644 --- a/rdagent/app/qlib_rd_loop/conf.py +++ b/rdagent/app/qlib_rd_loop/conf.py @@ -35,7 +35,7 @@ class Config: # 2) sub task specific: origin_report_path: str = "data/report_origin" local_report_path: str = "data/report" - report_result_json_file_path: str = "git_ignore_folder/res_dict.csv" + report_result_json_file_path: str = "git_ignore_folder/report_list.json" progress_file_path: str = "git_ignore_folder/progress.pkl" report_extract_result: str = "git_ignore_folder/hypo_exp_cache.pkl" max_factor_per_report: int = 10000 diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py index 1b57ae7bb..a36d00c00 100644 --- a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py +++ b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py @@ -1,13 +1,9 @@ # TODO: we should have more advanced mechanism to handle such requirements for saving sessions. -import csv import json -import pickle from pathlib import Path -from typing import Any +from typing import Any, Tuple import fire -import pandas as pd -from dotenv import load_dotenv from jinja2 import Environment, StrictUndefined from rdagent.app.qlib_rd_loop.conf import FACTOR_PROP_SETTING @@ -16,36 +12,23 @@ load_and_process_pdfs_by_langchain, ) from rdagent.components.workflow.conf import BasePropSetting -from rdagent.components.workflow.rd_loop import RDLoop from rdagent.core.developer import Developer from rdagent.core.exception import FactorEmptyError from rdagent.core.prompts import Prompts -from rdagent.core.proposal import ( - Hypothesis, - Hypothesis2Experiment, - HypothesisExperiment2Feedback, - HypothesisGen, - Trace, -) +from rdagent.core.proposal import Hypothesis, HypothesisExperiment2Feedback, Trace from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.qlib.developer.factor_coder import QlibFactorCoSTEER from rdagent.scenarios.qlib.experiment.factor_experiment import ( QlibFactorExperiment, QlibFactorScenario, ) from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import ( FactorExperimentLoaderFromPDFfiles, - classify_report_from_dict, ) from rdagent.utils.workflow import LoopBase, LoopMeta -with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as input_file: - csv_reader = csv.reader(input_file) - judge_pdf_data = [row[0] for row in csv_reader] - prompts_path = Path(__file__).parent / "prompts.yaml" prompts = Prompts(file_path=prompts_path) @@ -78,7 +61,7 @@ def generate_hypothesis(factor_result: dict, report_content: str) -> str: ) -def extract_factors_and_implement(report_file_path: str) -> tuple: +def extract_hypothesis_and_exp_from_reports(report_file_path: str) -> Tuple[QlibFactorExperiment, Hypothesis]: scenario = QlibFactorScenario() with logger.tag("extract_factors_and_implement"): @@ -121,21 +104,18 @@ def __init__(self, PROP_SETTING: BasePropSetting): self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) self.trace = Trace(scen=scen) - self.judge_pdf_data_items = judge_pdf_data + self.judge_pdf_data_items = json.load(open(FACTOR_PROP_SETTING.report_result_json_file_path, "r")) self.pdf_file_index = 0 - self.hypo_exp_cache = ( - pickle.load(open(FACTOR_PROP_SETTING.report_extract_result, "rb")) - if Path(FACTOR_PROP_SETTING.report_extract_result).exists() - else {} - ) super().__init__() def propose_hypo_exp(self, prev_out: dict[str, Any]): with logger.tag("r"): while True: + if self.pdf_file_index > 100: + break report_file_path = self.judge_pdf_data_items[self.pdf_file_index] self.pdf_file_index += 1 - exp, hypothesis = extract_factors_and_implement(str(report_file_path)) + exp, hypothesis = extract_hypothesis_and_exp_from_reports(str(report_file_path)) if exp is None: continue exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in self.trace.hist if t[2]] diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py index 96867ac62..b682901ca 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py @@ -69,24 +69,20 @@ def evolve( if FACTOR_IMPLEMENT_SETTINGS.select_threshold < len(to_be_finished_task_index): # Select a fixed number of factors if the total exceeds the threshold - implementation_factors_per_round = FACTOR_IMPLEMENT_SETTINGS.select_threshold - else: - implementation_factors_per_round = len(to_be_finished_task_index) - - if FACTOR_IMPLEMENT_SETTINGS.select_method == "random": - to_be_finished_task_index = RandomSelect( - to_be_finished_task_index, - implementation_factors_per_round, - ) + if FACTOR_IMPLEMENT_SETTINGS.select_method == "random": + to_be_finished_task_index = RandomSelect( + to_be_finished_task_index, + FACTOR_IMPLEMENT_SETTINGS.select_threshold, + ) - if FACTOR_IMPLEMENT_SETTINGS.select_method == "scheduler": - to_be_finished_task_index = LLMSelect( - to_be_finished_task_index, - implementation_factors_per_round, - evo, - queried_knowledge.former_traces, - self.scen, - ) + if FACTOR_IMPLEMENT_SETTINGS.select_method == "scheduler": + to_be_finished_task_index = LLMSelect( + to_be_finished_task_index, + FACTOR_IMPLEMENT_SETTINGS.select_threshold, + evo, + queried_knowledge.former_traces, + self.scen, + ) result = multiprocessing_wrapper( [