From 48c81ead959fbd94a63d023fd8c0ffe23ea325c6 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 5 Aug 2024 21:01:59 +0800 Subject: [PATCH] build factor source data (price and volumns) from qlib if no source data is provided by the user (#168) --- .gitignore | 3 + .../qlib_rd_loop/factor_from_report_w_sc.py | 21 ++-- .../factor_coder/CoSTEER/evolving_strategy.py | 1 - .../coder/factor_coder/CoSTEER/scheduler.py | 1 - .../components/coder/factor_coder/utils.py | 51 ---------- .../experiment/factor_data_template/README.md | 24 +++++ .../factor_data_template/generate.py | 27 +++++ .../qlib/experiment/factor_experiment.py | 2 +- .../factor_from_report_experiment.py | 1 - rdagent/scenarios/qlib/experiment/utils.py | 98 +++++++++++++++++++ rdagent/utils/workflow.py | 5 +- 11 files changed, 166 insertions(+), 68 deletions(-) delete mode 100644 rdagent/components/coder/factor_coder/utils.py create mode 100644 rdagent/scenarios/qlib/experiment/factor_data_template/README.md create mode 100644 rdagent/scenarios/qlib/experiment/factor_data_template/generate.py create mode 100644 rdagent/scenarios/qlib/experiment/utils.py diff --git a/.gitignore b/.gitignore index fd067266..400cf7d8 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,9 @@ dmypy.json # all pkl files *.pkl +# all h5 files +*.h5 + # all vs-code files .vscode/ diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_w_sc.py b/rdagent/app/qlib_rd_loop/factor_from_report_w_sc.py index 690e5161..c9c9c79b 100644 --- a/rdagent/app/qlib_rd_loop/factor_from_report_w_sc.py +++ b/rdagent/app/qlib_rd_loop/factor_from_report_w_sc.py @@ -6,29 +6,26 @@ import fire from jinja2 import Environment, StrictUndefined -from rdagent.app.qlib_rd_loop.conf import FactorBasePropSetting +from rdagent.app.qlib_rd_loop.conf import ( + FACTOR_FROM_REPORT_PROP_SETTING, + FactorBasePropSetting, +) +from rdagent.app.qlib_rd_loop.factor_w_sc import FactorRDLoop from rdagent.components.document_reader.document_reader import ( extract_first_page_screenshot_from_pdf, load_and_process_pdfs_by_langchain, ) -from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import RDLoop -from rdagent.core.developer import Developer from rdagent.core.exception import FactorEmptyError from rdagent.core.prompts import Prompts -from rdagent.core.proposal import Hypothesis, HypothesisExperiment2Feedback, Trace -from rdagent.core.scenario import Scenario -from rdagent.core.utils import import_class +from rdagent.core.proposal import Hypothesis from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorExperiment -from rdagent.scenarios.qlib.experiment.factor_from_report_experiment import ( - QlibFactorFromReportScenario, -) from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import ( FactorExperimentLoaderFromPDFfiles, ) -from rdagent.utils.workflow import LoopBase, LoopMeta +from rdagent.utils.workflow import LoopMeta prompts_path = Path(__file__).parent / "prompts.yaml" prompts = Prompts(file_path=prompts_path) @@ -90,14 +87,14 @@ def extract_hypothesis_and_exp_from_reports(report_file_path: str) -> Tuple[Qlib return exp, hypothesis -class FactorReportLoop(RDLoop, metaclass=LoopMeta): +class FactorReportLoop(FactorRDLoop, metaclass=LoopMeta): skip_loop_error = (FactorEmptyError,) def __init__(self, PROP_SETTING: FactorBasePropSetting): + super().__init__(PROP_SETTING=PROP_SETTING) self.judge_pdf_data_items = json.load(open(PROP_SETTING.report_result_json_file_path, "r")) self.pdf_file_index = 0 self.valid_pdf_file_count = 0 - super().__init__(PROP_SETTING=PROP_SETTING) def propose_hypo_exp(self, prev_out: dict[str, Any]): with logger.tag("r"): diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py index b682901c..2dbfa9c5 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py @@ -17,7 +17,6 @@ RandomSelect, ) from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace, FactorTask -from rdagent.components.coder.factor_coder.utils import get_data_folder_intro from rdagent.core.conf import RD_AGENT_SETTINGS from rdagent.core.evolving_framework import EvolvingStrategy, QueriedKnowledge from rdagent.core.experiment import Workspace diff --git a/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py b/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py index ed930666..4e31e566 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/scheduler.py @@ -7,7 +7,6 @@ from rdagent.components.coder.factor_coder.CoSTEER.evolvable_subjects import ( FactorEvolvingItem, ) -from rdagent.components.coder.factor_coder.utils import get_data_folder_intro from rdagent.core.conf import RD_AGENT_SETTINGS from rdagent.core.prompts import Prompts from rdagent.core.scenario import Scenario diff --git a/rdagent/components/coder/factor_coder/utils.py b/rdagent/components/coder/factor_coder/utils.py deleted file mode 100644 index a36d7bcd..00000000 --- a/rdagent/components/coder/factor_coder/utils.py +++ /dev/null @@ -1,51 +0,0 @@ -from pathlib import Path - -import pandas as pd - -# render it with jinja -from jinja2 import Environment, StrictUndefined - -from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS - -TPL = """ -{{file_name}} -```{{type_desc}} -{{content}} -```` -""" -# Create a Jinja template from the string -JJ_TPL = Environment(undefined=StrictUndefined).from_string(TPL) - - -def get_data_folder_intro(): - """Directly get the info of the data folder. - It is for preparing prompting message. - """ - content_l = [] - for p in Path(FACTOR_IMPLEMENT_SETTINGS.data_folder_debug).iterdir(): - if p.name.endswith(".h5"): - df = pd.read_hdf(p) - # get df.head() as string with full width - pd.set_option("display.max_columns", None) # or 1000 - pd.set_option("display.max_rows", None) # or 1000 - pd.set_option("display.max_colwidth", None) # or 199 - rendered = JJ_TPL.render( - file_name=p.name, - type_desc="generated by `pd.read_hdf(filename).head()`", - content=df.head().to_string(), - ) - content_l.append(rendered) - elif p.name.endswith(".md"): - with open(p) as f: - content = f.read() - rendered = JJ_TPL.render( - file_name=p.name, - type_desc="markdown", - content=content, - ) - content_l.append(rendered) - else: - raise NotImplementedError( - f"file type {p.name} is not supported. Please implement its description function.", - ) - return "\n ----------------- file splitter -------------\n".join(content_l) diff --git a/rdagent/scenarios/qlib/experiment/factor_data_template/README.md b/rdagent/scenarios/qlib/experiment/factor_data_template/README.md new file mode 100644 index 00000000..d03534de --- /dev/null +++ b/rdagent/scenarios/qlib/experiment/factor_data_template/README.md @@ -0,0 +1,24 @@ +# How to read files. +For example, if you want to read `filename.h5` +```Python +import pandas as pd +df = pd.read_hdf("filename.h5", key="data") +``` +NOTE: **key is always "data" for all hdf5 files **. + +# Here is a short description about the data + +| Filename | Description | +| -------------- | -----------------------------------------------------------------| +| "daily_pv.h5" | Adjusted daily price and volume data. | + + +# For different data, We have some basic knowledge for them + +## Daily price and volume data +$open: open price of the stock on that day. +$close: close price of the stock on that day. +$high: high price of the stock on that day. +$low: low price of the stock on that day. +$volume: volume of the stock on that day. +$factor: factor value of the stock on that day. \ No newline at end of file diff --git a/rdagent/scenarios/qlib/experiment/factor_data_template/generate.py b/rdagent/scenarios/qlib/experiment/factor_data_template/generate.py new file mode 100644 index 00000000..f05aee5b --- /dev/null +++ b/rdagent/scenarios/qlib/experiment/factor_data_template/generate.py @@ -0,0 +1,27 @@ +import qlib + +qlib.init(provider_uri="~/.qlib/qlib_data/cn_data") + +from qlib.data import D + +instruments = D.instruments() +fields = ["$open", "$close", "$high", "$low", "$volume", "$factor"] +data = D.features(instruments, fields, freq="day").swaplevel().sort_index().loc["2008-12-29":].sort_index() + +data.to_hdf("./daily_pv_all.h5", key="data") + + +fields = ["$open", "$close", "$high", "$low", "$volume", "$factor"] +data = ( + ( + D.features(instruments, fields, start_time="2018-01-01", end_time="2019-12-31", freq="day") + .swaplevel() + .sort_index() + ) + .swaplevel() + .loc[data.reset_index()["instrument"].unique()[:100]] + .swaplevel() + .sort_index() +) + +data.to_hdf("./daily_pv_debug.h5", key="data") diff --git a/rdagent/scenarios/qlib/experiment/factor_experiment.py b/rdagent/scenarios/qlib/experiment/factor_experiment.py index 4587ee9b..8cd7404f 100644 --- a/rdagent/scenarios/qlib/experiment/factor_experiment.py +++ b/rdagent/scenarios/qlib/experiment/factor_experiment.py @@ -5,9 +5,9 @@ FactorFBWorkspace, FactorTask, ) -from rdagent.components.coder.factor_coder.utils import get_data_folder_intro from rdagent.core.prompts import Prompts from rdagent.core.scenario import Scenario +from rdagent.scenarios.qlib.experiment.utils import get_data_folder_intro from rdagent.scenarios.qlib.experiment.workspace import QlibFBWorkspace prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml") diff --git a/rdagent/scenarios/qlib/experiment/factor_from_report_experiment.py b/rdagent/scenarios/qlib/experiment/factor_from_report_experiment.py index 7c63e845..df75c8ba 100644 --- a/rdagent/scenarios/qlib/experiment/factor_from_report_experiment.py +++ b/rdagent/scenarios/qlib/experiment/factor_from_report_experiment.py @@ -5,7 +5,6 @@ FactorFBWorkspace, FactorTask, ) -from rdagent.components.coder.factor_coder.utils import get_data_folder_intro from rdagent.core.prompts import Prompts from rdagent.core.scenario import Scenario from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario diff --git a/rdagent/scenarios/qlib/experiment/utils.py b/rdagent/scenarios/qlib/experiment/utils.py new file mode 100644 index 00000000..a39e590e --- /dev/null +++ b/rdagent/scenarios/qlib/experiment/utils.py @@ -0,0 +1,98 @@ +import shutil +from pathlib import Path + +import pandas as pd + +# render it with jinja +from jinja2 import Environment, StrictUndefined + +from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS +from rdagent.utils.env import QTDockerEnv + + +def generate_data_folder_from_qlib(): + template_path = Path(__file__).parent / "factor_data_template" + qtde = QTDockerEnv() + qtde.prepare() + + # Run the Qlib backtest + execute_log = qtde.run( + local_path=str(template_path), + entry=f"python generate.py", + ) + + assert ( + Path(__file__).parent / "factor_data_template" / "daily_pv_all.h5" + ).exists(), "daily_pv_all.h5 is not generated." + assert ( + Path(__file__).parent / "factor_data_template" / "daily_pv_debug.h5" + ).exists(), "daily_pv_debug.h5 is not generated." + + Path(FACTOR_IMPLEMENT_SETTINGS.data_folder).mkdir(parents=True, exist_ok=True) + shutil.copy( + Path(__file__).parent / "factor_data_template" / "daily_pv_all.h5", + Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / "daily_pv.h5", + ) + shutil.copy( + Path(__file__).parent / "factor_data_template" / "README.md", + Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / "README.md", + ) + + Path(FACTOR_IMPLEMENT_SETTINGS.data_folder_debug).mkdir(parents=True, exist_ok=True) + shutil.copy( + Path(__file__).parent / "factor_data_template" / "daily_pv_debug.h5", + Path(FACTOR_IMPLEMENT_SETTINGS.data_folder_debug) / "daily_pv.h5", + ) + shutil.copy( + Path(__file__).parent / "factor_data_template" / "README.md", + Path(FACTOR_IMPLEMENT_SETTINGS.data_folder_debug) / "README.md", + ) + + +def get_data_folder_intro(): + """Directly get the info of the data folder. + It is for preparing prompting message. + """ + + if ( + not Path(FACTOR_IMPLEMENT_SETTINGS.data_folder).exists() + or not Path(FACTOR_IMPLEMENT_SETTINGS.data_folder_debug).exists() + ): + generate_data_folder_from_qlib() + + JJ_TPL = Environment(undefined=StrictUndefined).from_string( + """ +{{file_name}} +```{{type_desc}} +{{content}} +``` +""" + ) + content_l = [] + for p in Path(FACTOR_IMPLEMENT_SETTINGS.data_folder_debug).iterdir(): + if p.name.endswith(".h5"): + df = pd.read_hdf(p) + # get df.head() as string with full width + pd.set_option("display.max_columns", None) # or 1000 + pd.set_option("display.max_rows", None) # or 1000 + pd.set_option("display.max_colwidth", None) # or 199 + rendered = JJ_TPL.render( + file_name=p.name, + type_desc="generated by `pd.read_hdf(filename).head()`", + content=df.head().to_string(), + ) + content_l.append(rendered) + elif p.name.endswith(".md"): + with open(p) as f: + content = f.read() + rendered = JJ_TPL.render( + file_name=p.name, + type_desc="markdown", + content=content, + ) + content_l.append(rendered) + else: + raise NotImplementedError( + f"file type {p.name} is not supported. Please implement its description function.", + ) + return "\n ----------------- file splitter -------------\n".join(content_l) diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index f4e568cd..9399ff00 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -7,6 +7,7 @@ However, Python generator is not picklable (dill does not support pickle as well) """ + import datetime import pickle from collections import defaultdict @@ -27,7 +28,9 @@ def _get_steps(bases): """ steps = [] for base in bases: - steps.extend(LoopMeta._get_steps(base.__bases__) + getattr(base, "steps", [])) + for step in LoopMeta._get_steps(base.__bases__) + getattr(base, "steps", []): + if step not in steps: + steps.append(step) return steps def __new__(cls, clsname, bases, attrs):