Skip to content

Commit

Permalink
build factor source data (price and volumes) from qlib if no source d…
Browse files Browse the repository at this point in the history
…ata is provided by the user (#168)
  • Loading branch information
peteryang1 authored Aug 5, 2024
1 parent 6a5a750 commit 48c81ea
Show file tree
Hide file tree
Showing 11 changed files with 166 additions and 68 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ dmypy.json
# all pkl files
*.pkl

# all h5 files
*.h5

# all vs-code files
.vscode/

Expand Down
21 changes: 9 additions & 12 deletions rdagent/app/qlib_rd_loop/factor_from_report_w_sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,26 @@
import fire
from jinja2 import Environment, StrictUndefined

from rdagent.app.qlib_rd_loop.conf import FactorBasePropSetting
from rdagent.app.qlib_rd_loop.conf import (
FACTOR_FROM_REPORT_PROP_SETTING,
FactorBasePropSetting,
)
from rdagent.app.qlib_rd_loop.factor_w_sc import FactorRDLoop
from rdagent.components.document_reader.document_reader import (
extract_first_page_screenshot_from_pdf,
load_and_process_pdfs_by_langchain,
)
from rdagent.components.workflow.conf import BasePropSetting
from rdagent.components.workflow.rd_loop import RDLoop
from rdagent.core.developer import Developer
from rdagent.core.exception import FactorEmptyError
from rdagent.core.prompts import Prompts
from rdagent.core.proposal import Hypothesis, HypothesisExperiment2Feedback, Trace
from rdagent.core.scenario import Scenario
from rdagent.core.utils import import_class
from rdagent.core.proposal import Hypothesis
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorExperiment
from rdagent.scenarios.qlib.experiment.factor_from_report_experiment import (
QlibFactorFromReportScenario,
)
from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import (
FactorExperimentLoaderFromPDFfiles,
)
from rdagent.utils.workflow import LoopBase, LoopMeta
from rdagent.utils.workflow import LoopMeta

prompts_path = Path(__file__).parent / "prompts.yaml"
prompts = Prompts(file_path=prompts_path)
Expand Down Expand Up @@ -90,14 +87,14 @@ def extract_hypothesis_and_exp_from_reports(report_file_path: str) -> Tuple[Qlib
return exp, hypothesis


class FactorReportLoop(RDLoop, metaclass=LoopMeta):
class FactorReportLoop(FactorRDLoop, metaclass=LoopMeta):
skip_loop_error = (FactorEmptyError,)

def __init__(self, PROP_SETTING: FactorBasePropSetting):
super().__init__(PROP_SETTING=PROP_SETTING)
self.judge_pdf_data_items = json.load(open(PROP_SETTING.report_result_json_file_path, "r"))
self.pdf_file_index = 0
self.valid_pdf_file_count = 0
super().__init__(PROP_SETTING=PROP_SETTING)

def propose_hypo_exp(self, prev_out: dict[str, Any]):
with logger.tag("r"):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
RandomSelect,
)
from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace, FactorTask
from rdagent.components.coder.factor_coder.utils import get_data_folder_intro
from rdagent.core.conf import RD_AGENT_SETTINGS
from rdagent.core.evolving_framework import EvolvingStrategy, QueriedKnowledge
from rdagent.core.experiment import Workspace
Expand Down
1 change: 0 additions & 1 deletion rdagent/components/coder/factor_coder/CoSTEER/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from rdagent.components.coder.factor_coder.CoSTEER.evolvable_subjects import (
FactorEvolvingItem,
)
from rdagent.components.coder.factor_coder.utils import get_data_folder_intro
from rdagent.core.conf import RD_AGENT_SETTINGS
from rdagent.core.prompts import Prompts
from rdagent.core.scenario import Scenario
Expand Down
51 changes: 0 additions & 51 deletions rdagent/components/coder/factor_coder/utils.py

This file was deleted.

24 changes: 24 additions & 0 deletions rdagent/scenarios/qlib/experiment/factor_data_template/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# How to read files.
For example, if you want to read `filename.h5`
```Python
import pandas as pd
df = pd.read_hdf("filename.h5", key="data")
```
NOTE: **The key is always "data" for all HDF5 files.**

# Here is a short description about the data

| Filename | Description |
| -------------- | -----------------------------------------------------------------|
| "daily_pv.h5" | Adjusted daily price and volume data. |


# Basic background knowledge about each kind of data

## Daily price and volume data
$open: open price of the stock on that day.
$close: close price of the stock on that day.
$high: high price of the stock on that day.
$low: low price of the stock on that day.
$volume: volume of the stock on that day.
$factor: price adjustment factor of the stock on that day (adjusted price = original price * $factor).
27 changes: 27 additions & 0 deletions rdagent/scenarios/qlib/experiment/factor_data_template/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Dump daily price/volume data from Qlib into two HDF5 files.

Outputs (both written to the current working directory, key ``"data"``):

* ``daily_pv_all.h5``   - every instrument, rows from 2008-12-29 onwards.
* ``daily_pv_debug.h5`` - a small sample: 100 instruments, 2018-01-01 to 2019-12-31.
"""
import qlib

# NOTE: `qlib.init` is deliberately called before `D` is imported; keep this order.
qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")

from qlib.data import D

instruments = D.instruments()
fields = ["$open", "$close", "$high", "$low", "$volume", "$factor"]

# ---- full dataset --------------------------------------------------------
all_features = D.features(instruments, fields, freq="day")
# Swap the two index levels, sort, then keep everything from 2008-12-29 onwards.
full_data = all_features.swaplevel().sort_index().loc["2008-12-29":].sort_index()
full_data.to_hdf("./daily_pv_all.h5", key="data")

# ---- debug dataset -------------------------------------------------------
debug_features = D.features(
    instruments,
    fields,
    start_time="2018-01-01",
    end_time="2019-12-31",
    freq="day",
)
debug_sorted = debug_features.swaplevel().sort_index()

# The sample is the first 100 distinct instruments found in the full dataset.
sample_instruments = full_data.reset_index()["instrument"].unique()[:100]

# Swap levels back so the instrument level comes first for the `.loc` selection,
# then restore the swapped, sorted layout used for the full dataset.
debug_data = debug_sorted.swaplevel().loc[sample_instruments].swaplevel().sort_index()
debug_data.to_hdf("./daily_pv_debug.h5", key="data")
2 changes: 1 addition & 1 deletion rdagent/scenarios/qlib/experiment/factor_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
FactorFBWorkspace,
FactorTask,
)
from rdagent.components.coder.factor_coder.utils import get_data_folder_intro
from rdagent.core.prompts import Prompts
from rdagent.core.scenario import Scenario
from rdagent.scenarios.qlib.experiment.utils import get_data_folder_intro
from rdagent.scenarios.qlib.experiment.workspace import QlibFBWorkspace

prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
FactorFBWorkspace,
FactorTask,
)
from rdagent.components.coder.factor_coder.utils import get_data_folder_intro
from rdagent.core.prompts import Prompts
from rdagent.core.scenario import Scenario
from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario
Expand Down
98 changes: 98 additions & 0 deletions rdagent/scenarios/qlib/experiment/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import shutil
from pathlib import Path

import pandas as pd

# render it with jinja
from jinja2 import Environment, StrictUndefined

from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
from rdagent.utils.env import QTDockerEnv


def generate_data_folder_from_qlib():
    """Generate the factor source data with Qlib and install it into the data folders.

    Runs ``generate.py`` from the ``factor_data_template`` directory (next to
    this module) through a ``QTDockerEnv``. The generated HDF5 files and the
    template ``README.md`` are then copied into the configured folders:

    * ``daily_pv_all.h5``   -> ``FACTOR_IMPLEMENT_SETTINGS.data_folder / "daily_pv.h5"``
    * ``daily_pv_debug.h5`` -> ``FACTOR_IMPLEMENT_SETTINGS.data_folder_debug / "daily_pv.h5"``

    Target folders are created if they do not exist.

    Raises:
        FileNotFoundError: if either HDF5 file is missing after the generation
            run. (An explicit exception is raised instead of using ``assert``
            so the check is not stripped when Python runs with ``-O``.)
    """
    template_path = Path(__file__).parent / "factor_data_template"
    qtde = QTDockerEnv()
    qtde.prepare()

    # Run the data generation script inside the Qlib docker environment.
    qtde.run(
        local_path=str(template_path),
        entry="python generate.py",
    )

    # (generated file name, destination folder) pairs.
    targets = (
        ("daily_pv_all.h5", Path(FACTOR_IMPLEMENT_SETTINGS.data_folder)),
        ("daily_pv_debug.h5", Path(FACTOR_IMPLEMENT_SETTINGS.data_folder_debug)),
    )

    # Verify every output first so nothing is copied on a partial failure.
    for generated_name, _ in targets:
        if not (template_path / generated_name).exists():
            raise FileNotFoundError(f"{generated_name} is not generated.")

    for generated_name, target_folder in targets:
        target_folder.mkdir(parents=True, exist_ok=True)
        shutil.copy(template_path / generated_name, target_folder / "daily_pv.h5")
        shutil.copy(template_path / "README.md", target_folder / "README.md")


def get_data_folder_intro():
    """Build a textual description of the debug data folder for use in prompts.

    If either ``FACTOR_IMPLEMENT_SETTINGS.data_folder`` or
    ``FACTOR_IMPLEMENT_SETTINGS.data_folder_debug`` does not exist, the data is
    first generated via ``generate_data_folder_from_qlib()``.

    Every file in the debug folder is rendered into a small fenced section:

    * ``*.h5`` - the ``head()`` of the frame returned by ``pd.read_hdf``.
    * ``*.md`` - the raw file content.

    Files are visited in sorted order so the returned text is deterministic.

    Returns:
        str: the rendered sections joined by a "file splitter" line.

    Raises:
        NotImplementedError: if the folder contains a file that is neither
            ``.h5`` nor ``.md``.
    """
    data_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder)
    debug_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder_debug)
    if not data_folder.exists() or not debug_folder.exists():
        generate_data_folder_from_qlib()

    JJ_TPL = Environment(undefined=StrictUndefined).from_string(
        """
{{file_name}}
```{{type_desc}}
{{content}}
```
"""
    )
    content_l = []
    for p in sorted(debug_folder.iterdir()):
        if p.name.endswith(".h5"):
            df = pd.read_hdf(p)
            # Render the head at full width. The display options are scoped to
            # this block so the process-wide pandas settings are left untouched.
            with pd.option_context(
                "display.max_columns",
                None,
                "display.max_rows",
                None,
                "display.max_colwidth",
                None,
            ):
                content = df.head().to_string()
            type_desc = "generated by `pd.read_hdf(filename).head()`"
        elif p.name.endswith(".md"):
            with open(p) as f:
                content = f.read()
            type_desc = "markdown"
        else:
            raise NotImplementedError(
                f"file type {p.name} is not supported. Please implement its description function.",
            )
        content_l.append(
            JJ_TPL.render(
                file_name=p.name,
                type_desc=type_desc,
                content=content,
            )
        )
    return "\n ----------------- file splitter -------------\n".join(content_l)
5 changes: 4 additions & 1 deletion rdagent/utils/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
However, Python generator is not picklable (dill does not support pickle as well)
"""

import datetime
import pickle
from collections import defaultdict
Expand All @@ -27,7 +28,9 @@ def _get_steps(bases):
"""
steps = []
for base in bases:
steps.extend(LoopMeta._get_steps(base.__bases__) + getattr(base, "steps", []))
for step in LoopMeta._get_steps(base.__bases__) + getattr(base, "steps", []):
if step not in steps:
steps.append(step)
return steps

def __new__(cls, clsname, bases, attrs):
Expand Down

0 comments on commit 48c81ea

Please sign in to comment.