Skip to content

Commit

Permalink
Benchmark (#114)
Browse files Browse the repository at this point in the history
* Init todo

* Evaluation & dataset

* Generate new data

* dataset generation

* add the result

* Analysis

* Factor update

* Updates

* Reformat analysis.py

* CI fix

---------

Co-authored-by: Young <[email protected]>
Co-authored-by: you-n-g <[email protected]>
  • Loading branch information
3 people authored Jul 25, 2024
1 parent df68078 commit 3cba84d
Show file tree
Hide file tree
Showing 6 changed files with 269 additions and 21 deletions.
184 changes: 184 additions & 0 deletions rdagent/app/quant_factor_benchmark/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import json
import pickle
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from rdagent.components.benchmark.eval_method import FactorImplementEval
from rdagent.components.benchmark.conf import BenchmarkSettings

class BenchmarkAnalyzer:
    """Turn pickled factor-benchmark results into per-factor summary metrics.

    The analyzer reads the benchmark definition JSON (``settings.bench_data_path``)
    to learn each factor's category and difficulty, then summarizes the raw
    evaluation results of one or more experiments.
    """

    # Sort rank for difficulty labels; labels not listed here are left as-is.
    _DIFFICULTY_ORDER = {"Easy": 0, "Medium": 1, "Hard": 2, "New Discovery": 3}

    # Display order of the metric columns; unknown columns are left as-is.
    _METRIC_ORDER = {
        "avg. Run successful rate": 0,
        "avg. Format successful rate": 1,
        "avg. Correlation (value only)": 2,
        "max. Correlation": 3,
        "max. accuracy": 4,
        "avg. accuracy": 5,
    }

    def __init__(self, settings):
        """Store ``settings`` and eagerly load the factor index map.

        Args:
            settings: object exposing ``bench_data_path`` (path to the
                benchmark JSON file).
        """
        self.settings = settings
        self.index_map = self.load_index_map()

    def load_index_map(self):
        """Return ``{factor_name: (factor_name, Category, Difficulty)}``.

        Raises:
            KeyError: if a factor entry lacks ``"Category"`` or ``"Difficulty"``.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(self.settings.bench_data_path, "r", encoding="utf-8") as file:
            factor_dict = json.load(file)
        return {
            factor_name: (factor_name, data["Category"], data["Difficulty"])
            for factor_name, data in factor_dict.items()
        }

    def load_data(self, file_path):
        """Unpickle and return the object stored in ``file_path``.

        Args:
            file_path: path (``str`` or ``Path``) of an existing ``.pkl`` file.

        Raises:
            ValueError: if the path is not an existing file with a ``.pkl`` suffix.
        """
        file_path = Path(file_path)
        if not (file_path.is_file() and file_path.suffix == ".pkl"):
            raise ValueError("Invalid file path")

        # SECURITY: pickle executes arbitrary code while loading. Only point
        # this at result files produced by a trusted benchmark run.
        with file_path.open("rb") as f:
            return pickle.load(f)

    def process_results(self, results):
        """Summarize each experiment and keep only its final summary row.

        Args:
            results: mapping of experiment label to the path of its pickled
                evaluation result.

        Returns:
            dict mapping each experiment label to the last row of the frame
            returned by ``analyze_data`` (the appended "Average" row).
        """
        final_res = {}
        for experiment, path in results.items():
            data = self.load_data(path)
            summarized_data = FactorImplementEval.summarize_res(data)
            processed_data = self.analyze_data(summarized_data)
            final_res[experiment] = processed_data.iloc[-1, :]
        return final_res

    def reformat_succ_rate(self, display_df):
        """Re-index a per-factor frame by (Category, Difficulty, Factor).

        Rows whose index is not a known factor name are dropped. The result is
        sorted with difficulties ranked Easy < Medium < Hard < New Discovery.
        """
        display_df = display_df[display_df.index.isin(self.index_map.keys())]
        display_df.index = pd.MultiIndex.from_tuples(
            [self.index_map[idx] for idx in display_df.index],
            names=["Factor", "Category", "Difficulty"],
        )
        # (Factor, Category, Difficulty) -> (Category, Difficulty, Factor)
        display_df = display_df.swaplevel(0, 2).swaplevel(0, 1).sort_index(axis=0)

        # The key is applied to every index level; values that are not
        # difficulty labels map to themselves and keep their natural order.
        return display_df.sort_index(
            key=lambda x: [self._DIFFICULTY_ORDER.get(i, i) for i in x],
        )

    def result_all_key_order(self, x):
        """Return the sort key for each metric column name in ``x``.

        Known metric names map to their display rank; any other name is
        returned unchanged.
        """
        return [self._METRIC_ORDER.get(i, i) for i in x]

    def analyze_data(self, sum_df):
        """Compute per-factor metrics from a summarized result frame.

        Args:
            sum_df: frame as produced by ``FactorImplementEval.summarize_res``
                (rows are evaluator names plus ``"run factor error"``).

        Returns:
            Float DataFrame indexed by (Category, Difficulty, Factor) with one
            column per metric, plus a trailing ``("-", "-", "Average")`` row
            holding the column means (missing values counted as 0).

        Side effects:
            Prints the per-factor metric table to stdout.
        """
        index = [
            "FactorSingleColumnEvaluator",
            "FactorOutputFormatEvaluator",
            "FactorRowCountEvaluator",
            "FactorIndexEvaluator",
            "FactorMissingValuesEvaluator",
            "FactorEqualValueCountEvaluator",
            "FactorCorrelationEvaluator",
            "run factor error",
        ]
        sum_df = sum_df.reindex(index, axis=0)
        # One row per (factor, run number): the per-run unique key is replaced
        # by a positional counter within each factor.
        sum_df_clean = sum_df.T.groupby(level=0).apply(lambda x: x.reset_index(drop=True))

        # A run succeeded when no "run factor error" was recorded for it.
        run_error = sum_df_clean["run factor error"].unstack().T.fillna(False).astype(bool)
        succ_rate = ~run_error
        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
        succ_rate_f = self.reformat_succ_rate(succ_rate)

        # Despite the name, a truthy value here means the output format is OK
        # (both the row-count and the index evaluator passed).
        format_issue = sum_df_clean["FactorRowCountEvaluator"] & sum_df_clean["FactorIndexEvaluator"]
        eval_series = format_issue.unstack()
        succ_rate = eval_series.T.fillna(False).astype(bool)  # false indicate failure
        format_succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
        format_succ_rate_f = self.reformat_succ_rate(format_succ_rate)

        # Correlation only counts for runs whose format check passed.
        masked_corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
        corr = masked_corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
        corr_res = self.reformat_succ_rate(corr)
        corr_max = masked_corr.unstack().T.max(axis=0).to_frame("corr(only success)")
        corr_max_res = self.reformat_succ_rate(corr_max)

        # NOTE(review): the "accuracy" metrics are derived from
        # FactorMissingValuesEvaluator; FactorEqualValueCountEvaluator may have
        # been intended - confirm before relying on these two columns.
        masked_value = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
        value_max = masked_value.unstack().T.max(axis=0).to_frame("max_value")
        value_max_res = self.reformat_succ_rate(value_max)
        value_avg = masked_value.unstack().T.mean(axis=0).to_frame("avg_value")
        value_avg_res = self.reformat_succ_rate(value_avg)

        result_all = pd.concat(
            {
                "avg. Correlation (value only)": corr_res.iloc[:, 0],
                "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
                "avg. Run successful rate": succ_rate_f.iloc[:, 0],
                "max. Correlation": corr_max_res.iloc[:, 0],
                "max. accuracy": value_max_res.iloc[:, 0],
                "avg. accuracy": value_avg_res.iloc[:, 0],
            },
            axis=1,
        )

        df = result_all.sort_index(axis=1, key=self.result_all_key_order)
        print(df)

        # Calculate the mean of each column (missing values count as 0)
        mean_values = df.fillna(0.0).mean()
        mean_df = pd.DataFrame(mean_values).T

        # Assign the MultiIndex to the DataFrame
        mean_df.index = pd.MultiIndex.from_tuples(
            [("-", "-", "Average")],
            names=["Factor", "Category", "Difficulty"],
        )

        # Append the mean values to the end of the dataframe
        df_w_mean = pd.concat([df, mean_df]).astype("float")

        return df_w_mean



class Plotter:
    """Static helpers for drawing the benchmark comparison bar chart."""

    @staticmethod
    def change_fs(font_size):
        """Set every matplotlib text-size rc option used here to ``font_size``."""
        rc_options = (
            ("font", "size"),
            ("axes", "titlesize"),
            ("axes", "labelsize"),
            ("xtick", "labelsize"),
            ("ytick", "labelsize"),
            ("legend", "fontsize"),
            ("figure", "titlesize"),
        )
        for group, option in rc_options:
            plt.rc(group, **{option: font_size})

    @staticmethod
    def plot_data(data, file_name):
        """Draw a grouped bar chart of ``data`` and save it to ``file_name``.

        ``data`` must provide the columns ``"index"`` (x axis), ``"b"`` (bar
        height) and ``"a"`` (hue / series).
        """
        plt.figure(figsize=(10, 6))
        # seaborn draws on the current axes of the figure created above
        axes = sns.barplot(data=data, x="index", y="b", hue="a")
        axes.set_xlabel("Method")
        axes.set_ylabel("Value")
        axes.set_title("Comparison of Different Methods")
        plt.savefig(file_name)

if __name__ == "__main__":
    # Experiment label -> pickled evaluation result to analyze.
    experiment_files = {
        "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
    }

    analyzer = BenchmarkAnalyzer(BenchmarkSettings())
    # One column per experiment, one row per metric.
    summary_df = pd.DataFrame(analyzer.process_results(experiment_files))

    Plotter.change_fs(20)

    # The accuracy rows are not plotted; transpose so experiments become rows,
    # then melt into the long (index, a, b) layout expected by Plotter.plot_data.
    metrics_wide = summary_df.drop(["max. accuracy", "avg. accuracy"], axis=0).T
    metrics_long = metrics_wide.reset_index().melt("index", var_name="a", value_name="b")
    Plotter.plot_data(metrics_long, "rdagent/app/quant_factor_benchmark/comparison_plot.png")
24 changes: 20 additions & 4 deletions rdagent/app/quant_factor_benchmark/eval.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
from rdagent.scenarios.qlib.factor_task_loader.json_loader import (
import os
from pathlib import Path
import pickle
import time
from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
FactorTestCaseLoaderFromJsonFile,
)

from rdagent.components.benchmark.conf import BenchmarkSettings
from rdagent.components.benchmark.eval_method import FactorImplementEval
from rdagent.core.utils import import_class

from rdagent.core.utils import import_class
from rdagent.core.scenario import Scenario
from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario

from pprint import pprint

# 1.read the settings
bs = BenchmarkSettings()

Expand All @@ -14,16 +26,20 @@

# 3.declare the method to be tested and pass the arguments.

method_cls = import_class(bs.bench_method_cls)
generate_method = method_cls()

scen: Scenario = import_class(PROP_SETTING.factor_scen)()
generate_method = import_class(bs.bench_method_cls)(scen=scen)
# 4.declare the eval method and pass the arguments.
eval_method = FactorImplementEval(
method=generate_method,
test_cases=test_cases,
scen=scen,
catch_eval_except=True,
test_round=bs.bench_test_round,
)

# 5.run the eval
res = eval_method.eval()

# 6.save the result
logger.log_object(res)
5 changes: 4 additions & 1 deletion rdagent/components/benchmark/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,17 @@


class BenchmarkSettings(BaseSettings):
class Config:
env_prefix = "BENCHMARK_" # Use BENCHMARK_ as prefix for environment variables

ground_truth_dir: Path = DIRNAME / "ground_truth"

bench_data_path: Path = DIRNAME / "example.json"

bench_test_round: int = 10
bench_test_case_n: Optional[int] = None # how many test cases to run; If not given, all test cases will be run

bench_method_cls: str = "rdagent.factor_implementation.CoSTEER.CoSTEERFG"
bench_method_cls: str = "rdagent.components.coder.factor_coder.CoSTEER.FactorCoSTEER"
bench_method_extra_kwargs: dict = field(
default_factory=dict,
) # extra kwargs for the method to be tested except the task list
Expand Down
53 changes: 44 additions & 9 deletions rdagent/components/benchmark/eval_method.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from collections import defaultdict
from pathlib import Path
from typing import List, Tuple, Union
from typing import Dict, List, Tuple, Union

import pandas as pd
from tqdm import tqdm

from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
Expand All @@ -18,11 +19,18 @@
from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace
from rdagent.core.conf import RD_AGENT_SETTINGS
from rdagent.core.developer import Developer
from rdagent.core.exception import CoderError

from rdagent.core.exception import CoderException, RunnerException
from rdagent.core.experiment import Task, Workspace
from rdagent.core.scenario import Scenario
from rdagent.core.utils import multiprocessing_wrapper


# Type of the raw evaluation result consumed by `summarize_res`:
# factor name -> list of 2-tuples, one per run.
# NOTE(review): `eval()` appends `(gen_factor, eval_res)` and `summarize_res`
# treats the second element as either an Exception or an iterable of
# (evaluator, result-or-exception) pairs, so the element types declared below
# look narrower than / different from what is actually stored - confirm.
EVAL_RES = Dict[
str,
List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
]

class TestCase:
def __init__(
self,
Expand Down Expand Up @@ -114,17 +122,18 @@ def __init__(
test_cases: TestCase,
method: Developer,
*args,
scen: Scenario,
test_round: int = 10,
**kwargs,
):
online_evaluator_l = [
FactorSingleColumnEvaluator(),
FactorOutputFormatEvaluator(),
FactorRowCountEvaluator(),
FactorIndexEvaluator(),
FactorMissingValuesEvaluator(),
FactorEqualValueCountEvaluator(),
FactorCorrelationEvaluator(hard_check=False),
FactorSingleColumnEvaluator(scen),
FactorOutputFormatEvaluator(scen),
FactorRowCountEvaluator(scen),
FactorIndexEvaluator(scen),
FactorMissingValuesEvaluator(scen),
FactorEqualValueCountEvaluator(scen),
FactorCorrelationEvaluator(hard_check=False, scen=scen),
]
super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs)
self.test_round = test_round
Expand Down Expand Up @@ -163,3 +172,29 @@ def eval(self):
res[gt_case.target_task.factor_name].append((gen_factor, eval_res))

return res

@staticmethod
def summarize_res(res: EVAL_RES) -> pd.DataFrame:
    """Flatten raw evaluation results into a DataFrame with one column per run.

    Columns are keyed by ``(factor_name, unique_run_key)``; rows are
    ``"run factor error"`` plus ``str(evaluator)`` for every evaluator.
    A ``None`` cell means nothing was obtained for that entry: either the
    evaluator raised, or (for ``"run factor error"``) the run did not fail.
    """
    columns = {}
    for factor_name, runs in res.items():
        for fi, outcome in runs:
            # NOTE: str(fi) may not be unique!! Because the workspace can be skipped when hitting the cache.
            column_key = (factor_name, f"{fi!s},{id(fi)}")

            if isinstance(outcome, Exception):
                # The whole run failed: record only the exception class.
                columns[column_key] = {"run factor error": str(outcome.__class__)}
                continue

            record = {"run factor error": None}
            for ev_obj, ev_outcome in outcome:
                if isinstance(ev_outcome, Exception):
                    record[str(ev_obj)] = None
                    continue
                # Successful evaluators return (feedback, metric); keep the metric.
                _, metric = ev_outcome
                record[str(ev_obj)] = metric
            columns[column_key] = record

    return pd.DataFrame(columns)

12 changes: 9 additions & 3 deletions rdagent/components/benchmark/example.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
"20-day turnover rate": "Average turnover rate over the past 20 days.",
"Market Capitalization": "Total market value of a company's outstanding shares."
},
"gt_code": "import pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\nwindow_size = 20\n\nnominator=data.groupby('instrument')[['30\u65e5\u6362\u624b\u7387']].rolling(window=window_size).mean().reset_index(0, drop=True)\n# transfer to series\nnew=nominator['30\u65e5\u6362\u624b\u7387']\ndata['Turnover_Rate_Factor']=new/data['\u6d41\u901aA\u80a1']\n\n# # set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['Turnover_Rate_Factor']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['Turnover_Rate_Factor']\nresult.to_hdf(\"result.h5\", key=\"data\")\n"
"Category": "Fundamentals",
"Difficulty": "Easy",
"gt_code": "import pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\nwindow_size = 20\n\nnominator=data.groupby('instrument')[['TurnoverRate_30D']].rolling(window=window_size).mean().reset_index(0, drop=True)\n# transfer to series\nnew=nominator['TurnoverRate_30D']\ndata['Turnover_Rate_Factor']=new/data['TradableACapital']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['Turnover_Rate_Factor']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['Turnover_Rate_Factor']\nresult.to_hdf(\"result.h5\", key=\"data\")"
},
"PctTurn20": {
"description": "A factor representing the percentage change in turnover rate over the past 20 trading days, market-value neutralized.",
Expand All @@ -16,7 +18,9 @@
"Turnover_{i, t}": "Turnover of stock i at day t.",
"Turnover_{i, t-20}": "Turnover of stock i at day t-20."
},
"gt_code": "import pandas as pd\nfrom statsmodels import api as sm\n\n\ndef fill_mean(s: pd.Series) -> pd.Series:\n return s.fillna(s.mean()).fillna(0.0)\n\n\ndef market_value_neutralize(s: pd.Series, mv: pd.Series) -> pd.Series:\n s = s.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n mv = mv.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n\n df_f = mv.to_frame(\"\u5e02\u503c\")\n df_f[\"const\"] = 1\n X = df_f[[\"\u5e02\u503c\", \"const\"]]\n\n # Perform the Ordinary Least Squares (OLS) regression\n model = sm.OLS(s, X)\n results = model.fit()\n\n # Calculate the residuals\n df_f[\"residual\"] = results.resid\n df_f[\"norm_resi\"] = df_f.groupby(level=\"datetime\", group_keys=False)[\"residual\"].apply(\n lambda x: (x - x.mean()) / x.std(),\n )\n return df_f[\"norm_resi\"]\n\n\n# get_turnover\ndf_pv = pd.read_hdf(\"daily_pv.h5\", key=\"data\")\ndf_f = pd.read_hdf(\"daily_f.h5\", key=\"data\")\nturnover = df_pv[\"$money\"] / df_f[\"\u6d41\u901a\u5e02\u503c\"]\n\nf = turnover.groupby(\"instrument\").pct_change(periods=20)\n\nf_neutralized = market_value_neutralize(f, df_f[\"\u6d41\u901a\u5e02\u503c\"])\n\nf_neutralized.to_hdf(\"result.h5\", key=\"data\")\n"
"Category": "Volume&Price",
"Difficulty": "Medium",
"gt_code": "import pandas as pd\nfrom statsmodels import api as sm\n\ndef fill_mean(s: pd.Series) -> pd.Series:\n return s.fillna(s.mean()).fillna(0.0)\n\ndef market_value_neutralize(s: pd.Series, mv: pd.Series) -> pd.Series:\n s = s.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n mv = mv.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n\n df_f = mv.to_frame(\"MarketValue\")\n df_f[\"const\"] = 1\n X = df_f[[\"MarketValue\", \"const\"]]\n\n # Perform the Ordinary Least Squares (OLS) regression\n model = sm.OLS(s, X)\n results = model.fit()\n\n # Calculate the residuals\n df_f[\"residual\"] = results.resid\n df_f[\"norm_resi\"] = df_f.groupby(level=\"datetime\", group_keys=False)[\"residual\"].apply(\n lambda x: (x - x.mean()) / x.std(),\n )\n return df_f[\"norm_resi\"]\n\n\n# get_turnover\ndf_pv = pd.read_hdf(\"daily_pv.h5\", key=\"data\")\ndf_f = pd.read_hdf(\"daily_f.h5\", key=\"data\")\nturnover = df_pv[\"$money\"] / df_f[\"TradableMarketValue\"]\n\nf = turnover.groupby(\"instrument\").pct_change(periods=20)\n\nf_neutralized = market_value_neutralize(f, df_f[\"TradableMarketValue\"])\n\nf_neutralized.to_hdf(\"result.h5\", key=\"data\")"
},
"PB_ROE": {
"description": "Constructed using the ranking difference between PB and ROE, with PB and ROE replacing original PB and ROE to obtain reconstructed factor values.",
Expand All @@ -25,6 +29,8 @@
"\\text{rank}(PB_t)": "Ranking PB on cross-section at time t.",
"\\text{rank}(ROE_t)": "Ranking single-quarter ROE on cross-section at time t."
},
"gt_code": "#!/usr/bin/env python\n\nimport pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\n\n# Calculate the rank of PB and ROE\ndata['PB_rank'] = data.groupby('datetime')['B/P'].rank()\ndata['ROE_rank'] = data.groupby('datetime')['ROE'].rank()\n\n# Calculate the difference between the ranks\ndata['PB_ROE'] = data['PB_rank'] - data['ROE_rank']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['PB_ROE']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['PB_ROE']\nresult.to_hdf(\"result.h5\", key=\"data\")\n"
"Category": "High-Frequency",
"Difficulty": "Hard",
"gt_code": "#!/usr/bin/env python\n\nimport pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\n\n# Calculate the rank of PB and ROE\ndata['PB_rank'] = data.groupby('datetime')['B/P'].rank()\ndata['ROE_rank'] = data.groupby('datetime')['ROE'].rank()\n\n# Calculate the difference between the ranks\ndata['PB_ROE'] = data['PB_rank'] - data['ROE_rank']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['PB_ROE']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['PB_ROE']\nresult.to_hdf(\"result.h5\", key=\"data\")"
}
}
Loading

0 comments on commit 3cba84d

Please sign in to comment.