-
-
Notifications
You must be signed in to change notification settings - Fork 127
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Init todo * Evaluation & dataset * Generate new data * dataset generation * add the result * Analysis * Factor update * Updates * Reformat analysis.py * CI fix --------- Co-authored-by: Young <[email protected]> Co-authored-by: you-n-g <[email protected]>
- Loading branch information
1 parent
df68078
commit 3cba84d
Showing
6 changed files
with
269 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
import json | ||
import pickle | ||
import pandas as pd | ||
from pathlib import Path | ||
import matplotlib.pyplot as plt | ||
import seaborn as sns | ||
|
||
from rdagent.components.benchmark.eval_method import FactorImplementEval | ||
from rdagent.components.benchmark.conf import BenchmarkSettings | ||
|
||
class BenchmarkAnalyzer:
    """Aggregate pickled factor-benchmark evaluation results into summary tables.

    Reads the benchmark metadata JSON (``settings.bench_data_path``) to recover
    each factor's Category and Difficulty, then reshapes per-run evaluation
    results into success-rate / correlation tables indexed by
    ``(Category, Difficulty, Factor)``.
    """

    def __init__(self, settings):
        """Store *settings* (a ``BenchmarkSettings``-like object exposing
        ``bench_data_path``) and preload the factor index map."""
        self.settings = settings
        self.index_map = self.load_index_map()

    def load_index_map(self):
        """Return ``{factor_name: (factor_name, category, difficulty)}`` read
        from the benchmark metadata JSON."""
        with open(self.settings.bench_data_path, "r") as file:
            factor_dict = json.load(file)
        return {
            name: (name, data["Category"], data["Difficulty"])
            for name, data in factor_dict.items()
        }

    def load_data(self, file_path):
        """Unpickle and return one evaluation-result file.

        Raises:
            ValueError: if *file_path* is not an existing ``.pkl`` file.
        """
        file_path = Path(file_path)
        if not (file_path.is_file() and file_path.suffix == ".pkl"):
            raise ValueError("Invalid file path")

        # NOTE(review): pickle.load executes code embedded in the file —
        # only feed trusted, locally produced result files here.
        with file_path.open("rb") as f:
            return pickle.load(f)

    def process_results(self, results):
        """Summarize each ``{experiment_name: pickle_path}`` entry.

        Returns a dict mapping experiment name to the last row of its summary
        table (the overall "Average" row appended by :meth:`analyze_data`).
        """
        final_res = {}
        for experiment, path in results.items():
            data = self.load_data(path)
            summarized_data = FactorImplementEval.summarize_res(data)
            processed_data = self.analyze_data(summarized_data)
            final_res[experiment] = processed_data.iloc[-1, :]
        return final_res

    def reformat_succ_rate(self, display_df):
        """Re-index *display_df* (indexed by factor name) by
        ``(Category, Difficulty, Factor)``.

        Factors absent from the benchmark metadata are dropped; difficulty
        levels sort Easy < Medium < Hard < New Discovery.
        """
        display_df = display_df[display_df.index.isin(self.index_map.keys())]
        new_idx = [self.index_map[idx] for idx in display_df.index]

        display_df.index = pd.MultiIndex.from_tuples(
            new_idx,
            names=["Factor", "Category", "Difficulty"],
        )
        # (Factor, Category, Difficulty) -> (Category, Difficulty, Factor)
        display_df = display_df.swaplevel(0, 2).swaplevel(0, 1).sort_index(axis=0)

        return display_df.sort_index(
            key=lambda x: [
                {"Easy": 0, "Medium": 1, "Hard": 2, "New Discovery": 3}.get(i, i)
                for i in x
            ]
        )

    def result_all_key_order(self, x):
        """Sort key mapping the summary column labels to a fixed display order;
        unknown labels sort by their own value."""
        order = {
            "avg. Run successful rate": 0,
            "avg. Format successful rate": 1,
            "avg. Correlation (value only)": 2,
            "max. Correlation": 3,
            "max. accuracy": 4,
            "avg. accuracy": 5,
        }
        return [order.get(i, i) for i in x]

    def analyze_data(self, sum_df):
        """Build the per-factor summary table from a summarized result frame.

        *sum_df* is expected to carry one row per evaluator/metric (see
        ``index`` below) and one column per run. Returns a float DataFrame with
        a trailing ``("-", "-", "Average")`` row of column means.
        """
        index = [
            "FactorSingleColumnEvaluator",
            "FactorOutputFormatEvaluator",
            "FactorRowCountEvaluator",
            "FactorIndexEvaluator",
            "FactorMissingValuesEvaluator",
            "FactorEqualValueCountEvaluator",
            "FactorCorrelationEvaluator",
            "run factor error",
        ]
        sum_df = sum_df.reindex(index, axis=0)
        # Group repeated runs of the same factor under a shared 0..n-1 inner index.
        sum_df_clean = sum_df.T.groupby(level=0).apply(lambda x: x.reset_index(drop=True))

        # A run succeeded iff "run factor error" is absent/falsy.
        run_error = sum_df_clean["run factor error"].unstack().T.fillna(False).astype(bool)
        succ_rate = (~run_error).mean(axis=0).to_frame("success rate")
        succ_rate_f = self.reformat_succ_rate(succ_rate)

        # Format success requires both the row-count and index checks to pass.
        # (The original `format_issue` name was misleading: True means success.)
        format_success = (
            sum_df_clean["FactorRowCountEvaluator"] & sum_df_clean["FactorIndexEvaluator"]
        )
        format_succ_rate = (
            format_success.unstack()
            .T.fillna(False)
            .astype(bool)  # False indicates failure
            .mean(axis=0)
            .to_frame("success rate")
        )
        format_succ_rate_f = self.reformat_succ_rate(format_succ_rate)

        # Correlation only counts runs that produced a format-valid output.
        corr = sum_df_clean["FactorCorrelationEvaluator"] * format_success
        corr_avg = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
        corr_res = self.reformat_succ_rate(corr_avg)

        corr_max = corr.unstack().T.max(axis=0).to_frame("corr(only success)")
        corr_max_res = self.reformat_succ_rate(corr_max)

        # "accuracy" columns are derived from the missing-values evaluator
        # score, again restricted to format-valid runs.
        value_scores = sum_df_clean["FactorMissingValuesEvaluator"] * format_success
        value_max_res = self.reformat_succ_rate(
            value_scores.unstack().T.max(axis=0).to_frame("max_value")
        )
        value_avg_res = self.reformat_succ_rate(
            value_scores.unstack().T.mean(axis=0).to_frame("avg_value")
        )

        result_all = pd.concat(
            {
                "avg. Correlation (value only)": corr_res.iloc[:, 0],
                "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
                "avg. Run successful rate": succ_rate_f.iloc[:, 0],
                "max. Correlation": corr_max_res.iloc[:, 0],
                "max. accuracy": value_max_res.iloc[:, 0],
                "avg. accuracy": value_avg_res.iloc[:, 0],
            },
            axis=1,
        )

        df = result_all.sort_index(axis=1, key=self.result_all_key_order)
        print(df)  # console report of the per-factor table

        # Append a column-mean "Average" row (NaNs counted as 0.0).
        mean_values = df.fillna(0.0).mean()
        mean_df = pd.DataFrame(mean_values).T
        mean_df.index = pd.MultiIndex.from_tuples(
            [("-", "-", "Average")], names=["Factor", "Category", "Difficulty"]
        )
        return pd.concat([df, mean_df]).astype("float")
|
||
|
||
|
||
class Plotter:
    """Static helpers for styling matplotlib and rendering the comparison bar chart."""

    @staticmethod
    def change_fs(font_size):
        """Apply one uniform *font_size* to every matplotlib text element."""
        rc_targets = (
            ("font", "size"),
            ("axes", "titlesize"),
            ("axes", "labelsize"),
            ("xtick", "labelsize"),
            ("ytick", "labelsize"),
            ("legend", "fontsize"),
            ("figure", "titlesize"),
        )
        for group, param in rc_targets:
            plt.rc(group, **{param: font_size})

    @staticmethod
    def plot_data(data, file_name):
        """Draw a grouped bar chart of *data* (long format: index/a/b columns)
        and save the figure to *file_name*."""
        plt.figure(figsize=(10, 6))
        sns.barplot(x="index", y="b", hue="a", data=data)
        plt.title("Comparison of Different Methods")
        plt.xlabel("Method")
        plt.ylabel("Value")
        plt.savefig(file_name)
|
||
if __name__ == "__main__":
    # Summarize the benchmark run(s) listed below and save a comparison plot.
    settings = BenchmarkSettings()
    analyzer = BenchmarkAnalyzer(settings)
    result_paths = {
        "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
    }
    summary_df = pd.DataFrame(analyzer.process_results(result_paths))

    Plotter.change_fs(20)
    # Long-format frame for seaborn: one row per (method, experiment) pair.
    melted = (
        summary_df.drop(["max. accuracy", "avg. accuracy"], axis=0)
        .T.reset_index()
        .melt("index", var_name="a", value_name="b")
    )
    Plotter.plot_data(melted, "rdagent/app/quant_factor_benchmark/comparison_plot.png")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.