Skip to content

Commit

Permalink
cancel some comments and fix some bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
WinstonLiyt committed Aug 2, 2024
1 parent 6fd15a7 commit a883a78
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 72 deletions.
22 changes: 16 additions & 6 deletions rdagent/app/qlib_rd_loop/factor_from_report_sh.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# TODO: we should have more advanced mechanism to handle such requirements for saving sessions.
import json
import csv
import json
import pickle
from pathlib import Path
from typing import Any
Expand Down Expand Up @@ -42,13 +42,14 @@
)
from rdagent.utils.workflow import LoopBase, LoopMeta

with open(FACTOR_PROP_SETTING.report_result_json_file_path, 'r') as input_file:
with open(FACTOR_PROP_SETTING.report_result_json_file_path, "r") as input_file:
csv_reader = csv.reader(input_file)
judge_pdf_data = [row[0] for row in csv_reader]

prompts_path = Path(__file__).parent / "prompts.yaml"
prompts = Prompts(file_path=prompts_path)


def generate_hypothesis(factor_result: dict, report_content: str) -> str:
system_prompt = (
Environment(undefined=StrictUndefined).from_string(prompts["hypothesis_generation"]["system"]).render()
Expand All @@ -72,6 +73,7 @@ def generate_hypothesis(factor_result: dict, report_content: str) -> str:

return Hypothesis(hypothesis=hypothesis_text, reason=reason_text, concise_reason=concise_reason_text)


def extract_factors_and_implement(report_file_path: str) -> tuple:
scenario = QlibFactorScenario()

Expand Down Expand Up @@ -105,6 +107,7 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:

class FactorReportLoop(LoopBase, metaclass=LoopMeta):
skip_loop_error = (FactorEmptyError,)

def __init__(self, PROP_SETTING: BasePropSetting):
scen: Scenario = import_class(PROP_SETTING.scen)()

Expand All @@ -116,7 +119,11 @@ def __init__(self, PROP_SETTING: BasePropSetting):

self.judge_pdf_data_items = judge_pdf_data
self.index = 0
self.hypo_exp_cache = pickle.load(open(FACTOR_PROP_SETTING.report_extract_result, "rb")) if Path(FACTOR_PROP_SETTING.report_extract_result).exists() else {}
self.hypo_exp_cache = (
pickle.load(open(FACTOR_PROP_SETTING.report_extract_result, "rb"))
if Path(FACTOR_PROP_SETTING.report_extract_result).exists()
else {}
)
super().__init__()

def propose_hypo_exp(self, prev_out: dict[str, Any]):
Expand All @@ -128,7 +135,9 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]):
self.index += 1
if report_file_path in self.hypo_exp_cache:
hypothesis, exp = self.hypo_exp_cache[report_file_path]
exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in self.trace.hist if t[2]]
exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [
t[1] for t in self.trace.hist if t[2]
]
else:
continue
# else:
Expand All @@ -142,8 +151,8 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]):
with logger.tag("load_pdf_screenshot"):
pdf_screenshot = extract_first_page_screenshot_from_pdf(report_file_path)
logger.log_object(pdf_screenshot)
exp.sub_workspace_list = exp.sub_workspace_list[:FACTOR_PROP_SETTING.max_factor_per_report]
exp.sub_tasks = exp.sub_tasks[:FACTOR_PROP_SETTING.max_factor_per_report]
exp.sub_workspace_list = exp.sub_workspace_list[: FACTOR_PROP_SETTING.max_factor_per_report]
exp.sub_tasks = exp.sub_tasks[: FACTOR_PROP_SETTING.max_factor_per_report]
logger.log_object(hypothesis, tag="hypothesis generation")
logger.log_object(exp.sub_tasks, tag="experiment generation")
return hypothesis, exp
Expand All @@ -169,6 +178,7 @@ def feedback(self, prev_out: dict[str, Any]):
logger.log_object(feedback, tag="feedback")
self.trace.hist.append((prev_out["propose_hypo_exp"][0], prev_out["running"], feedback))


def main(path=None, step_n=None):
"""
You can continue running session by
Expand Down
20 changes: 12 additions & 8 deletions rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,33 +170,37 @@ def evaluate(
max_attempts = 3
attempts = 0
final_evaluation_dict = None

while attempts < max_attempts:
try:
resp = APIBackend().build_messages_and_create_chat_completion(
user_prompt=gen_df_info_str, system_prompt=system_prompt, json_mode=True
)
resp_dict = json.loads(resp)

if isinstance(resp_dict["output_format_decision"], str) and resp_dict["output_format_decision"].lower() in (

if isinstance(resp_dict["output_format_decision"], str) and resp_dict[
"output_format_decision"
].lower() in (
"true",
"false",
):
resp_dict["output_format_decision"] = bool(resp_dict["output_format_decision"])

return (
resp_dict["output_format_feedback"],
resp_dict["output_format_decision"],
)

except json.JSONDecodeError as e:
raise ValueError("Failed to decode JSON response from API.") from e

except KeyError as e:
attempts += 1
if attempts >= max_attempts:
raise KeyError("Response from API is missing 'output_format_decision' or 'output_format_feedback' key after multiple attempts.") from e

raise KeyError(
"Response from API is missing 'output_format_decision' or 'output_format_feedback' key after multiple attempts."
) from e

return "Failed to evaluate output format after multiple attempts.", False


Expand Down
24 changes: 0 additions & 24 deletions rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,30 +88,6 @@ def evolve(
self.scen,
)

# if FACTOR_IMPLEMENT_SETTINGS.select_ratio < 1:
# # if the number of loops is equal to the select_loop, we need to select some of them
# implementation_factors_per_round = round(
# FACTOR_IMPLEMENT_SETTINGS.select_ratio * len(to_be_finished_task_index) + 0.5
# ) # ceilling
# implementation_factors_per_round = min(
# implementation_factors_per_round, len(to_be_finished_task_index)
# ) # but not exceed the total number of tasks

# if FACTOR_IMPLEMENT_SETTINGS.select_method == "random":
# to_be_finished_task_index = RandomSelect(
# to_be_finished_task_index,
# implementation_factors_per_round,
# )

# if FACTOR_IMPLEMENT_SETTINGS.select_method == "scheduler":
# to_be_finished_task_index = LLMSelect(
# to_be_finished_task_index,
# implementation_factors_per_round,
# evo,
# queried_knowledge.former_traces,
# self.scen,
# )

result = multiprocessing_wrapper(
[
(self.implement_one_factor, (evo.sub_tasks[target_index], queried_knowledge))
Expand Down
1 change: 0 additions & 1 deletion rdagent/components/coder/factor_coder/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ def __init__(
) -> None:
super().__init__(*args, **kwargs)
self.executed_factor_value_dataframe = executed_factor_value_dataframe
# self.logger = logger
self.raise_exception = raise_exception

@staticmethod
Expand Down
3 changes: 1 addition & 2 deletions rdagent/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import pickle
import importlib
import json
import multiprocessing as mp
Expand Down Expand Up @@ -103,4 +102,4 @@ def multiprocessing_wrapper(func_calls: list[tuple[Callable, tuple]], n: int) ->
return [f(*args) for f, args in func_calls]
with mp.Pool(processes=n) as pool:
results = [pool.apply_async(f, args) for f, args in func_calls]
return [result.get() for result in results]
return [result.get() for result in results]
32 changes: 1 addition & 31 deletions rdagent/scenarios/qlib/developer/factor_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,32 +37,6 @@ class QlibFactorRunner(CachedRunner[QlibFactorExperiment]):
- results in `mlflow`
"""

def calculate_information_coefficient(
self, concat_feature: pd.DataFrame, SOTA_feature_column_size: int, new_feature_columns_size: int
) -> pd.DataFrame:
res = pd.Series(index=range(SOTA_feature_column_size * new_feature_columns_size))
for col1 in range(SOTA_feature_column_size):
for col2 in range(SOTA_feature_column_size, SOTA_feature_column_size + new_feature_columns_size):
res.loc[col1 * new_feature_columns_size + col2 - SOTA_feature_column_size] = concat_feature.iloc[
:, col1
].corr(concat_feature.iloc[:, col2])
return res

def deduplicate_new_factors(self, SOTA_feature: pd.DataFrame, new_feature: pd.DataFrame) -> pd.DataFrame:
# calculate the IC between each column of SOTA_feature and new_feature
# if the IC is larger than a threshold, remove the new_feature column
# return the new_feature

concat_feature = pd.concat([SOTA_feature, new_feature], axis=1)
IC_max = (
concat_feature.groupby("datetime")
.apply(lambda x: self.calculate_information_coefficient(x, SOTA_feature.shape[1], new_feature.shape[1]))
.mean()
)
IC_max.index = pd.MultiIndex.from_product([range(SOTA_feature.shape[1]), range(new_feature.shape[1])])
IC_max = IC_max.unstack().max(axis=0)
return new_feature.iloc[:, IC_max[IC_max < 0.99].index]

def develop(self, exp: QlibFactorExperiment) -> QlibFactorExperiment:
"""
Generate the experiment by processing and combining factor data,
Expand Down Expand Up @@ -92,17 +66,13 @@ def develop(self, exp: QlibFactorExperiment) -> QlibFactorExperiment:

# Combine the SOTA factor and new factors if SOTA factor exists
if SOTA_factor is not None and not SOTA_factor.empty:
new_factors = self.deduplicate_new_factors(SOTA_factor, new_factors)
if new_factors.empty:
raise FactorEmptyError("No valid factor data found to merge.")
combined_factors = pd.concat([SOTA_factor, new_factors], axis=1).dropna()
else:
combined_factors = new_factors

# Sort and nest the combined factors under 'feature'
# print(combined_factors)
combined_factors = combined_factors.sort_index()
combined_factors = combined_factors.loc[:, ~combined_factors.columns.duplicated(keep='last')]
combined_factors = combined_factors.loc[:, ~combined_factors.columns.duplicated(keep="last")]
new_columns = pd.MultiIndex.from_product([["feature"], combined_factors.columns])
combined_factors.columns = new_columns

Expand Down

0 comments on commit a883a78

Please sign in to comment.