From e6265702f981a232d130ece5502028eaa412bdce Mon Sep 17 00:00:00 2001 From: xuyang1 Date: Thu, 4 Jul 2024 12:40:54 +0000 Subject: [PATCH] align benchmark and evolving evaluators --- rdagent/app/qlib_rd_loop/conf.py | 2 +- rdagent/components/benchmark/eval_method.py | 6 +- .../coder/factor_coder/CoSTEER/__init__.py | 4 +- .../coder/factor_coder/CoSTEER/evaluators.py | 446 ++++++------------ .../components/coder/factor_coder/factor.py | 3 +- .../factor_task_implementation/__init__.py | 2 +- .../model_task_implementation/__init__.py | 3 +- 7 files changed, 149 insertions(+), 317 deletions(-) diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py index 3b8a04fd3..6db7c1c32 100644 --- a/rdagent/app/qlib_rd_loop/conf.py +++ b/rdagent/app/qlib_rd_loop/conf.py @@ -7,7 +7,7 @@ class PropSetting(BaseSettings): scen: str = "rdagent.scenarios.qlib.experiment.factor_experiment.QlibFactorScenario" hypothesis_gen: str = "rdagent.scenarios.qlib.factor_proposal.QlibFactorHypothesisGen" hypothesis2experiment: str = "rdagent.scenarios.qlib.factor_proposal.QlibFactorHypothesis2Experiment" - qlib_factor_coder: str = "rdagent.components.coder.factor_coder.CoSTEER.CoSTEERFG" + qlib_factor_coder: str = "rdagent.scenarios.qlib.factor_task_implementation.QlibFactorCoSTEER" qlib_factor_runner: str = "rdagent.scenarios.qlib.task_generator.data.QlibFactorRunner" qlib_factor_summarizer: str = "rdagent.scenarios.qlib.task_generator.feedback.QlibFactorExperiment2Feedback" diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py index e0ffa6de2..68c2861f4 100644 --- a/rdagent/components/benchmark/eval_method.py +++ b/rdagent/components/benchmark/eval_method.py @@ -7,13 +7,13 @@ from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS from rdagent.components.coder.factor_coder.CoSTEER.evaluators import ( FactorCorrelationEvaluator, + FactorEqualValueCountEvaluator, FactorEvaluator, FactorIndexEvaluator, FactorIndexFormatEvaluator, FactorMissingValuesEvaluator, FactorRowCountEvaluator, FactorSingleColumnEvaluator, - FactorValuesEvaluator, ) from rdagent.components.coder.factor_coder.factor import FileBasedFactorImplementation from rdagent.core.exception import ImplementRunException @@ -94,7 +94,7 @@ def eval_case( eval_res = [] for ev in self.evaluator_l: try: - eval_res.append((ev, ev.evaluate(case_gt, case_gen))) + eval_res.append((ev, ev.evaluate(implementation=case_gen, gt_implementation=case_gt))) # if the corr ev is successfully evaluated and achieve the best performance, then break except ImplementRunException as e: return e @@ -122,7 +122,7 @@ def __init__( FactorRowCountEvaluator(), FactorIndexEvaluator(), FactorMissingValuesEvaluator(), - FactorValuesEvaluator(), + FactorEqualValueCountEvaluator(), FactorCorrelationEvaluator(hard_check=False), ] super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs) diff --git a/rdagent/components/coder/factor_coder/CoSTEER/__init__.py b/rdagent/components/coder/factor_coder/CoSTEER/__init__.py index 63c749e5e..cfcb519fe 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/__init__.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/__init__.py @@ -3,7 +3,7 @@ from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS from rdagent.components.coder.factor_coder.CoSTEER.evaluators import ( - FactorEvaluatorV1, + FactorEvaluatorForCoder, FactorMultiEvaluator, ) from rdagent.components.coder.factor_coder.CoSTEER.evolvable_subjects 
import ( @@ -45,7 +45,7 @@ def __init__( self.knowledge_self_gen = knowledge_self_gen self.evolving_strategy = FactorEvolvingStrategyWithGraph() # declare the factor evaluator - self.factor_evaluator = FactorMultiEvaluator(FactorEvaluatorV1()) + self.factor_evaluator = FactorMultiEvaluator(FactorEvaluatorForCoder()) self.evolving_version = 2 def load_or_init_knowledge_base(self, former_knowledge_base_path: Path = None, component_init_list: list = []): diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py index 43699f167..7eb26705a 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py @@ -15,7 +15,7 @@ from rdagent.core.conf import RD_AGENT_SETTINGS from rdagent.core.evaluation import Evaluator from rdagent.core.evolving_framework import Feedback, QueriedKnowledge -from rdagent.core.experiment import Implementation +from rdagent.core.experiment import Implementation, Task from rdagent.core.log import RDAgentLog from rdagent.core.prompts import Prompts from rdagent.core.utils import multiprocessing_wrapper @@ -31,36 +31,49 @@ class FactorEvaluator(Evaluator): @abstractmethod def evaluate( self, - gt: Implementation, - gen: Implementation, + target_task: Task, + implementation: Implementation, + gt_implementation: Implementation, + **kwargs, ) -> Tuple[str, object]: """You can get the dataframe by .. code-block:: python - _, gt_df = gt.execute() - _, gen_df = gen.execute() + _, gen_df = implementation.execute() + _, gt_df = gt_implementation.execute() Returns ------- Tuple[str, object] - str: the text-based description of the evaluation result - - object: a comparable metric (bool, integer, float ...) + - object: a comparable metric (bool, integer, float ...) 
None for evaluator with only text-based result """ raise NotImplementedError("Please implement the `evaluator` method") - def _get_df(self, gt: Implementation, gen: Implementation): - _, gt_df = gt.execute() - _, gen_df = gen.execute() + def _get_df(self, gt_implementation: Implementation, implementation: Implementation): + if gt_implementation is not None: + _, gt_df = gt_implementation.execute() + if isinstance(gt_df, pd.Series): + gt_df = gt_df.to_frame("gt_factor") + if isinstance(gt_df, pd.DataFrame): + gt_df = gt_df.sort_index() + else: + gt_df = None + + _, gen_df = implementation.execute() if isinstance(gen_df, pd.Series): gen_df = gen_df.to_frame("source_factor") - if isinstance(gt_df, pd.Series): - gt_df = gt_df.to_frame("gt_factor") + if isinstance(gen_df, pd.DataFrame): + gen_df = gen_df.sort_index() return gt_df, gen_df + def __str__(self) -> str: + return self.__class__.__name__ + -class FactorCodeEvaluator(Evaluator): +class FactorCodeEvaluator(FactorEvaluator): def evaluate( self, target_task: FactorTask, @@ -76,28 +89,7 @@ def evaluate( system_prompt = evaluate_prompts["evaluator_code_feedback_v1_system"] execution_feedback_to_render = execution_feedback - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - evaluate_prompts["evaluator_code_feedback_v1_user"], - ) - .render( - factor_information=factor_information, - code=code, - execution_feedback=execution_feedback_to_render, - factor_value_feedback=factor_value_feedback, - gt_code=gt_implementation.code if gt_implementation else None, - ) - ) - while ( - APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - former_messages=[], - ) - > RD_AGENT_SETTINGS.chat_token_limit - ): - execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + for _ in range(10): # 10 times to split the content is enough user_prompt = ( Environment(undefined=StrictUndefined) .from_string( @@ -111,48 +103,49 @@ def evaluate( gt_code=gt_implementation.code if gt_implementation else None, ) ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > RD_AGENT_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break critic_response = APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, system_prompt=system_prompt, json_mode=False, ) - return critic_response + return critic_response, None class FactorSingleColumnEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) - - if len(gen_df.columns) == 1 and len(gt_df.columns) == 1: - return "Both dataframes have only one column.", True - elif len(gen_df.columns) != 1: - gen_df = gen_df.iloc(axis=1)[ - [ - 0, - ] - ] + _, gen_df = self._get_df(gt_implementation, implementation) + + if len(gen_df.columns) == 1: + return "The source dataframe has only one column which is correct.", True + else: return ( "The source dataframe has more than one column. Please check the implementation. 
We only evaluate the first column.", False, ) - return "", False - - def __str__(self) -> str: - return self.__class__.__name__ class FactorIndexFormatEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) idx_name_right = gen_df.index.names == ("datetime", "instrument") if idx_name_right: return ( @@ -165,17 +158,14 @@ def evaluate( False, ) - def __str__(self) -> str: - return self.__class__.__name__ - class FactorRowCountEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) if gen_df.shape[0] == gt_df.shape[0]: return "Both dataframes have the same rows count.", True @@ -185,17 +175,14 @@ def evaluate( False, ) - def __str__(self) -> str: - return self.__class__.__name__ - class FactorIndexEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) if gen_df.index.equals(gt_df.index): return "Both dataframes have the same index.", True @@ -205,17 +192,14 @@ def evaluate( False, ) - def __str__(self) -> str: - return self.__class__.__name__ - class FactorMissingValuesEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) if gen_df.isna().sum().sum() == gt_df.isna().sum().sum(): return "Both dataframes have the same missing values.", True @@ -225,17 +209,14 @@ def evaluate( False, ) - def __str__(self) -> str: - return self.__class__.__name__ - -class FactorValuesEvaluator(FactorEvaluator): +class FactorEqualValueCountEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) try: close_values = gen_df.sub(gt_df).abs().lt(1e-6) @@ -255,9 +236,6 @@ def evaluate( acc_rate, ) - def __str__(self) -> str: - return self.__class__.__name__ - class FactorCorrelationEvaluator(FactorEvaluator): def __init__(self, hard_check: bool) -> None: @@ -265,10 +243,10 @@ def __init__(self, hard_check: bool) -> None: def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) concat_df = pd.concat([gen_df, gt_df], axis=1) concat_df.columns = ["source", "gt"] @@ -294,186 +272,57 @@ def evaluate( else: return f"The ic is ({ic:.6f}) and the rankic is ({ric:.6f}).", ic - def __str__(self) -> str: - return self.__class__.__name__ - -class FactorValEvaluator(FactorEvaluator): - def evaluate(self, gt: Implementation, gen: Implementation): - _, 
gt_df = gt.execute() - _, gen_df = gen.execute() - # FIXME: refactor the two classes - fiv = FactorValueEvaluator() - return fiv.evaluate(source_df=gen_df, gt_df=gt_df) - - def __str__(self) -> str: - return self.__class__.__name__ +class FactorValueEvaluator(FactorEvaluator): - -class FactorValueEvaluator(Evaluator): - # TODO: let's discuss the about the interface of the evaluator def evaluate( self, - source_df: pd.DataFrame, - gt_df: pd.DataFrame, + implementation: Implementation, + gt_implementation: Implementation, **kwargs, ) -> Tuple: conclusions = [] - if isinstance(source_df, pd.Series): - source_df = source_df.to_frame("source_factor") - conclusions.append( - "The source dataframe is a series, better convert it to a dataframe.", - ) - if gt_df is not None and isinstance(gt_df, pd.Series): - gt_df = gt_df.to_frame("gt_factor") - conclusions.append( - "The ground truth dataframe is a series, convert it to a dataframe.", - ) - # Check if both dataframe has only one columns - if len(source_df.columns) == 1: - conclusions.append("The source dataframe has only one column which is correct.") - else: - conclusions.append( - "The source dataframe has more than one column. Please check the implementation. We only evaluate the first column.", - ) - source_df = source_df.iloc(axis=1)[ - [ - 0, - ] - ] - - if list(source_df.index.names) != ["datetime", "instrument"]: - conclusions.append( - rf"The index of the dataframe is not (\"datetime\", \"instrument\"), instead is {source_df.index.names}. Please check the implementation.", - ) - else: - conclusions.append( - 'The index of the dataframe is ("datetime", "instrument") and align with the predefined format.', - ) + feedback_str, _ = FactorSingleColumnEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) + + # Check if the index of the dataframe is ("datetime", "instrument") + feedback_str, _ = FactorIndexFormatEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) # Check if both dataframe have the same rows count - if gt_df is not None: - if source_df.shape[0] == gt_df.shape[0]: - conclusions.append("Both dataframes have the same rows count.") - same_row_count_result = True - else: - conclusions.append( - f"The source dataframe and the ground truth dataframe have different rows count. The source dataframe has {source_df.shape[0]} rows, while the ground truth dataframe has {gt_df.shape[0]} rows. Please check the implementation.", - ) - same_row_count_result = False + if gt_implementation is not None: + feedback_str, _ = FactorRowCountEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) - # Check whether both dataframe has the same index - if source_df.index.equals(gt_df.index): - conclusions.append("Both dataframes have the same index.") - same_index_result = True - else: - conclusions.append( - "The source dataframe and the ground truth dataframe have different index. Please check the implementation.", - ) - same_index_result = False + feedback_str, same_index_result = FactorIndexEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) - # Check for the same missing values (NaN) - if source_df.isna().sum().sum() == gt_df.isna().sum().sum(): - conclusions.append("Both dataframes have the same missing values.") - same_missing_values_result = True - else: - conclusions.append( - f"The dataframes do not have the same missing values. 
The source dataframe has {source_df.isna().sum().sum()} missing values, while the ground truth dataframe has {gt_df.isna().sum().sum()} missing values. Please check the implementation.", - ) - same_missing_values_result = False + feedback_str, _ = FactorMissingValuesEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) + + feedback_str, equal_value_ratio_result = FactorEqualValueCountEvaluator().evaluate( + implementation, gt_implementation + ) + conclusions.append(feedback_str) - # Check if the values are the same within a small tolerance - if not same_index_result: - conclusions.append( - "The source dataframe and the ground truth dataframe have different index. Give up comparing the values and correlation because it's useless", + if same_index_result: + feedback_str, high_correlation_result = FactorCorrelationEvaluator(hard_check=True).evaluate( + implementation, gt_implementation ) - same_values_result = False - high_correlation_result = False else: - close_values = source_df.sub(gt_df).abs().lt(1e-6) - if close_values.all().iloc[0]: - conclusions.append( - "All values in the dataframes are equal within the tolerance of 1e-6.", - ) - same_values_result = True - else: - conclusions.append( - "Some values differ by more than the tolerance of 1e-6. Check for rounding errors or differences in the calculation methods.", - ) - same_values_result = False - - # Check the ic and rankic between the two dataframes - concat_df = pd.concat([source_df, gt_df], axis=1) - concat_df.columns = ["source", "gt"] - try: - ic = concat_df.groupby("datetime").apply(lambda df: df["source"].corr(df["gt"])).dropna().mean() - ric = ( - concat_df.groupby("datetime") - .apply(lambda df: df["source"].corr(df["gt"], method="spearman")) - .dropna() - .mean() - ) - - if ic > 0.99 and ric > 0.99: - conclusions.append( - f"The dataframes are highly correlated. The ic is {ic:.6f} and the rankic is {ric:.6f}.", - ) - high_correlation_result = True - else: - conclusions.append( - f"The dataframes are not sufficiently high correlated. The ic is {ic:.6f} and the rankic is {ric:.6f}. Investigate the factors that might be causing the discrepancies and ensure that the logic of the factor calculation is consistent.", - ) - high_correlation_result = False - - # Check for shifted alignments only in the "datetime" index - max_shift_days = 2 - for shift in range(-max_shift_days, max_shift_days + 1): - if shift == 0: - continue # Skip the case where there is no shift - - shifted_source_df = source_df.groupby(level="instrument").shift(shift) - concat_df = pd.concat([shifted_source_df, gt_df], axis=1) - concat_df.columns = ["source", "gt"] - shifted_ric = ( - concat_df.groupby("datetime") - .apply(lambda df: df["source"].corr(df["gt"], method="spearman")) - .dropna() - .mean() - ) - if shifted_ric > 0.99: - conclusions.append( - f"The dataframes are highly correlated with a shift of {max_shift_days} days in the 'date' index. Shifted rankic: {shifted_ric:.6f}.", - ) - break - else: - conclusions.append( - f"No sufficient correlation found when shifting up to {max_shift_days} days in the 'date' index. Investigate the factors that might be causing discrepancies.", - ) - - except Exception as e: - RDAgentLog().warning(f"Error occurred when calculating the correlation: {str(e)}") - conclusions.append( - f"Some error occurred when calculating the correlation. Investigate the factors that might be causing the discrepancies and ensure that the logic of the factor calculation is consistent. 
Error: {e}", - ) - high_correlation_result = False + high_correlation_result = False + feedback_str = "The source dataframe and the ground truth dataframe have different index. Give up comparing the values and correlation because it's useless" + conclusions.append(feedback_str) # Combine all conclusions into a single string conclusion_str = "\n".join(conclusions) - final_result = (same_values_result or high_correlation_result) if gt_df is not None else False - return conclusion_str, final_result - - -# TODO: -def shorten_prompt(tpl: str, render_kwargs: dict, shorten_key: str, max_trail: int = 10) -> str: - """When the prompt is too long. We have to shorten it. - But we should not truncate the prompt directly, so we should find the key we want to shorten and then shorten it. - """ - # TODO: this should replace most of code in - # - FactorFinalDecisionEvaluator.evaluate - # - FactorCodeEvaluator.evaluate + same_value_or_high_correlation = ( + ((equal_value_ratio_result > 0.99) or high_correlation_result) if gt_implementation is not None else False + ) + return conclusion_str, same_value_or_high_correlation class FactorFinalDecisionEvaluator(Evaluator): @@ -489,31 +338,8 @@ def evaluate( "evaluator_final_decision_v1_system" ] execution_feedback_to_render = execution_feedback - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - evaluate_prompts["evaluator_final_decision_v1_user"], - ) - .render( - factor_information=target_task.get_factor_information(), - execution_feedback=execution_feedback_to_render, - code_feedback=code_feedback, - factor_value_feedback=( - value_feedback - if value_feedback is not None - else "No Ground Truth Value provided, so no evaluation on value is performed." - ), - ) - ) - while ( - APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - former_messages=[], - ) - > RD_AGENT_SETTINGS.chat_token_limit - ): - execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + + for _ in range(10): # 10 times to split the content is enough user_prompt = ( Environment(undefined=StrictUndefined) .from_string( @@ -530,6 +356,16 @@ def evaluate( ), ) ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > RD_AGENT_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break final_evaluation_dict = json.loads( APIBackend().build_messages_and_create_chat_completion( @@ -586,14 +422,14 @@ class FactorMultiFeedback( """Feedback contains a list, each element is the corresponding feedback for each factor implementation.""" -class FactorEvaluatorV1(FactorEvaluator): +class FactorEvaluatorForCoder(FactorEvaluator): """This class is the v1 version of evaluator for a single factor implementation. It calls several evaluators in share modules to evaluate the factor implementation. """ def __init__(self) -> None: - self.code_evaluator = FactorCodeEvaluator() self.value_evaluator = FactorValueEvaluator() + self.code_evaluator = FactorCodeEvaluator() self.final_decision_evaluator = FactorFinalDecisionEvaluator() def evaluate( @@ -625,51 +461,39 @@ def evaluate( ) else: factor_feedback = FactorSingleFeedback() + + # 1. 
Get factor execution feedback to generated implementation and remove the long list of numbers in execution feedback ( - factor_feedback.execution_feedback, - source_df, + execution_feedback, + gen_df, ) = implementation.execute() - # Remove the long list of numbers in the feedback - pattern = r"(?<=\D)(,\s+-?\d+\.\d+){50,}(?=\D)" - factor_feedback.execution_feedback = re.sub(pattern, ", ", factor_feedback.execution_feedback) - execution_feedback_lines = [ - line for line in factor_feedback.execution_feedback.split("\n") if "warning" not in line.lower() - ] - factor_feedback.execution_feedback = "\n".join(execution_feedback_lines) + execution_feedback = re.sub(r"(?<=\D)(,\s+-?\d+\.\d+){50,}(?=\D)", ", ", execution_feedback) + factor_feedback.execution_feedback = "\n".join( + [line for line in execution_feedback.split("\n") if "warning" not in line.lower()] + ) - if source_df is None: + # 2. Get factor value feedback + if gen_df is None: factor_feedback.factor_value_feedback = "No factor value generated, skip value evaluation." factor_feedback.value_generated_flag = False - value_decision = None + same_value_or_high_correlation = None else: factor_feedback.value_generated_flag = True - if gt_implementation is not None: - _, gt_df = gt_implementation.execute(store_result=True) - else: - gt_df = None - try: - source_df = source_df.sort_index() - if gt_df is not None: - gt_df = gt_df.sort_index() - ( - factor_feedback.factor_value_feedback, - value_decision, - ) = self.value_evaluator.evaluate(source_df=source_df, gt_df=gt_df) - except Exception as e: - RDAgentLog().warning("Value evaluation failed with exception: %s", e) - factor_feedback.factor_value_feedback = "Value evaluation failed." - value_decision = False + ( + factor_feedback.factor_value_feedback, + same_value_or_high_correlation, + ) = self.value_evaluator.evaluate(implementation=implementation, gt_implementation=gt_implementation) factor_feedback.final_decision_based_on_gt = gt_implementation is not None - if value_decision is not None and value_decision is True: - # To avoid confusion, when value_decision is True, we do not need code feedback + if same_value_or_high_correlation is not None and same_value_or_high_correlation is True: + # To avoid confusion, when same_value_or_high_correlation is True, we do not need code feedback factor_feedback.code_feedback = "Final decision is True and there are no code critics." - factor_feedback.final_decision = value_decision + factor_feedback.final_decision = same_value_or_high_correlation factor_feedback.final_feedback = "Value evaluation passed, skip final decision evaluation." 
else: - factor_feedback.code_feedback = self.code_evaluator.evaluate( + factor_feedback.code_feedback, _ = self.code_evaluator.evaluate( target_task=target_task, implementation=implementation, execution_feedback=factor_feedback.execution_feedback, @@ -689,7 +513,7 @@ def evaluate( class FactorMultiEvaluator(Evaluator): - def __init__(self, single_evaluator=FactorEvaluatorV1()) -> None: + def __init__(self, single_evaluator=FactorEvaluatorForCoder()) -> None: super().__init__() self.single_factor_implementation_evaluator = single_evaluator @@ -704,9 +528,7 @@ def evaluate( # for index in range(len(evo.sub_tasks)): # corresponding_implementation = evo.sub_implementations[index] # corresponding_gt_implementation = ( - # evo.sub_gt_implementations[index] - # if evo.sub_gt_implementations is not None - # else None + # evo.sub_gt_implementations[index] if evo.sub_gt_implementations is not None else None # ) # multi_implementation_feedback.append( @@ -744,3 +566,13 @@ def evaluate( RDAgentLog().info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}") return multi_implementation_feedback + + +# TODO: +def shorten_prompt(tpl: str, render_kwargs: dict, shorten_key: str, max_trail: int = 10) -> str: + """When the prompt is too long. We have to shorten it. + But we should not truncate the prompt directly, so we should find the key we want to shorten and then shorten it. + """ + # TODO: this should replace most of code in + # - FactorFinalDecisionEvaluator.evaluate + # - FactorCodeEvaluator.evaluate diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index bdaaf11a4..72d019951 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -218,5 +218,4 @@ def from_folder(task: FactorTask, path: Union[str, Path], **kwargs): return FileBasedFactorImplementation(task, code=code, **kwargs) -class FactorExperiment(Experiment[FactorTask, FileBasedFactorImplementation]): - ... +class FactorExperiment(Experiment[FactorTask, FileBasedFactorImplementation]): ... diff --git a/rdagent/scenarios/qlib/factor_task_implementation/__init__.py b/rdagent/scenarios/qlib/factor_task_implementation/__init__.py index 48ff2234a..0facb840d 100644 --- a/rdagent/scenarios/qlib/factor_task_implementation/__init__.py +++ b/rdagent/scenarios/qlib/factor_task_implementation/__init__.py @@ -1,4 +1,4 @@ from rdagent.components.coder.factor_coder.CoSTEER import FactorCoSTEER -COSTEERFG_QUANT_FACTOR_IMPLEMENTATION = FactorCoSTEER +QlibFactorCoSTEER = FactorCoSTEER # TODO: This is a placeholder. We need to split the scenario part of the task implementation into this folder diff --git a/rdagent/scenarios/qlib/model_task_implementation/__init__.py b/rdagent/scenarios/qlib/model_task_implementation/__init__.py index ee2e24dd7..fad4c2842 100644 --- a/rdagent/scenarios/qlib/model_task_implementation/__init__.py +++ b/rdagent/scenarios/qlib/model_task_implementation/__init__.py @@ -1,4 +1,5 @@ from rdagent.components.coder.model_coder.one_shot import ModelCodeWriter -class QlibModelCodeWriter(ModelCodeWriter): ... +class QlibModelCodeWriter(ModelCodeWriter): + ...
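
Note (illustrative, not part of the patch): after this change, the benchmark loop in eval_method.py and the coder-side FactorEvaluatorForCoder call every factor value evaluator through the same keyword interface, and each evaluator returns a (feedback_text, metric) tuple. The sketch below shows that aligned call pattern; compare_against_ground_truth is a hypothetical helper, and the Implementation objects are assumed to come from your scenario.

from rdagent.components.coder.factor_coder.CoSTEER.evaluators import (
    FactorCorrelationEvaluator,
    FactorEqualValueCountEvaluator,
    FactorIndexEvaluator,
)


def compare_against_ground_truth(implementation, gt_implementation):
    """Run a few of the shared value evaluators the way the benchmark loop does."""
    results = {}
    for ev in [
        FactorIndexEvaluator(),
        FactorEqualValueCountEvaluator(),
        FactorCorrelationEvaluator(hard_check=False),
    ]:
        # Same keyword order as eval_case: the generated implementation first,
        # the ground-truth implementation second.
        feedback, metric = ev.evaluate(
            implementation=implementation,
            gt_implementation=gt_implementation,
        )
        results[str(ev)] = (feedback, metric)  # str(ev) resolves to the evaluator class name
    return results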
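
The refactored FactorValueEvaluator composes the single-purpose evaluators and reduces them to one decision. A minimal restatement of that rule, assuming the equal-value ratio and correlation flags have already been computed by the evaluators above (this is a simplified sketch, not the actual implementation):

def final_value_decision(equal_value_ratio, high_correlation, has_ground_truth):
    # Pass when more than 99% of values agree with the ground truth within 1e-6,
    # or when the hard correlation check (run only if the indices match) succeeds;
    # without a ground-truth implementation the value check never passes.
    if not has_ground_truth:
        return False
    return equal_value_ratio > 0.99 or high_correlation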
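
Both LLM-facing evaluators (FactorCodeEvaluator and FactorFinalDecisionEvaluator) now shrink an over-long prompt inside a bounded loop instead of duplicating the render before and after a token check. A sketch of that pattern under stated assumptions: render_user_prompt and count_tokens are hypothetical stand-ins for the Jinja2 render and APIBackend().build_messages_and_calculate_token.

def fit_prompt_to_token_limit(execution_feedback, render_user_prompt, count_tokens, token_limit):
    # Re-render the prompt each round, halving the execution feedback that is
    # embedded in it, until the message fits under the token limit
    # (bounded at 10 rounds, as in the patch).
    feedback = execution_feedback
    user_prompt = ""
    for _ in range(10):
        user_prompt = render_user_prompt(feedback)
        if count_tokens(user_prompt) <= token_limit:
            break
        feedback = feedback[len(feedback) // 2:]
    return user_prompt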