From e6265702f981a232d130ece5502028eaa412bdce Mon Sep 17 00:00:00 2001 From: xuyang1 Date: Thu, 4 Jul 2024 12:40:54 +0000 Subject: [PATCH] align benchmark and evolving evaluators --- rdagent/app/qlib_rd_loop/conf.py | 2 +- rdagent/components/benchmark/eval_method.py | 6 +- .../coder/factor_coder/CoSTEER/__init__.py | 4 +- .../coder/factor_coder/CoSTEER/evaluators.py | 446 ++++++------------ .../components/coder/factor_coder/factor.py | 3 +- .../factor_task_implementation/__init__.py | 2 +- .../model_task_implementation/__init__.py | 3 +- 7 files changed, 149 insertions(+), 317 deletions(-) diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py index 3b8a04fd3..6db7c1c32 100644 --- a/rdagent/app/qlib_rd_loop/conf.py +++ b/rdagent/app/qlib_rd_loop/conf.py @@ -7,7 +7,7 @@ class PropSetting(BaseSettings): scen: str = "rdagent.scenarios.qlib.experiment.factor_experiment.QlibFactorScenario" hypothesis_gen: str = "rdagent.scenarios.qlib.factor_proposal.QlibFactorHypothesisGen" hypothesis2experiment: str = "rdagent.scenarios.qlib.factor_proposal.QlibFactorHypothesis2Experiment" - qlib_factor_coder: str = "rdagent.components.coder.factor_coder.CoSTEER.CoSTEERFG" + qlib_factor_coder: str = "rdagent.scenarios.qlib.factor_task_implementation.QlibFactorCoSTEER" qlib_factor_runner: str = "rdagent.scenarios.qlib.task_generator.data.QlibFactorRunner" qlib_factor_summarizer: str = "rdagent.scenarios.qlib.task_generator.feedback.QlibFactorExperiment2Feedback" diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py index e0ffa6de2..68c2861f4 100644 --- a/rdagent/components/benchmark/eval_method.py +++ b/rdagent/components/benchmark/eval_method.py @@ -7,13 +7,13 @@ from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS from rdagent.components.coder.factor_coder.CoSTEER.evaluators import ( FactorCorrelationEvaluator, + FactorEqualValueCountEvaluator, FactorEvaluator, FactorIndexEvaluator, FactorIndexFormatEvaluator, FactorMissingValuesEvaluator, FactorRowCountEvaluator, FactorSingleColumnEvaluator, - FactorValuesEvaluator, ) from rdagent.components.coder.factor_coder.factor import FileBasedFactorImplementation from rdagent.core.exception import ImplementRunException @@ -94,7 +94,7 @@ def eval_case( eval_res = [] for ev in self.evaluator_l: try: - eval_res.append((ev, ev.evaluate(case_gt, case_gen))) + eval_res.append((ev, ev.evaluate(implementation=case_gen, gt_implementation=case_gt))) # if the corr ev is successfully evaluated and achieve the best performance, then break except ImplementRunException as e: return e @@ -122,7 +122,7 @@ def __init__( FactorRowCountEvaluator(), FactorIndexEvaluator(), FactorMissingValuesEvaluator(), - FactorValuesEvaluator(), + FactorEqualValueCountEvaluator(), FactorCorrelationEvaluator(hard_check=False), ] super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs) diff --git a/rdagent/components/coder/factor_coder/CoSTEER/__init__.py b/rdagent/components/coder/factor_coder/CoSTEER/__init__.py index 63c749e5e..cfcb519fe 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/__init__.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/__init__.py @@ -3,7 +3,7 @@ from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS from rdagent.components.coder.factor_coder.CoSTEER.evaluators import ( - FactorEvaluatorV1, + FactorEvaluatorForCoder, FactorMultiEvaluator, ) from rdagent.components.coder.factor_coder.CoSTEER.evolvable_subjects 
import ( @@ -45,7 +45,7 @@ def __init__( self.knowledge_self_gen = knowledge_self_gen self.evolving_strategy = FactorEvolvingStrategyWithGraph() # declare the factor evaluator - self.factor_evaluator = FactorMultiEvaluator(FactorEvaluatorV1()) + self.factor_evaluator = FactorMultiEvaluator(FactorEvaluatorForCoder()) self.evolving_version = 2 def load_or_init_knowledge_base(self, former_knowledge_base_path: Path = None, component_init_list: list = []): diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py index 43699f167..7eb26705a 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py @@ -15,7 +15,7 @@ from rdagent.core.conf import RD_AGENT_SETTINGS from rdagent.core.evaluation import Evaluator from rdagent.core.evolving_framework import Feedback, QueriedKnowledge -from rdagent.core.experiment import Implementation +from rdagent.core.experiment import Implementation, Task from rdagent.core.log import RDAgentLog from rdagent.core.prompts import Prompts from rdagent.core.utils import multiprocessing_wrapper @@ -31,36 +31,49 @@ class FactorEvaluator(Evaluator): @abstractmethod def evaluate( self, - gt: Implementation, - gen: Implementation, + target_task: Task, + implementation: Implementation, + gt_implementation: Implementation, + **kwargs, ) -> Tuple[str, object]: """You can get the dataframe by .. code-block:: python - _, gt_df = gt.execute() - _, gen_df = gen.execute() + _, gen_df = implementation.execute() + _, gt_df = gt_implementation.execute() Returns ------- Tuple[str, object] - str: the text-based description of the evaluation result - - object: a comparable metric (bool, integer, float ...) + - object: a comparable metric (bool, integer, float ...) 
None for evaluator with only text-based result """ raise NotImplementedError("Please implement the `evaluator` method") - def _get_df(self, gt: Implementation, gen: Implementation): - _, gt_df = gt.execute() - _, gen_df = gen.execute() + def _get_df(self, gt_implementation: Implementation, implementation: Implementation): + if gt_implementation is not None: + _, gt_df = gt_implementation.execute() + if isinstance(gt_df, pd.Series): + gt_df = gt_df.to_frame("gt_factor") + if isinstance(gt_df, pd.DataFrame): + gt_df = gt_df.sort_index() + else: + gt_df = None + + _, gen_df = implementation.execute() if isinstance(gen_df, pd.Series): gen_df = gen_df.to_frame("source_factor") - if isinstance(gt_df, pd.Series): - gt_df = gt_df.to_frame("gt_factor") + if isinstance(gen_df, pd.DataFrame): + gen_df = gen_df.sort_index() return gt_df, gen_df + def __str__(self) -> str: + return self.__class__.__name__ + -class FactorCodeEvaluator(Evaluator): +class FactorCodeEvaluator(FactorEvaluator): def evaluate( self, target_task: FactorTask, @@ -76,28 +89,7 @@ def evaluate( system_prompt = evaluate_prompts["evaluator_code_feedback_v1_system"] execution_feedback_to_render = execution_feedback - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - evaluate_prompts["evaluator_code_feedback_v1_user"], - ) - .render( - factor_information=factor_information, - code=code, - execution_feedback=execution_feedback_to_render, - factor_value_feedback=factor_value_feedback, - gt_code=gt_implementation.code if gt_implementation else None, - ) - ) - while ( - APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - former_messages=[], - ) - > RD_AGENT_SETTINGS.chat_token_limit - ): - execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + for _ in range(10): # 10 times to split the content is enough user_prompt = ( Environment(undefined=StrictUndefined) .from_string( @@ -111,48 +103,49 @@ def evaluate( gt_code=gt_implementation.code if gt_implementation else None, ) ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > RD_AGENT_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break critic_response = APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, system_prompt=system_prompt, json_mode=False, ) - return critic_response + return critic_response, None class FactorSingleColumnEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) - - if len(gen_df.columns) == 1 and len(gt_df.columns) == 1: - return "Both dataframes have only one column.", True - elif len(gen_df.columns) != 1: - gen_df = gen_df.iloc(axis=1)[ - [ - 0, - ] - ] + _, gen_df = self._get_df(gt_implementation, implementation) + + if len(gen_df.columns) == 1: + return "The source dataframe has only one column which is correct.", True + else: return ( "The source dataframe has more than one column. Please check the implementation. 
We only evaluate the first column.", False, ) - return "", False - - def __str__(self) -> str: - return self.__class__.__name__ class FactorIndexFormatEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) idx_name_right = gen_df.index.names == ("datetime", "instrument") if idx_name_right: return ( @@ -165,17 +158,14 @@ def evaluate( False, ) - def __str__(self) -> str: - return self.__class__.__name__ - class FactorRowCountEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) if gen_df.shape[0] == gt_df.shape[0]: return "Both dataframes have the same rows count.", True @@ -185,17 +175,14 @@ def evaluate( False, ) - def __str__(self) -> str: - return self.__class__.__name__ - class FactorIndexEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) if gen_df.index.equals(gt_df.index): return "Both dataframes have the same index.", True @@ -205,17 +192,14 @@ def evaluate( False, ) - def __str__(self) -> str: - return self.__class__.__name__ - class FactorMissingValuesEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) if gen_df.isna().sum().sum() == gt_df.isna().sum().sum(): return "Both dataframes have the same missing values.", True @@ -225,17 +209,14 @@ def evaluate( False, ) - def __str__(self) -> str: - return self.__class__.__name__ - -class FactorValuesEvaluator(FactorEvaluator): +class FactorEqualValueCountEvaluator(FactorEvaluator): def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) try: close_values = gen_df.sub(gt_df).abs().lt(1e-6) @@ -255,9 +236,6 @@ def evaluate( acc_rate, ) - def __str__(self) -> str: - return self.__class__.__name__ - class FactorCorrelationEvaluator(FactorEvaluator): def __init__(self, hard_check: bool) -> None: @@ -265,10 +243,10 @@ def __init__(self, hard_check: bool) -> None: def evaluate( self, - gt: Implementation, - gen: Implementation, + implementation: Implementation, + gt_implementation: Implementation, ) -> Tuple[str, object]: - gt_df, gen_df = self._get_df(gt, gen) + gt_df, gen_df = self._get_df(gt_implementation, implementation) concat_df = pd.concat([gen_df, gt_df], axis=1) concat_df.columns = ["source", "gt"] @@ -294,186 +272,57 @@ def evaluate( else: return f"The ic is ({ic:.6f}) and the rankic is ({ric:.6f}).", ic - def __str__(self) -> str: - return self.__class__.__name__ - -class FactorValEvaluator(FactorEvaluator): - def evaluate(self, gt: Implementation, gen: Implementation): - _, 
gt_df = gt.execute() - _, gen_df = gen.execute() - # FIXME: refactor the two classes - fiv = FactorValueEvaluator() - return fiv.evaluate(source_df=gen_df, gt_df=gt_df) - - def __str__(self) -> str: - return self.__class__.__name__ +class FactorValueEvaluator(FactorEvaluator): - -class FactorValueEvaluator(Evaluator): - # TODO: let's discuss the about the interface of the evaluator def evaluate( self, - source_df: pd.DataFrame, - gt_df: pd.DataFrame, + implementation: Implementation, + gt_implementation: Implementation, **kwargs, ) -> Tuple: conclusions = [] - if isinstance(source_df, pd.Series): - source_df = source_df.to_frame("source_factor") - conclusions.append( - "The source dataframe is a series, better convert it to a dataframe.", - ) - if gt_df is not None and isinstance(gt_df, pd.Series): - gt_df = gt_df.to_frame("gt_factor") - conclusions.append( - "The ground truth dataframe is a series, convert it to a dataframe.", - ) - # Check if both dataframe has only one columns - if len(source_df.columns) == 1: - conclusions.append("The source dataframe has only one column which is correct.") - else: - conclusions.append( - "The source dataframe has more than one column. Please check the implementation. We only evaluate the first column.", - ) - source_df = source_df.iloc(axis=1)[ - [ - 0, - ] - ] - - if list(source_df.index.names) != ["datetime", "instrument"]: - conclusions.append( - rf"The index of the dataframe is not (\"datetime\", \"instrument\"), instead is {source_df.index.names}. Please check the implementation.", - ) - else: - conclusions.append( - 'The index of the dataframe is ("datetime", "instrument") and align with the predefined format.', - ) + feedback_str, _ = FactorSingleColumnEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) + + # Check if the index of the dataframe is ("datetime", "instrument") + feedback_str, _ = FactorIndexFormatEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) # Check if both dataframe have the same rows count - if gt_df is not None: - if source_df.shape[0] == gt_df.shape[0]: - conclusions.append("Both dataframes have the same rows count.") - same_row_count_result = True - else: - conclusions.append( - f"The source dataframe and the ground truth dataframe have different rows count. The source dataframe has {source_df.shape[0]} rows, while the ground truth dataframe has {gt_df.shape[0]} rows. Please check the implementation.", - ) - same_row_count_result = False + if gt_implementation is not None: + feedback_str, _ = FactorRowCountEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) - # Check whether both dataframe has the same index - if source_df.index.equals(gt_df.index): - conclusions.append("Both dataframes have the same index.") - same_index_result = True - else: - conclusions.append( - "The source dataframe and the ground truth dataframe have different index. Please check the implementation.", - ) - same_index_result = False + feedback_str, same_index_result = FactorIndexEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) - # Check for the same missing values (NaN) - if source_df.isna().sum().sum() == gt_df.isna().sum().sum(): - conclusions.append("Both dataframes have the same missing values.") - same_missing_values_result = True - else: - conclusions.append( - f"The dataframes do not have the same missing values. 
The source dataframe has {source_df.isna().sum().sum()} missing values, while the ground truth dataframe has {gt_df.isna().sum().sum()} missing values. Please check the implementation.", - ) - same_missing_values_result = False + feedback_str, _ = FactorMissingValuesEvaluator().evaluate(implementation, gt_implementation) + conclusions.append(feedback_str) + + feedback_str, equal_value_ratio_result = FactorEqualValueCountEvaluator().evaluate( + implementation, gt_implementation + ) + conclusions.append(feedback_str) - # Check if the values are the same within a small tolerance - if not same_index_result: - conclusions.append( - "The source dataframe and the ground truth dataframe have different index. Give up comparing the values and correlation because it's useless", + if same_index_result: + feedback_str, high_correlation_result = FactorCorrelationEvaluator(hard_check=True).evaluate( + implementation, gt_implementation ) - same_values_result = False - high_correlation_result = False else: - close_values = source_df.sub(gt_df).abs().lt(1e-6) - if close_values.all().iloc[0]: - conclusions.append( - "All values in the dataframes are equal within the tolerance of 1e-6.", - ) - same_values_result = True - else: - conclusions.append( - "Some values differ by more than the tolerance of 1e-6. Check for rounding errors or differences in the calculation methods.", - ) - same_values_result = False - - # Check the ic and rankic between the two dataframes - concat_df = pd.concat([source_df, gt_df], axis=1) - concat_df.columns = ["source", "gt"] - try: - ic = concat_df.groupby("datetime").apply(lambda df: df["source"].corr(df["gt"])).dropna().mean() - ric = ( - concat_df.groupby("datetime") - .apply(lambda df: df["source"].corr(df["gt"], method="spearman")) - .dropna() - .mean() - ) - - if ic > 0.99 and ric > 0.99: - conclusions.append( - f"The dataframes are highly correlated. The ic is {ic:.6f} and the rankic is {ric:.6f}.", - ) - high_correlation_result = True - else: - conclusions.append( - f"The dataframes are not sufficiently high correlated. The ic is {ic:.6f} and the rankic is {ric:.6f}. Investigate the factors that might be causing the discrepancies and ensure that the logic of the factor calculation is consistent.", - ) - high_correlation_result = False - - # Check for shifted alignments only in the "datetime" index - max_shift_days = 2 - for shift in range(-max_shift_days, max_shift_days + 1): - if shift == 0: - continue # Skip the case where there is no shift - - shifted_source_df = source_df.groupby(level="instrument").shift(shift) - concat_df = pd.concat([shifted_source_df, gt_df], axis=1) - concat_df.columns = ["source", "gt"] - shifted_ric = ( - concat_df.groupby("datetime") - .apply(lambda df: df["source"].corr(df["gt"], method="spearman")) - .dropna() - .mean() - ) - if shifted_ric > 0.99: - conclusions.append( - f"The dataframes are highly correlated with a shift of {max_shift_days} days in the 'date' index. Shifted rankic: {shifted_ric:.6f}.", - ) - break - else: - conclusions.append( - f"No sufficient correlation found when shifting up to {max_shift_days} days in the 'date' index. Investigate the factors that might be causing discrepancies.", - ) - - except Exception as e: - RDAgentLog().warning(f"Error occurred when calculating the correlation: {str(e)}") - conclusions.append( - f"Some error occurred when calculating the correlation. Investigate the factors that might be causing the discrepancies and ensure that the logic of the factor calculation is consistent. 
Error: {e}", - ) - high_correlation_result = False + high_correlation_result = False + feedback_str = "The source dataframe and the ground truth dataframe have different index. Give up comparing the values and correlation because it's useless" + conclusions.append(feedback_str) # Combine all conclusions into a single string conclusion_str = "\n".join(conclusions) - final_result = (same_values_result or high_correlation_result) if gt_df is not None else False - return conclusion_str, final_result - - -# TODO: -def shorten_prompt(tpl: str, render_kwargs: dict, shorten_key: str, max_trail: int = 10) -> str: - """When the prompt is too long. We have to shorten it. - But we should not truncate the prompt directly, so we should find the key we want to shorten and then shorten it. - """ - # TODO: this should replace most of code in - # - FactorFinalDecisionEvaluator.evaluate - # - FactorCodeEvaluator.evaluate + same_value_or_high_correlation = ( + ((equal_value_ratio_result > 0.99) or high_correlation_result) if gt_implementation is not None else False + ) + return conclusion_str, same_value_or_high_correlation class FactorFinalDecisionEvaluator(Evaluator): @@ -489,31 +338,8 @@ def evaluate( "evaluator_final_decision_v1_system" ] execution_feedback_to_render = execution_feedback - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - evaluate_prompts["evaluator_final_decision_v1_user"], - ) - .render( - factor_information=target_task.get_factor_information(), - execution_feedback=execution_feedback_to_render, - code_feedback=code_feedback, - factor_value_feedback=( - value_feedback - if value_feedback is not None - else "No Ground Truth Value provided, so no evaluation on value is performed." - ), - ) - ) - while ( - APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - former_messages=[], - ) - > RD_AGENT_SETTINGS.chat_token_limit - ): - execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + + for _ in range(10): # 10 times to split the content is enough user_prompt = ( Environment(undefined=StrictUndefined) .from_string( @@ -530,6 +356,16 @@ def evaluate( ), ) ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > RD_AGENT_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break final_evaluation_dict = json.loads( APIBackend().build_messages_and_create_chat_completion( @@ -586,14 +422,14 @@ class FactorMultiFeedback( """Feedback contains a list, each element is the corresponding feedback for each factor implementation.""" -class FactorEvaluatorV1(FactorEvaluator): +class FactorEvaluatorForCoder(FactorEvaluator): """This class is the v1 version of evaluator for a single factor implementation. It calls several evaluators in share modules to evaluate the factor implementation. """ def __init__(self) -> None: - self.code_evaluator = FactorCodeEvaluator() self.value_evaluator = FactorValueEvaluator() + self.code_evaluator = FactorCodeEvaluator() self.final_decision_evaluator = FactorFinalDecisionEvaluator() def evaluate( @@ -625,51 +461,39 @@ def evaluate( ) else: factor_feedback = FactorSingleFeedback() + + # 1. 
Get factor execution feedback to generated implementation and remove the long list of numbers in execution feedback ( - factor_feedback.execution_feedback, - source_df, + execution_feedback, + gen_df, ) = implementation.execute() - # Remove the long list of numbers in the feedback - pattern = r"(?<=\D)(,\s+-?\d+\.\d+){50,}(?=\D)" - factor_feedback.execution_feedback = re.sub(pattern, ", ", factor_feedback.execution_feedback) - execution_feedback_lines = [ - line for line in factor_feedback.execution_feedback.split("\n") if "warning" not in line.lower() - ] - factor_feedback.execution_feedback = "\n".join(execution_feedback_lines) + execution_feedback = re.sub(r"(?<=\D)(,\s+-?\d+\.\d+){50,}(?=\D)", ", ", execution_feedback) + factor_feedback.execution_feedback = "\n".join( + [line for line in execution_feedback.split("\n") if "warning" not in line.lower()] + ) - if source_df is None: + # 2. Get factor value feedback + if gen_df is None: factor_feedback.factor_value_feedback = "No factor value generated, skip value evaluation." factor_feedback.value_generated_flag = False - value_decision = None + same_value_or_high_correlation = None else: factor_feedback.value_generated_flag = True - if gt_implementation is not None: - _, gt_df = gt_implementation.execute(store_result=True) - else: - gt_df = None - try: - source_df = source_df.sort_index() - if gt_df is not None: - gt_df = gt_df.sort_index() - ( - factor_feedback.factor_value_feedback, - value_decision, - ) = self.value_evaluator.evaluate(source_df=source_df, gt_df=gt_df) - except Exception as e: - RDAgentLog().warning("Value evaluation failed with exception: %s", e) - factor_feedback.factor_value_feedback = "Value evaluation failed." - value_decision = False + ( + factor_feedback.factor_value_feedback, + same_value_or_high_correlation, + ) = self.value_evaluator.evaluate(implementation=implementation, gt_implementation=gt_implementation) factor_feedback.final_decision_based_on_gt = gt_implementation is not None - if value_decision is not None and value_decision is True: - # To avoid confusion, when value_decision is True, we do not need code feedback + if same_value_or_high_correlation is not None and same_value_or_high_correlation is True: + # To avoid confusion, when same_value_or_high_correlation is True, we do not need code feedback factor_feedback.code_feedback = "Final decision is True and there are no code critics." - factor_feedback.final_decision = value_decision + factor_feedback.final_decision = same_value_or_high_correlation factor_feedback.final_feedback = "Value evaluation passed, skip final decision evaluation." 
else: - factor_feedback.code_feedback = self.code_evaluator.evaluate( + factor_feedback.code_feedback, _ = self.code_evaluator.evaluate( target_task=target_task, implementation=implementation, execution_feedback=factor_feedback.execution_feedback, @@ -689,7 +513,7 @@ def evaluate( class FactorMultiEvaluator(Evaluator): - def __init__(self, single_evaluator=FactorEvaluatorV1()) -> None: + def __init__(self, single_evaluator=FactorEvaluatorForCoder()) -> None: super().__init__() self.single_factor_implementation_evaluator = single_evaluator @@ -704,9 +528,7 @@ def evaluate( # for index in range(len(evo.sub_tasks)): # corresponding_implementation = evo.sub_implementations[index] # corresponding_gt_implementation = ( - # evo.sub_gt_implementations[index] - # if evo.sub_gt_implementations is not None - # else None + # evo.sub_gt_implementations[index] if evo.sub_gt_implementations is not None else None # ) # multi_implementation_feedback.append( @@ -744,3 +566,13 @@ def evaluate( RDAgentLog().info(f"Final decisions: {final_decision} True count: {final_decision.count(True)}") return multi_implementation_feedback + + +# TODO: +def shorten_prompt(tpl: str, render_kwargs: dict, shorten_key: str, max_trail: int = 10) -> str: + """When the prompt is too long. We have to shorten it. + But we should not truncate the prompt directly, so we should find the key we want to shorten and then shorten it. + """ + # TODO: this should replace most of code in + # - FactorFinalDecisionEvaluator.evaluate + # - FactorCodeEvaluator.evaluate diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index bdaaf11a4..72d019951 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -218,5 +218,4 @@ def from_folder(task: FactorTask, path: Union[str, Path], **kwargs): return FileBasedFactorImplementation(task, code=code, **kwargs) -class FactorExperiment(Experiment[FactorTask, FileBasedFactorImplementation]): - ... +class FactorExperiment(Experiment[FactorTask, FileBasedFactorImplementation]): ... diff --git a/rdagent/scenarios/qlib/factor_task_implementation/__init__.py b/rdagent/scenarios/qlib/factor_task_implementation/__init__.py index 48ff2234a..0facb840d 100644 --- a/rdagent/scenarios/qlib/factor_task_implementation/__init__.py +++ b/rdagent/scenarios/qlib/factor_task_implementation/__init__.py @@ -1,4 +1,4 @@ from rdagent.components.coder.factor_coder.CoSTEER import FactorCoSTEER -COSTEERFG_QUANT_FACTOR_IMPLEMENTATION = FactorCoSTEER +QlibFactorCoSTEER = FactorCoSTEER # TODO: This is a placeholder. We need to split the scenario part of the task implementation into this folder diff --git a/rdagent/scenarios/qlib/model_task_implementation/__init__.py b/rdagent/scenarios/qlib/model_task_implementation/__init__.py index ee2e24dd7..fad4c2842 100644 --- a/rdagent/scenarios/qlib/model_task_implementation/__init__.py +++ b/rdagent/scenarios/qlib/model_task_implementation/__init__.py @@ -1,4 +1,5 @@ from rdagent.components.coder.model_coder.one_shot import ModelCodeWriter -class QlibModelCodeWriter(ModelCodeWriter): ... +class QlibModelCodeWriter(ModelCodeWriter): + ...
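
Note (illustrative, not part of the patch): after this change, the benchmark loop in eval_method.py and the coder-side FactorEvaluatorForCoder call every factor value evaluator through the same keyword interface, and each evaluator returns a (feedback_text, metric) tuple. The sketch below shows that aligned call pattern; compare_against_ground_truth is a hypothetical helper, and the Implementation objects are assumed to come from your scenario.

from rdagent.components.coder.factor_coder.CoSTEER.evaluators import (
    FactorCorrelationEvaluator,
    FactorEqualValueCountEvaluator,
    FactorIndexEvaluator,
)


def compare_against_ground_truth(implementation, gt_implementation):
    """Run a few of the shared value evaluators the way the benchmark loop does."""
    results = {}
    for ev in [
        FactorIndexEvaluator(),
        FactorEqualValueCountEvaluator(),
        FactorCorrelationEvaluator(hard_check=False),
    ]:
        # Same keyword order as eval_case: the generated implementation first,
        # the ground-truth implementation second.
        feedback, metric = ev.evaluate(
            implementation=implementation,
            gt_implementation=gt_implementation,
        )
        results[str(ev)] = (feedback, metric)  # str(ev) resolves to the evaluator class name
    return results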
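
The refactored FactorValueEvaluator composes the single-purpose evaluators and reduces them to one decision. A minimal restatement of that rule, assuming the equal-value ratio and correlation flags have already been computed by the evaluators above (this is a simplified sketch, not the actual implementation):

def final_value_decision(equal_value_ratio, high_correlation, has_ground_truth):
    # Pass when more than 99% of values agree with the ground truth within 1e-6,
    # or when the hard correlation check (run only if the indices match) succeeds;
    # without a ground-truth implementation the value check never passes.
    if not has_ground_truth:
        return False
    return equal_value_ratio > 0.99 or high_correlation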
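
Both LLM-facing evaluators (FactorCodeEvaluator and FactorFinalDecisionEvaluator) now shrink an over-long prompt inside a bounded loop instead of duplicating the render before and after a token check. A sketch of that pattern under stated assumptions: render_user_prompt and count_tokens are hypothetical stand-ins for the Jinja2 render and APIBackend().build_messages_and_calculate_token.

def fit_prompt_to_token_limit(execution_feedback, render_user_prompt, count_tokens, token_limit):
    # Re-render the prompt each round, halving the execution feedback that is
    # embedded in it, until the message fits under the token limit
    # (bounded at 10 rounds, as in the patch).
    feedback = execution_feedback
    user_prompt = ""
    for _ in range(10):
        user_prompt = render_user_prompt(feedback)
        if count_tokens(user_prompt) <= token_limit:
            break
        feedback = feedback[len(feedback) // 2:]
    return user_prompt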