First version of factor idea proposal (#46)

* update all code * save code * update first version of factor proposal * change a comment * remove a useless comment --------- Co-authored-by: xuyang1 <[email protected]>
microsoft · Jul 4, 2024 · 637ad10 · 637ad10
1 parent fc91f36
commit 637ad10
Show file tree

Hide file tree

Showing 17 changed files with 354 additions and 98 deletions.
diff --git a/rdagent/app/model_proposal/conf.py b/rdagent/app/model_proposal/conf.py
diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py
@@ -0,0 +1,17 @@
+from pydantic_settings import BaseSettings
+
+
+class PropSetting(BaseSettings):
+    """Settings for the Qlib factor RD loop.
+
+    Every string field holds the dotted path of a class that is loaded with
+    `import_class` and instantiated by `rdagent/app/qlib_rd_loop/factor.py`.
+    """
+
+    # Scenario class; instantiated without arguments.
+    scen: str = "rdagent.scenarios.qlib.experiment.factor_experiment.QlibFactorScenario"
+    # Hypothesis generator class; instantiated with the scenario instance.
+    hypothesis_gen: str = "rdagent.scenarios.qlib.factor_proposal.QlibFactorHypothesisGen"
+    # Converter class whose `convert` turns a hypothesis set into an experiment.
+    hypothesis2experiment: str = "rdagent.scenarios.qlib.factor_proposal.QlibFactorHypothesis2Experiment"
+    # First `generate` stage applied to the experiment in the loop.
+    qlib_factor_coder: str = "rdagent.components.task_implementation.factor_implementation.CoSTEER.CoSTEERFG"
+    # Second `generate` stage applied to the experiment in the loop.
+    qlib_factor_runner: str = "rdagent.scenarios.qlib.task_generator.data.QlibFactorRunner"
+    # Class whose `summarize` produces the feedback stored in the trace.
+    qlib_factor_summarizer: str = "rdagent.scenarios.qlib.task_generator.feedback.QlibFactorExperiment2Feedback"
+
+    # Number of iterations the RD loop runs.
+    evolving_n: int = 10
+
+
+# Module-level settings instance imported by the RD-loop script.
+PROP_SETTING = PropSetting()
diff --git a/rdagent/app/qlib_rd_loop/factor.py b/rdagent/app/qlib_rd_loop/factor.py
@@ -0,0 +1,38 @@
+"""
+TODO: Factor Structure RD-Loop
+"""
+
+# import_from
+from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.core.proposal import (
+    Experiment2Feedback,
+    Hypothesis2Experiment,
+    HypothesisGen,
+    HypothesisSet,
+    Trace,
+)
+from rdagent.core.task_generator import TaskGenerator
+from rdagent.core.utils import import_class
+
+# Resolve every pluggable component from its dotted path in PROP_SETTING and instantiate it.
+scen = import_class(PROP_SETTING.scen)()
+
+hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
+
+hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
+
+qlib_factor_coder: TaskGenerator = import_class(PROP_SETTING.qlib_factor_coder)()
+qlib_factor_runner: TaskGenerator = import_class(PROP_SETTING.qlib_factor_runner)()
+
+qlib_factor_summarizer: Experiment2Feedback = import_class(PROP_SETTING.qlib_factor_summarizer)()
+
+
+# The trace collects one (hypothesis, experiment, feedback) record per iteration.
+trace = Trace(scen=scen)
+# Pass an explicit new list so this set owns its own hypothesis list.
+hs = HypothesisSet(trace=trace, hypothesis_list=[])
+for _ in range(PROP_SETTING.evolving_n):
+    hypothesis = hypothesis_gen.gen(trace)
+    # Record the new hypothesis before converting: the trace is only updated at the
+    # end of the iteration, so otherwise `convert` could not reach it through `hs`.
+    # NOTE(review): assumes converters read `hs.hypothesis_list` - confirm against the
+    # concrete Hypothesis2Experiment implementation.
+    hs.hypothesis_list.append(hypothesis)
+    exp = hypothesis2experiment.convert(hs)
+    exp = qlib_factor_coder.generate(exp)
+    exp = qlib_factor_runner.generate(exp)
+    feedback = qlib_factor_summarizer.summarize(exp)
+
+    trace.hist.append((hypothesis, exp, feedback))
diff --git a/rdagent/app/model_proposal/run.py → rdagent/app/qlib_rd_loop/model.py b/rdagent/app/model_proposal/run.py → rdagent/app/qlib_rd_loop/model.py
diff --git a/rdagent/components/idea_proposal/factor_proposal.py b/rdagent/components/idea_proposal/factor_proposal.py
@@ -1,13 +1,18 @@
 from abc import abstractmethod
 from pathlib import Path
+from typing import Tuple
 
 from jinja2 import Environment, StrictUndefined
 
+from rdagent.components.task_implementation.factor_implementation.factor import (
+    FactorExperiment,
+)
 from rdagent.core.prompts import Prompts
 from rdagent.core.proposal import (
     Hypothesis,
-    Hypothesis2Task,
+    Hypothesis2Experiment,
     HypothesisGen,
+    HypothesisSet,
     Scenario,
     Trace,
 )
@@ -22,40 +27,71 @@
 class FactorHypothesisGen(HypothesisGen):
+    """Generate a factor hypothesis by prompting the LLM backend.
+
+    Subclasses supply the prompt context (`prepare_context`) and the parsing of
+    the raw response (`convert_response`); `gen` renders the prompts, sends the
+    request and returns the parsed hypothesis.
+    """
+
     def __init__(self, scen: Scenario):
         super().__init__(scen)
-        self.gen_context_flag = False
-        self.gen_context_dict = None
-        self.gen_json_flag = False
 
     # The following methods are scenario related so they should be implemented in the subclass
+    # prepare_context returns (context dict, json flag). `gen` reads the keys
+    # "hypothesis_output_format", "hypothesis_and_feedback" and "RAG" from the dict
+    # and passes the flag to the backend as `json_mode`.
+    # NOTE(review): `HypothesisGen` is not declared with an ABC base in the visible
+    # code, so `@abstractmethod` may not be enforced at instantiation - confirm.
     @abstractmethod
-    def prepare_gen_context(self, trace: Trace) -> None: ...
+    def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: ...
 
+    # convert_response turns the raw backend response string into a FactorHypothesis.
     @abstractmethod
-    def gen_response_to_hypothesis_list(self, response: str) -> FactorHypothesis: ...
+    def convert_response(self, response: str) -> FactorHypothesis: ...
 
     def gen(self, trace: Trace) -> FactorHypothesis:
+        """Build the prompts from the trace, query the backend and return the parsed hypothesis.
+
+        The system prompt receives the scenario description and the output format;
+        the user prompt receives the previous hypotheses with feedback and the RAG text.
+        """
-        assert self.gen_context_flag, "Please call prepare_gen_context before calling gen."
-        self.gen_context_flag = False  # reset the flag
+        context_dict, json_flag = self.prepare_context(trace)
 
         system_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(prompt_dict["factor_hypothesis_gen"]["system_prompt"])
-            .render(scenario=self.scen.get_scenario_all_desc())
+            .render(
+                scenario=self.scen.get_scenario_all_desc(),
+                hypothesis_output_format=context_dict["hypothesis_output_format"],
+            )
         )
         user_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(prompt_dict["factor_hypothesis_gen"]["user_prompt"])
-            .render(self.gen_context_dict)
+            .render(
+                hypothesis_and_feedback=context_dict["hypothesis_and_feedback"],
+                RAG=context_dict["RAG"],
+            )
         )
 
-        resp = APIBackend().build_messages_and_create_chat_completion(
-            user_prompt, system_prompt, json_mode=self.gen_json_flag
-        )
+        resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=json_flag)
 
-        hypothesis = self.gen_response_to_hypothesis_list(resp)
+        hypothesis = self.convert_response(resp)
 
         return hypothesis
 
 
-class FactorHypothesis2Task(Hypothesis2Task):
-    def convert(self, bs: FactorHypothesis) -> None: ...
+class FactorHypothesis2Experiment(Hypothesis2Experiment[FactorExperiment]):
+    """Turn a hypothesis set into a `FactorExperiment` by prompting the LLM backend.
+
+    Subclasses provide the prompt context and the parsing of the raw response;
+    `convert` renders the prompts, sends the request and returns the parsed result.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    @abstractmethod
+    def prepare_context(self, hs: HypothesisSet) -> Tuple[dict, bool]:
+        """Return the prompt context dict and the flag passed to the backend as `json_mode`.
+
+        `convert` reads the keys "experiment_output_format", "hypothesis_and_feedback",
+        "factor_list" and "RAG" from the dict.
+        """
+
+    @abstractmethod
+    def convert_response(self, response: str) -> FactorExperiment:
+        """Parse the raw backend response string into a `FactorExperiment`."""
+
+    def convert(self, hs: HypothesisSet) -> FactorExperiment:
+        """Render both prompts for `hs`, query the backend and return the parsed experiment."""
+        prompt_context, use_json = self.prepare_context(hs)
+
+        # System prompt: scenario description plus the required output format.
+        system_template = Environment(undefined=StrictUndefined).from_string(
+            prompt_dict["factor_hypothesis2experiment"]["system_prompt"]
+        )
+        system_prompt = system_template.render(
+            scenario=hs.trace.scen.get_scenario_all_desc(),
+            experiment_output_format=prompt_context["experiment_output_format"],
+        )
+
+        # User prompt: earlier hypotheses with feedback, earlier factors, and the RAG text.
+        user_template = Environment(undefined=StrictUndefined).from_string(
+            prompt_dict["factor_hypothesis2experiment"]["user_prompt"]
+        )
+        user_prompt = user_template.render(
+            hypothesis_and_feedback=prompt_context["hypothesis_and_feedback"],
+            factor_list=prompt_context["factor_list"],
+            RAG=prompt_context["RAG"],
+        )
+
+        response = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt, system_prompt, json_mode=use_json
+        )
+        return self.convert_response(response)
diff --git a/rdagent/components/idea_proposal/prompts.yaml b/rdagent/components/idea_proposal/prompts.yaml
@@ -3,20 +3,36 @@ factor_hypothesis_gen:
     The user is trying to generate new hypothesis on the factors in data-driven research and development.
     The factors are used in a certain scenario, the scenario is as follows:
     {{ scenario }}
-    The user has made several hypothesis on this sencario and did several evaluation on them. The user will provide this information to you.
+    The user has made several hypothesis on this scenario and did several evaluation on them. The user will provide this information to you.
     To help you generate new hypothesis, the user has prepared some additional information for you. You should use this information to help generate new factors.
+    Please generate the output following the format below:
+    {{ hypothesis_output_format }}
   user_prompt: |-
-    The user has made several hypothesis on this sencario and did several evaluation on them.
+    The user has made several hypothesis on this scenario and did several evaluation on them.
     The former hypothesis and the corresponding feedbacks are as follows:
     {{ hypothesis_and_feedback }}
     To help you generate new factors, we have prepared the following information for you:
     {{ RAG }}
-    Please generate the new hypothesis based on the information above and generate the output following the format below:
-    {{ factor_output_format }}
+    Please generate the new hypothesis based on the information above.
 
-factor_hypothesis_to_tasks:
+factor_hypothesis2experiment:
   system_prompt: |-
     The user is trying to generate new factors based on the hypothesis generated in the previous step. 
     The factors are used in certain scenario, the scenario is as follows:
     {{ scenario }}
-
+    The user will use the factors generated to do some experiments. The user will provide this information to you:
+    1. The hypothesis generated in the previous steps and their corresponding feedbacks.
+    2. Former proposed factors on similar hypothesis.
+    3. Some additional information to help you generate new factors.
+    Please generate the output following the format below:
+    {{ experiment_output_format }}
+    
+  user_prompt: |-
+    The user has made several hypothesis on this scenario and did several evaluation on them.
+    The former hypothesis and the corresponding feedbacks are as follows:
+    {{ hypothesis_and_feedback }}
+    The former proposed factors on similar hypothesis are as follows:
+    {{ factor_list }}
+    To help you generate new factors, we have prepared the following information for you:
+    {{ RAG }}
+    Please generate the new factors based on the information above.
diff --git a/rdagent/components/task_implementation/factor_implementation/evolving/evolvable_subjects.py b/rdagent/components/task_implementation/factor_implementation/evolving/evolvable_subjects.py
@@ -7,7 +7,7 @@
 from rdagent.core.log import RDAgentLog
 
 
-class FactorEvolvingItem(FactorExperiment[FactorTask, FileBasedFactorImplementation], EvolvableSubjects):
+class FactorEvolvingItem(FactorExperiment, EvolvableSubjects):
     """
     Intermediate item of factor implementation.
     """

diff --git a/rdagent/components/task_implementation/factor_implementation/factor.py b/rdagent/components/task_implementation/factor_implementation/factor.py
@@ -220,4 +220,4 @@ def from_folder(task: FactorTask, path: Union[str, Path], **kwargs):
         return FileBasedFactorImplementation(task, code=code, **kwargs)
 
 
-FactorExperiment = Experiment
+# Experiment specialised to FactorTask sub-tasks and FileBasedFactorImplementation results.
+class FactorExperiment(Experiment[FactorTask, FileBasedFactorImplementation]): ...
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
@@ -119,7 +119,7 @@ class Experiment(ABC, Generic[ASpecificTask, ASpecificImp]):
     The experiment is a sequence of tasks and the implementations of the tasks after generated by the TaskGenerator.
     """
 
-    def __init__(self, sub_tasks: Sequence[Task]) -> None:
+    def __init__(self, sub_tasks: Sequence[ASpecificTask]) -> None:
         self.sub_tasks = sub_tasks
         self.sub_implementations: Sequence[ASpecificImp] = [None for _ in self.sub_tasks]
 

diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py
@@ -2,7 +2,8 @@
 
 """
 
-from typing import Dict, List, Tuple
+from abc import ABC, abstractmethod
+from typing import Dict, Generic, List, Tuple, TypeVar
 
 from rdagent.core.evolving_framework import Feedback
 from rdagent.core.experiment import Experiment, Implementation, Loader, Task
@@ -18,35 +19,53 @@ class Hypothesis:
     - Belief
     """
 
-    hypothesis: str = None
-    reason: str = None
+    def __init__(self, hypothesis: str, reason: str) -> None:
+        """Store the hypothesis text and the reason given for it."""
+        self.hypothesis: str = hypothesis
+        self.reason: str = reason
 
     # source: data_ana | model_nan = None
 
 
 # Origin(path of repo/data/feedback) => view/summarization => generated Hypothesis
 
 
-class Scenario:
-    def get_repo_path(self):
-        """codebase"""
+class Scenario(ABC):
+    """Abstract description of a scenario.
+
+    Subclasses implement four description properties (background, source data,
+    interface, simulator) and `get_scenario_all_desc`, which combines them.
+    """
 
-    def get_data(self):
-        """ "data info"""
+    @property
+    @abstractmethod
+    def background(self):
+        """Background information"""
 
-    def get_env(self):
-        """env description"""
+    @property
+    @abstractmethod
+    def source_data(self):
+        """Source data description"""
 
+    @property
+    @abstractmethod
+    def interface(self):
+        """Interface description about how to run the code"""
+
+    @property
+    @abstractmethod
+    def simulator(self):
+        """Simulator description"""
+
+    @abstractmethod
     def get_scenario_all_desc(self) -> str:
         """Combine all the description together"""
 
 
 class HypothesisFeedback(Feedback): ...
 
 
-class Trace:
-    scen: Scenario
-    hist: list[Tuple[Hypothesis, Experiment, HypothesisFeedback]]
+ASpecificScen = TypeVar("ASpecificScen", bound=Scenario)
+
+
+class Trace(Generic[ASpecificScen]):
+    """A scenario together with the history of rounds recorded for it."""
+
+    def __init__(self, scen: ASpecificScen) -> None:
+        """Bind the trace to `scen` and start with an empty history."""
+        self.scen: ASpecificScen = scen
+        # One (hypothesis, experiment, feedback) tuple per recorded round.
+        self.hist: list[Tuple[Hypothesis, Experiment, HypothesisFeedback]] = []
 
 
 class HypothesisGen:
@@ -74,16 +93,21 @@ class HypothesisSet:
     true_hypothesis or false_hypothesis
     """
 
-    hypothesis_list: list[Hypothesis]
-    trace: Trace
+    def __init__(self, trace: Trace, hypothesis_list: "list[Hypothesis] | None" = None) -> None:
+        """Create a hypothesis set bound to a trace.
+
+        Args:
+            trace: The trace this set belongs to.
+            hypothesis_list: Initial hypotheses. When omitted (or None), a new
+                empty list is created for this instance.
+        """
+        # A literal `[]` default is evaluated once and would be shared by every
+        # instance built without the argument, so create the empty list per call.
+        self.hypothesis_list: list[Hypothesis] = [] if hypothesis_list is None else hypothesis_list
+        self.trace: Trace = trace
+
+
+ASpecificExp = TypeVar("ASpecificExp", bound=Experiment)
 
 
-class Hypothesis2Experiment(Loader[Experiment]):
+class Hypothesis2Experiment(ABC, Generic[ASpecificExp]):
     """
     [Abstract description => concrete description] => Code implement
     """
 
-    def convert(self, bs: HypothesisSet) -> Experiment:
+    @abstractmethod
+    def convert(self, hs: HypothesisSet) -> ASpecificExp:
         """Connect the idea proposal to implementation"""
         ...
 
@@ -99,3 +123,4 @@ def summarize(self, ti: Experiment) -> HypothesisFeedback:
         The `ti` should be executed and the results should be included.
         For example: `mlflow` of Qlib will be included.
         """
+        return HypothesisFeedback()
diff --git a/rdagent/scenarios/qlib/experiment/factor_experiment.py b/rdagent/scenarios/qlib/experiment/factor_experiment.py
@@ -1,5 +1,43 @@
+from pathlib import Path
+
 from rdagent.components.task_implementation.factor_implementation.factor import (
     FactorExperiment,
 )
+from rdagent.components.task_implementation.factor_implementation.utils import (
+    get_data_folder_intro,
+)
+from rdagent.core.prompts import Prompts
+from rdagent.core.proposal import Scenario
+
+prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
 
 QlibFactorExperiment = FactorExperiment
+
+
+class QlibFactorScenario(Scenario):
+    """Scenario whose descriptions come from the prompt file and the data-folder helper."""
+
+    @property
+    def background(self) -> str:
+        """Background text stored under "qlib_factor_background" in the prompt file."""
+        return prompt_dict["qlib_factor_background"]
+
+    @property
+    def source_data(self) -> str:
+        """Source data description as returned by `get_data_folder_intro`."""
+        return get_data_folder_intro()
+
+    @property
+    def interface(self) -> str:
+        """Interface text stored under "qlib_factor_interface" in the prompt file."""
+        return prompt_dict["qlib_factor_interface"]
+
+    @property
+    def simulator(self) -> str:
+        """Simulator text stored under "qlib_factor_simulator" in the prompt file."""
+        return prompt_dict["qlib_factor_simulator"]
+
+    def get_scenario_all_desc(self) -> str:
+        """Return the four descriptions, each under its heading line, as one text."""
+        sections = (
+            f"Background of the scenario:\n{self.background}\n",
+            f"The source data you can use:\n{self.source_data}\n",
+            f"The interface you should follow to write the runnable code:\n{self.interface}\n",
+            f"The simulator user can use to test your factor:\n{self.simulator}\n",
+        )
+        return "".join(sections)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -220,4 +220,4 @@ def from_folder(task: FactorTask, path: Union[str, Path], **kwargs):
		return FileBasedFactorImplementation(task, code=code, **kwargs)


		FactorExperiment = Experiment
		class FactorExperiment(Experiment[FactorTask, FileBasedFactorImplementation]): ...