From 34f61d98bfcbd6bc975ccc4a90791d26a3db4300 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 2 Aug 2024 05:58:26 +0000 Subject: [PATCH] use multiprocessing to calculate IC and some minor fix --- constraints/3.10.txt | 113 ++++++++++++++++++ constraints/3.11.txt | 113 ++++++++++++++++++ .../coder/factor_coder/CoSTEER/evaluators.py | 1 - .../components/coder/factor_coder/config.py | 2 +- rdagent/components/coder/model_coder/conf.py | 2 +- .../knowledge_management/vector_base.py | 3 - .../scenarios/qlib/developer/factor_runner.py | 7 +- rdagent/scenarios/qlib/developer/feedback.py | 10 +- requirements.txt | 2 + 9 files changed, 244 insertions(+), 9 deletions(-) diff --git a/constraints/3.10.txt b/constraints/3.10.txt index 0eb3965a..9de7bbc9 100644 --- a/constraints/3.10.txt +++ b/constraints/3.10.txt @@ -4,30 +4,51 @@ alabaster==0.7.13 annotated-types==0.6.0 anyio==4.2.0 appdirs==1.4.4 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 async-timeout==4.0.3 attrs==23.2.0 autodoc-pydantic==2.0.1 azure-ai-formrecognizer==3.3.2 azure-common==1.1.28 azure-core==1.29.6 +azure-identity==1.17.1 Babel==2.14.0 beautifulsoup4==4.12.2 black==23.12.1 +bleach==6.1.0 +blosc2==2.7.1 build==1.0.3 certifi==2023.11.17 cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 colorama==0.4.6 +comm==0.2.2 +contourpy==1.2.1 coverage==7.4.0 cryptography==41.0.7 +cycler==0.12.1 Cython==3.0.7 dataclasses-json==0.6.3 +debugpy==1.8.2 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.8 distro==1.9.0 +docker==7.1.0 docutils==0.20.1 exceptiongroup==1.2.0 +executing==2.0.1 +fastjsonschema==2.20.0 +feedparser==6.0.11 filelock==3.13.1 fire==0.5.0 +fonttools==4.53.1 +fqdn==1.5.1 frozenlist==1.4.1 fsspec==2023.12.2 furo==2023.9.10 @@ -41,35 +62,73 @@ idna==3.6 imagesize==1.4.1 importlib-metadata==7.0.1 iniconfig==2.0.0 +ipykernel==6.29.5 +ipython==8.26.0 +ipywidgets==8.1.3 isodate==0.6.1 +isoduration==20.11.0 isort==5.13.2 jaraco.classes==3.3.0 +jedi==0.19.1 jeepney==0.8.0 Jinja2==3.1.2 +joblib==1.4.2 +json5==0.9.25 jsonpatch==1.33 jsonpointer==2.4 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter==1.0.0 +jupyter-console==6.6.3 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.2 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.4 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.11 keyring==24.3.0 +kiwisolver==1.4.5 langchain==0.0.353 langchain-community==0.0.7 langchain-core==0.1.4 langsmith==0.0.75 +Levenshtein==0.25.1 livereload==2.6.3 loguru==0.7.2 +loguru-mypy==0.0.4 lxml==5.0.0 markdown-it-py==3.0.0 MarkupSafe==2.1.3 marshmallow==3.20.1 +matplotlib==3.9.1 +matplotlib-inline==0.1.7 mdit-py-plugins==0.4.0 mdurl==0.1.2 +mistune==3.0.2 more-itertools==10.1.0 mpmath==1.3.0 +msal==1.30.0 +msal-extensions==1.2.0 +msgpack==1.0.8 msrest==0.7.1 multidict==6.0.4 mypy==1.10.0 mypy-extensions==1.0.0 myst-parser==2.0.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +ndindex==1.8 +nest-asyncio==1.6.0 networkx==3.2.1 nh3==0.2.15 +notebook==7.2.1 +notebook_shim==0.2.4 +numexpr==2.10.1 numpy==1.26.2 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu12==12.1.105 @@ -85,37 +144,69 @@ nvidia-nvjitlink-cu12==12.3.101 nvidia-nvtx-cu12==12.1.105 oauthlib==3.2.2 openai==1.6.1 +overrides==7.7.0 packaging==23.2 +pandarallel==1.6.5 pandas==2.1.4 +pandocfilters==1.5.1 +parso==0.8.4 pathspec==0.12.1 +patsy==0.5.6 +pexpect==4.9.0 +pillow==10.4.0 pkginfo==1.9.6 platformdirs==4.1.0 pluggy==1.3.0 +portalocker==2.10.1 +prometheus_client==0.20.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-cpuinfo==9.0.0 pycparser==2.21 pydantic==2.5.3 pydantic-settings==2.1.0 pydantic_core==2.14.6 Pygments==2.17.2 +PyJWT==2.8.0 +PyMuPDF==1.24.9 +PyMuPDFb==1.24.9 +pyparsing==3.1.2 pypdf==3.17.4 pyproject_hooks==1.0.0 pytest==7.4.4 python-dateutil==2.8.2 python-dotenv==1.0.0 +python-json-logger==2.0.7 +python-Levenshtein==0.25.1 pytz==2023.3.post1 PyYAML==6.0.1 +pyzmq==26.0.3 +qtconsole==5.5.2 +QtPy==2.4.1 +rapidfuzz==3.9.5 readme-renderer==42.0 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-oauthlib==1.3.1 requests-toolbelt==1.0.0 +rfc3339-validator==0.1.4 rfc3986==2.0.0 +rfc3986-validator==0.1.1 rich==13.7.0 +rpds-py==0.19.1 ruamel.yaml==0.18.5 ruamel.yaml.clib==0.2.8 ruff==0.4.5 +scikit-learn==1.5.1 scipy==1.11.4 SecretStorage==3.3.3 semver==3.0.2 +Send2Trash==1.8.3 setuptools-scm==8.0.4 +sgmllib3k==1.0.0 shellingham==1.5.4 six==1.16.0 sniffio==1.3.0 @@ -133,21 +224,43 @@ sphinxcontrib-jsmath==1.0.1 sphinxcontrib-qthelp==1.0.6 sphinxcontrib-serializinghtml==1.1.9 SQLAlchemy==2.0.24 +stack-data==0.6.3 +statsmodels==0.14.2 sympy==1.12 +tables==3.9.2 +tabulate==0.9.0 tenacity==8.2.3 termcolor==2.4.0 +terminado==0.18.1 +threadpoolctl==3.5.0 +tiktoken==0.7.0 +tinycss2==1.3.0 toml-sort==0.23.1 tomli==2.0.1 tomlkit==0.12.3 torch==2.1.2 +torch_geometric==2.5.3 tornado==6.4 tqdm==4.66.1 +traitlets==5.14.3 +tree-sitter==0.22.3 +tree-sitter-python==0.21.0 triton==2.1.0 twine==4.0.2 typer==0.9.0 +types-psutil==6.0.0.20240621 +types-python-dateutil==2.9.0.20240316 +types-PyYAML==6.0.12.20240724 +types-tqdm==4.66.0.20240417 typing-inspect==0.9.0 typing_extensions==4.9.0 tzdata==2023.4 +uri-template==1.3.0 urllib3==2.1.0 +wcwidth==0.2.13 +webcolors==24.6.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.11 yarl==1.9.4 zipp==3.17.0 diff --git a/constraints/3.11.txt b/constraints/3.11.txt index 4ac9c55c..0e800115 100644 --- a/constraints/3.11.txt +++ b/constraints/3.11.txt @@ -4,28 +4,49 @@ alabaster==0.7.13 annotated-types==0.6.0 anyio==4.2.0 appdirs==1.4.4 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 attrs==23.2.0 autodoc-pydantic==2.0.1 azure-ai-formrecognizer==3.3.2 azure-common==1.1.28 azure-core==1.29.6 +azure-identity==1.17.1 Babel==2.14.0 beautifulsoup4==4.12.2 black==23.12.1 +bleach==6.1.0 +blosc2==2.7.1 build==1.0.3 certifi==2023.11.17 cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 colorama==0.4.6 +comm==0.2.2 +contourpy==1.2.1 coverage==7.4.0 cryptography==41.0.7 +cycler==0.12.1 Cython==3.0.7 dataclasses-json==0.6.3 +debugpy==1.8.2 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.8 distro==1.9.0 +docker==7.1.0 docutils==0.20.1 +executing==2.0.1 +fastjsonschema==2.20.0 +feedparser==6.0.11 filelock==3.13.1 fire==0.5.0 +fonttools==4.53.1 +fqdn==1.5.1 frozenlist==1.4.1 fsspec==2023.12.2 furo==2023.9.10 @@ -39,35 +60,73 @@ idna==3.6 imagesize==1.4.1 importlib-metadata==7.0.1 iniconfig==2.0.0 +ipykernel==6.29.5 +ipython==8.26.0 +ipywidgets==8.1.3 isodate==0.6.1 +isoduration==20.11.0 isort==5.13.2 jaraco.classes==3.3.0 +jedi==0.19.1 jeepney==0.8.0 Jinja2==3.1.2 +joblib==1.4.2 +json5==0.9.25 jsonpatch==1.33 jsonpointer==2.4 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter==1.0.0 +jupyter-console==6.6.3 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.2 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.4 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.11 keyring==24.3.0 +kiwisolver==1.4.5 langchain==0.0.353 langchain-community==0.0.7 langchain-core==0.1.4 langsmith==0.0.75 +Levenshtein==0.25.1 livereload==2.6.3 loguru==0.7.2 +loguru-mypy==0.0.4 lxml==5.0.0 markdown-it-py==3.0.0 MarkupSafe==2.1.3 marshmallow==3.20.1 +matplotlib==3.9.1 +matplotlib-inline==0.1.7 mdit-py-plugins==0.4.0 mdurl==0.1.2 +mistune==3.0.2 more-itertools==10.1.0 mpmath==1.3.0 +msal==1.30.0 +msal-extensions==1.2.0 +msgpack==1.0.8 msrest==0.7.1 multidict==6.0.4 mypy==1.10.0 mypy-extensions==1.0.0 myst-parser==2.0.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +ndindex==1.8 +nest-asyncio==1.6.0 networkx==3.2.1 nh3==0.2.15 +notebook==7.2.1 +notebook_shim==0.2.4 +numexpr==2.10.1 numpy==1.26.2 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu12==12.1.105 @@ -83,37 +142,69 @@ nvidia-nvjitlink-cu12==12.3.101 nvidia-nvtx-cu12==12.1.105 oauthlib==3.2.2 openai==1.6.1 +overrides==7.7.0 packaging==23.2 +pandarallel==1.6.5 pandas==2.1.4 +pandocfilters==1.5.1 +parso==0.8.4 pathspec==0.12.1 +patsy==0.5.6 +pexpect==4.9.0 +pillow==10.4.0 pkginfo==1.9.6 platformdirs==4.1.0 pluggy==1.3.0 +portalocker==2.10.1 +prometheus_client==0.20.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-cpuinfo==9.0.0 pycparser==2.21 pydantic==2.5.3 pydantic-settings==2.1.0 pydantic_core==2.14.6 Pygments==2.17.2 +PyJWT==2.9.0 +PyMuPDF==1.24.9 +PyMuPDFb==1.24.9 +pyparsing==3.1.2 pypdf==3.17.4 pyproject_hooks==1.0.0 pytest==7.4.4 python-dateutil==2.8.2 python-dotenv==1.0.0 +python-json-logger==2.0.7 +python-Levenshtein==0.25.1 pytz==2023.3.post1 PyYAML==6.0.1 +pyzmq==26.0.3 +qtconsole==5.5.2 +QtPy==2.4.1 +rapidfuzz==3.9.5 readme-renderer==42.0 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-oauthlib==1.3.1 requests-toolbelt==1.0.0 +rfc3339-validator==0.1.4 rfc3986==2.0.0 +rfc3986-validator==0.1.1 rich==13.7.0 +rpds-py==0.19.1 ruamel.yaml==0.18.5 ruamel.yaml.clib==0.2.8 ruff==0.4.5 +scikit-learn==1.5.1 scipy==1.11.4 SecretStorage==3.3.3 semver==3.0.2 +Send2Trash==1.8.3 setuptools-scm==8.0.4 +sgmllib3k==1.0.0 shellingham==1.5.4 six==1.16.0 sniffio==1.3.0 @@ -131,20 +222,42 @@ sphinxcontrib-jsmath==1.0.1 sphinxcontrib-qthelp==1.0.6 sphinxcontrib-serializinghtml==1.1.9 SQLAlchemy==2.0.24 +stack-data==0.6.3 +statsmodels==0.14.2 sympy==1.12 +tables==3.9.2 +tabulate==0.9.0 tenacity==8.2.3 termcolor==2.4.0 +terminado==0.18.1 +threadpoolctl==3.5.0 +tiktoken==0.7.0 +tinycss2==1.3.0 toml-sort==0.23.1 tomlkit==0.12.3 torch==2.1.2 +torch_geometric==2.5.3 tornado==6.4 tqdm==4.66.1 +traitlets==5.14.3 +tree-sitter==0.22.3 +tree-sitter-python==0.21.0 triton==2.1.0 twine==4.0.2 typer==0.9.0 +types-psutil==6.0.0.20240621 +types-python-dateutil==2.9.0.20240316 +types-PyYAML==6.0.12.20240724 +types-tqdm==4.66.0.20240417 typing-inspect==0.9.0 typing_extensions==4.9.0 tzdata==2023.4 +uri-template==1.3.0 urllib3==2.1.0 +wcwidth==0.2.13 +webcolors==24.6.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.11 yarl==1.9.4 zipp==3.17.0 diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py index bd732bcb..3bb6b0e2 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py @@ -611,7 +611,6 @@ def evaluate( value_feedback=factor_feedback.factor_value_feedback, code_feedback=factor_feedback.code_feedback, ) - logger.info(factor_feedback.final_decision) return factor_feedback diff --git a/rdagent/components/coder/factor_coder/config.py b/rdagent/components/coder/factor_coder/config.py index cc3c301c..df0e6161 100644 --- a/rdagent/components/coder/factor_coder/config.py +++ b/rdagent/components/coder/factor_coder/config.py @@ -44,7 +44,7 @@ class Config: max_loop: int = 10 knowledge_base_path: Union[str, None] = None - new_knowledge_base_path: Union[str, None] = None + new_knowledge_base_path: Union[str, None] = knowledge_base_path python_bin: str = "python" diff --git a/rdagent/components/coder/model_coder/conf.py b/rdagent/components/coder/model_coder/conf.py index ea45041b..af4c07b9 100644 --- a/rdagent/components/coder/model_coder/conf.py +++ b/rdagent/components/coder/model_coder/conf.py @@ -15,7 +15,7 @@ class Config: ) knowledge_base_path: Union[str, None] = None - new_knowledge_base_path: Union[str, None] = None + new_knowledge_base_path: Union[str, None] = knowledge_base_path max_loop: int = 10 diff --git a/rdagent/components/knowledge_management/vector_base.py b/rdagent/components/knowledge_management/vector_base.py index c0d72e5e..f5d22c5e 100644 --- a/rdagent/components/knowledge_management/vector_base.py +++ b/rdagent/components/knowledge_management/vector_base.py @@ -127,8 +127,6 @@ def __init__(self, vector_df_path: Union[str, Path] = None): else: self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"]) - logger.info(f"VectorBase loaded, shape={self.vector_df.shape}") - def shape(self): return self.vector_df.shape @@ -205,4 +203,3 @@ def load(self, vector_df_path, **kwargs): def save(self, vector_df_path, **kwargs): self.vector_df.to_pickle(vector_df_path) - logger.info(f"Save vectorBase vector_df to: {vector_df_path}") diff --git a/rdagent/scenarios/qlib/developer/factor_runner.py b/rdagent/scenarios/qlib/developer/factor_runner.py index 33f4d1c4..30455c43 100644 --- a/rdagent/scenarios/qlib/developer/factor_runner.py +++ b/rdagent/scenarios/qlib/developer/factor_runner.py @@ -3,6 +3,9 @@ from typing import List import pandas as pd +from pandarallel import pandarallel + +pandarallel.initialize(verbose=1) from rdagent.components.runner import CachedRunner from rdagent.components.runner.conf import RUNNER_SETTINGS @@ -56,7 +59,9 @@ def deduplicate_new_factors(self, SOTA_feature: pd.DataFrame, new_feature: pd.Da concat_feature = pd.concat([SOTA_feature, new_feature], axis=1) IC_max = ( concat_feature.groupby("datetime") - .apply(lambda x: self.calculate_information_coefficient(x, SOTA_feature.shape[1], new_feature.shape[1])) + .parallel_apply( + lambda x: self.calculate_information_coefficient(x, SOTA_feature.shape[1], new_feature.shape[1]) + ) .mean() ) IC_max.index = pd.MultiIndex.from_product([range(SOTA_feature.shape[1]), range(new_feature.shape[1])]) diff --git a/rdagent/scenarios/qlib/developer/feedback.py b/rdagent/scenarios/qlib/developer/feedback.py index 6711a87f..c0edfa24 100644 --- a/rdagent/scenarios/qlib/developer/feedback.py +++ b/rdagent/scenarios/qlib/developer/feedback.py @@ -51,7 +51,13 @@ def process_results(current_result, sota_result): # Filter the combined DataFrame to retain only the important metrics filtered_combined_df = combined_df.loc[important_metrics] - return filtered_combined_df.to_dict() + filtered_combined_df[ + "Bigger columns name (Didn't consider the direction of the metric)" + ] = filtered_combined_df.apply( + lambda row: "Current Result" if row["Current Result"] > row["SOTA Result"] else "SOTA Result", axis=1 + ) + + return filtered_combined_df.to_string() class QlibFactorHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): @@ -75,7 +81,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac # Process the results to filter important metrics combined_result = process_results(current_result, sota_result) - logger.info(f"combined_result: {combined_result}") + # logger.info(f"combined_result: {combined_result}") # Generate the system prompt sys_prompt = ( diff --git a/requirements.txt b/requirements.txt index f9ba05af..b6d8f7e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ torch_geometric tabulate # Convert pandas dataframe to markdown table to make it more readable to LLM numpy # we use numpy as default data format. So we have to install numpy pandas # we use pandas as default data format. So we have to install pandas +pandarallel # parallelize pandas feedparser matplotlib langchain @@ -60,3 +61,4 @@ python-dotenv # infrastructure related. docker +