Skip to content

Commit

Permalink
add a task loader interface && move pdf analysis to pdf task loader
Browse files Browse the repository at this point in the history
  • Loading branch information
peteryangms committed Jun 17, 2024
1 parent b3ae882 commit df3ac3b
Show file tree
Hide file tree
Showing 8 changed files with 34 additions and 60 deletions.
Original file line number Diff line number Diff line change
@@ -1,44 +1,15 @@
# %%
from pathlib import Path

from dotenv import load_dotenv
from rdagent.document_process.document_analysis_to_factors import (
check_factor_viability,
classify_report_from_dict,
deduplicate_factors_by_llm,
extract_factors_from_report_dict,
merge_file_to_factor_dict_to_factor_dict,
)
from rdagent.document_process.document_reader import load_and_process_pdfs_by_langchain
from rdagent.factor_implementation.share_modules.factor_implementation_utils import load_data_from_dict
from rdagent.factor_implementation.CoSTEER import CoSTEERFG
from dotenv import load_dotenv
from rdagent.factor_implementation.task_loader.pdf_loader import FactorImplementationTaskLoaderFromPDFfiles

# Call load_dotenv() outside the assert: under `python -O` assert statements are
# stripped, which would otherwise silently skip loading the .env file.
_dotenv_loaded = load_dotenv()
assert _dotenv_loaded, "load_dotenv() reported that no environment variable was set"


def extract_factors(report_file_path: str) -> dict:
    """Run the report-analysis pipeline and return the resulting factor dict.

    The path is loaded with ``load_and_process_pdfs_by_langchain``; the loaded
    documents are classified, factors are extracted and merged into a single
    dict, checked for viability and finally deduplicated by the LLM.

    Args:
        report_file_path: Path handed to ``load_and_process_pdfs_by_langchain``.

    Returns:
        The deduplicated factor dict produced by ``deduplicate_factors_by_llm``.
    """
    docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))

    selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1)
    file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
    factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)

    factor_viability = check_factor_viability(factor_dict)
    # The second return value (list of duplicated names) is not needed here.
    factor_dict, _ = deduplicate_factors_by_llm(factor_dict, factor_viability)
    return factor_dict


def implement_factors(factor_dict: dict):
    """Turn a factor dict into tasks and run ``CoSTEERFG`` on them.

    Args:
        factor_dict: Factor dict passed to ``load_data_from_dict``.

    Returns:
        Whatever ``CoSTEERFG().generate`` returns for the loaded tasks.
        (The previous ``-> None`` annotation was wrong: a value is returned.)
    """
    factor_tasks = load_data_from_dict(factor_dict)
    return CoSTEERFG().generate(factor_tasks)


def extract_factors_and_implement(report_file_path: str):
    """Load factor tasks from the given report path and implement them.

    Tasks are produced by ``FactorImplementationTaskLoaderFromPDFfiles`` and
    then handed to ``CoSTEERFG().generate``. The earlier
    ``extract_factors`` / ``implement_factors`` calls are dropped because their
    results were immediately overwritten by this loader-based pipeline.

    Args:
        report_file_path: Path passed to the PDF task loader.

    Returns:
        Whatever ``CoSTEERFG().generate`` returns for the loaded tasks.
    """
    factor_tasks = FactorImplementationTaskLoaderFromPDFfiles().load(report_file_path)
    implementation_result = CoSTEERFG().generate(factor_tasks)
    return implementation_result


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion rdagent/core/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ def __init__(

class TaskLoader:
    """Interface for objects that load one task or a list of tasks."""

    # NOTE(review): the class header shown here does not use ABC/ABCMeta, so
    # @abstractmethod alone does not prevent instantiation; the explicit raise
    # below is what signals a missing override.
    @abstractmethod
    def load(self, *args, **kwargs) -> BaseTask | list[BaseTask]:
        """Load task(s); concrete loaders are expected to override this.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError("load method is not implemented.")
File renamed without changes.
10 changes: 10 additions & 0 deletions rdagent/factor_implementation/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,13 @@ select_implementable_factor_user: |-
{% endfor %}
{% endif %}
{% endfor %}
analyze_component_prompt_v1_system: |-
User is getting a new task that might consist of the components below (given in component_index: component_description):
{{all_component_content}}
You should find out which components the new task has, and put their indices in a list.
Please give your response in JSON format. Here is an example structure for the JSON output; please strictly follow the format:
{
"component_no_list": the list containing indices of components.
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,3 @@ def get_data_folder_intro():
f"file type {p.name} is not supported. Please implement its description function.",
)
return "\n ----------------- file spliter -------------\n".join(content_l)


def load_data_from_dict(factor_dict: dict) -> list[FactorImplementTask]:
    """Build one ``FactorImplementTask`` per entry of ``factor_dict``.

    Each key is used as the factor name; each value is indexed with the
    "description", "formulation" and "variables" keys.
    """
    return [
        FactorImplementTask(
            factor_name=name,
            factor_description=spec["description"],
            factor_formulation=spec["formulation"],
            variables=spec["variables"],
        )
        for name, spec in factor_dict.items()
    ]
6 changes: 3 additions & 3 deletions rdagent/factor_implementation/task_loader/json_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


class FactorImplementationTaskLoaderFromDict(TaskLoader):
def load(factor_dict: dict) -> list:
def load(self, factor_dict: dict) -> list:
"""Load data from a dict."""
task_l = []
for factor_name, factor_data in factor_dict.items():
Expand All @@ -20,12 +20,12 @@ def load(factor_dict: dict) -> list:


class FactorImplementationTaskLoaderFromJsonFile(TaskLoader):
    """Task loader that reads the factor dict from a JSON file."""

    def load(self, json_file_path: Path) -> list:
        """Parse the JSON file and convert its content into tasks.

        Args:
            json_file_path: Location of the JSON file holding the factor dict.

        Returns:
            The list produced by ``FactorImplementationTaskLoaderFromDict().load``.
        """
        # json.load() requires a readable file object; passing the path itself
        # fails with AttributeError, so open the file first.
        with Path(json_file_path).open(encoding="utf-8") as json_file:
            factor_dict = json.load(json_file)
        return FactorImplementationTaskLoaderFromDict().load(factor_dict)


class FactorImplementationTaskLoaderFromJsonString(TaskLoader):
    """Task loader that reads the factor dict from a JSON string."""

    def load(self, json_string: str) -> list:
        """Parse ``json_string`` and convert the resulting dict into tasks.

        Args:
            json_string: JSON text passed to ``json.loads``.

        Returns:
            The list produced by ``FactorImplementationTaskLoaderFromDict().load``.
        """
        factor_dict = json.loads(json_string)
        return FactorImplementationTaskLoaderFromDict().load(factor_dict)
16 changes: 16 additions & 0 deletions rdagent/factor_implementation/task_loader/pdf_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from rdagent.core.conf import RD_Agent_Settings
from rdagent.core.log import RDAgentLog
from rdagent.core.prompts import Prompts
from rdagent.core.task import TaskLoader
from rdagent.document_reader.document_reader import load_and_process_pdfs_by_langchain
from rdagent.factor_implementation.task_loader.json_loader import FactorImplementationTaskLoaderFromDict
from rdagent.oai.llm_utils import APIBackend, create_embedding_with_multiprocessing
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
Expand Down Expand Up @@ -566,3 +569,16 @@ def deduplicate_factors_by_llm( # noqa: C901, PLR0912
llm_deduplicated_factor_dict[factor_name] = factor_dict[factor_name]

return llm_deduplicated_factor_dict, final_duplication_names_list


class FactorImplementationTaskLoaderFromPDFfiles(TaskLoader):
    """Task loader that derives factor tasks from PDF report(s)."""

    def load(self, file_or_folder_path: Path) -> list:
        """Analyse the PDF(s) at the given path and return the factor tasks.

        The documents are loaded, classified, mined for factors, merged into a
        single factor dict, checked for viability and deduplicated by the LLM;
        the resulting dict is then converted into tasks.

        Args:
            file_or_folder_path: Path handed to
                ``load_and_process_pdfs_by_langchain``.

        Returns:
            The list produced by ``FactorImplementationTaskLoaderFromDict().load``
            (the previous ``-> dict`` annotation did not match this).
        """
        docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))

        selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1)
        file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
        factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)

        factor_viability = check_factor_viability(factor_dict)
        # The list of duplicated names is not needed for building tasks.
        factor_dict, _ = deduplicate_factors_by_llm(factor_dict, factor_viability)
        return FactorImplementationTaskLoaderFromDict().load(factor_dict)
9 changes: 0 additions & 9 deletions rdagent/knowledge_management/prompts.yaml

This file was deleted.

0 comments on commit df3ac3b

Please sign in to comment.