diff --git a/py/core/base/providers/ingestion.py b/py/core/base/providers/ingestion.py
index a4d27b2de..f199a6ab5 100644
--- a/py/core/base/providers/ingestion.py
+++ b/py/core/base/providers/ingestion.py
@@ -7,33 +7,88 @@
 from .base import Provider, ProviderConfig
 from .database import DatabaseProvider
 from .llm import CompletionProvider
+from .base import AppConfig
+from typing import ClassVar
+from pydantic import BaseModel, Field
 
 logger = logging.getLogger()
 
-
 class IngestionConfig(ProviderConfig):
-    provider: str = "r2r"
-    excluded_parsers: list[str] = ["mp4"]
-    chunk_enrichment_settings: ChunkEnrichmentSettings = (
-        ChunkEnrichmentSettings()
+    _defaults: ClassVar[dict] = {
+        "app": AppConfig(),
+        "provider": "r2r",
+        "excluded_parsers": ["mp4"],
+        "chunk_enrichment_settings": ChunkEnrichmentSettings(),
+        "extra_parsers": {},
+        "audio_transcription_model": "openai/whisper-1",
+        "vision_img_prompt_name": "vision_img",
+        "vision_img_model": "openai/gpt-4o",
+        "vision_pdf_prompt_name": "vision_pdf",
+        "vision_pdf_model": "openai/gpt-4o",
+        "skip_document_summary": False,
+        "document_summary_system_prompt": "default_system",
+        "document_summary_task_prompt": "default_summary",
+        "chunks_for_document_summary": 128,
+        "document_summary_model": "openai/gpt-4o-mini",
+        "parser_overrides": {},
+        "extra_fields": {}
+    }
+
+    provider: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["provider"]
+    )
+    excluded_parsers: list[str] = Field(
+        default_factory=lambda: IngestionConfig._defaults["excluded_parsers"]
+    )
+    chunk_enrichment_settings: ChunkEnrichmentSettings = Field(
+        default_factory=lambda: IngestionConfig._defaults["chunk_enrichment_settings"]
+    )
+    extra_parsers: dict[str, str] = Field(
+        default_factory=lambda: IngestionConfig._defaults["extra_parsers"]
+    )
+    audio_transcription_model: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["audio_transcription_model"]
+    )
+    vision_img_prompt_name: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["vision_img_prompt_name"]
+    )
+    vision_img_model: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["vision_img_model"]
+    )
+    vision_pdf_prompt_name: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["vision_pdf_prompt_name"]
+    )
+    vision_pdf_model: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["vision_pdf_model"]
+    )
+    skip_document_summary: bool = Field(
+        default_factory=lambda: IngestionConfig._defaults["skip_document_summary"]
+    )
+    document_summary_system_prompt: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["document_summary_system_prompt"]
+    )
+    document_summary_task_prompt: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["document_summary_task_prompt"]
+    )
+    chunks_for_document_summary: int = Field(
+        default_factory=lambda: IngestionConfig._defaults["chunks_for_document_summary"]
+    )
+    document_summary_model: str = Field(
+        default_factory=lambda: IngestionConfig._defaults["document_summary_model"]
+    )
+    parser_overrides: dict[str, str] = Field(
+        default_factory=lambda: IngestionConfig._defaults["parser_overrides"]
     )
-    extra_parsers: dict[str, str] = {}
-
-    audio_transcription_model: str = "openai/whisper-1"
-
-    vision_img_prompt_name: str = "vision_img"
-    vision_img_model: str = "openai/gpt-4o"
-
-    vision_pdf_prompt_name: str = "vision_pdf"
-    vision_pdf_model: str = "openai/gpt-4o"
-
-    skip_document_summary: bool = False
-    document_summary_system_prompt: str = "default_system"
-    document_summary_task_prompt: str = "default_summary"
-    chunks_for_document_summary: int = 128
-    document_summary_model: str = "openai/gpt-4o-mini"
-    parser_overrides: dict[str, str] = {}
+
+    @classmethod
+    def set_default(cls, **kwargs):
+        for key, value in kwargs.items():
+            if key in cls._defaults:
+                cls._defaults[key] = value
+            else:
+                raise AttributeError(
+                    f"No default attribute '{key}' in IngestionConfig"
+                )
 
     @property
     def supported_providers(self) -> list[str]:
@@ -43,6 +98,14 @@ def validate_config(self) -> None:
         if self.provider not in self.supported_providers:
             raise ValueError(f"Provider {self.provider} is not supported.")
 
+    @classmethod
+    def get_default(cls, mode: str, app) -> "IngestionConfig":
+        """Return default ingestion configuration for a given mode."""
+        if mode == "hi-res":
+            return cls(app=app, parser_overrides={"pdf": "zerox"})
+        else:
+            return cls(app=app)
+
     @classmethod
     def get_default(cls, mode: str, app) -> "IngestionConfig":
         """Return default ingestion configuration for a given mode."""
@@ -59,6 +122,36 @@ def get_default(cls, mode: str, app) -> "IngestionConfig":
 
         return cls(app=app)
 
+    @classmethod
+    def set_default(cls, **kwargs):
+        for key, value in kwargs.items():
+            if key in cls._defaults:
+                cls._defaults[key] = value
+            else:
+                raise AttributeError(
+                    f"No default attribute '{key}' in GenerationConfig"
+                )
+
+    class Config:
+        populate_by_name = True
+        json_schema_extra = {
+            "provider": "r2r",
+            "excluded_parsers": ["mp4"],
+            "chunk_enrichment_settings": ChunkEnrichmentSettings().dict(),
+            "extra_parsers": {},
+            "audio_transcription_model": "openai/whisper-1",
+            "vision_img_prompt_name": "vision_img",
+            "vision_img_model": "openai/gpt-4o",
+            "vision_pdf_prompt_name": "vision_pdf",
+            "vision_pdf_model": "openai/gpt-4o",
+            "skip_document_summary": False,
+            "document_summary_system_prompt": "default_system",
+            "document_summary_task_prompt": "default_summary",
+            "chunks_for_document_summary": 128,
+            "document_summary_model": "openai/gpt-4o-mini",
+            "parser_overrides": {},
+        }
+
 
 class IngestionProvider(Provider, ABC):
     config: IngestionConfig
diff --git a/py/core/main/api/v3/documents_router.py b/py/core/main/api/v3/documents_router.py
index c6ba01cd0..16e10dbb2 100644
--- a/py/core/main/api/v3/documents_router.py
+++ b/py/core/main/api/v3/documents_router.py
@@ -275,6 +275,7 @@ async def create_document(
                 ingestion_mode=ingestion_mode,
                 ingestion_config=ingestion_config,
             )
+            print('effective_ingestion_config = ', effective_ingestion_config)
             if not file and not raw_text and not chunks:
                 raise R2RException(
                     status_code=422,
diff --git a/py/core/main/config.py b/py/core/main/config.py
index 75e1477f4..f2343bb26 100644
--- a/py/core/main/config.py
+++ b/py/core/main/config.py
@@ -123,6 +123,11 @@ def __init__(self, config_data: dict[str, Any]):
         self.agent = AgentConfig.create(**self.agent, app=self.app)  # type: ignore
         self.orchestration = OrchestrationConfig.create(**self.orchestration, app=self.app)  # type: ignore
 
+
+        IngestionConfig.set_default(
+            **self.ingestion.dict()
+        )
+
         # override GenerationConfig defaults
         GenerationConfig.set_default(
             **self.completion.generation_config.dict()
diff --git a/py/pyproject.toml b/py/pyproject.toml
index c9006ce52..c352dd43c 100644
--- a/py/pyproject.toml
+++ b/py/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "r2r"
 readme = "README.md"
-version = "3.3.3"
+version = "3.3.4"
 description = "SciPhi R2R"
 authors = ["Owen Colegrove "]
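
For context on the pattern this diff introduces: `IngestionConfig` now keeps its defaults in a class-level `_defaults` dict and resolves every field through `Field(default_factory=...)`, so `R2RConfig.__init__` can call `IngestionConfig.set_default(**self.ingestion.dict())` and any instance created afterwards (for example inside `get_default`) picks up the values from the server's config file. The sketch below is a minimal, self-contained illustration of that mechanism, not R2R code; `ExampleConfig` and its two fields are hypothetical stand-ins.

```python
from typing import ClassVar

from pydantic import BaseModel, Field


class ExampleConfig(BaseModel):
    # Hypothetical stand-in for IngestionConfig's mutable class-level defaults.
    _defaults: ClassVar[dict] = {
        "provider": "r2r",
        "chunks_for_document_summary": 128,
    }

    # default_factory is evaluated at instantiation time, so later
    # set_default() calls change what new instances receive.
    provider: str = Field(
        default_factory=lambda: ExampleConfig._defaults["provider"]
    )
    chunks_for_document_summary: int = Field(
        default_factory=lambda: ExampleConfig._defaults[
            "chunks_for_document_summary"
        ]
    )

    @classmethod
    def set_default(cls, **kwargs):
        """Override class-wide defaults; unknown keys are rejected."""
        for key, value in kwargs.items():
            if key not in cls._defaults:
                raise AttributeError(
                    f"No default attribute '{key}' in ExampleConfig"
                )
            cls._defaults[key] = value


if __name__ == "__main__":
    print(ExampleConfig().chunks_for_document_summary)  # 128 (built-in default)

    # Roughly what R2RConfig.__init__ now does with the ingestion section:
    ExampleConfig.set_default(chunks_for_document_summary=256)

    print(ExampleConfig().chunks_for_document_summary)  # 256 (overridden default)
    print(ExampleConfig(provider="custom").provider)  # explicit arguments still win
```

Because the factories run per instantiation rather than at class-definition time, overrides applied via `set_default` only affect instances constructed afterwards, and explicit keyword arguments continue to take precedence over the class-wide defaults.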