Skip to content

Commit

Permalink
fix provider defaults issue (#1667)
Browse files Browse the repository at this point in the history
  • Loading branch information
emrgnt-cmplxty authored Dec 6, 2024
1 parent 1d5f1eb commit 9735323
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 22 deletions.
135 changes: 114 additions & 21 deletions py/core/base/providers/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,88 @@
from .base import Provider, ProviderConfig
from .database import DatabaseProvider
from .llm import CompletionProvider
from .base import AppConfig
from typing import ClassVar
from pydantic import BaseModel, Field

logger = logging.getLogger()


class IngestionConfig(ProviderConfig):
provider: str = "r2r"
excluded_parsers: list[str] = ["mp4"]
chunk_enrichment_settings: ChunkEnrichmentSettings = (
ChunkEnrichmentSettings()
_defaults: ClassVar[dict] = {
"app": AppConfig(),
"provider": "r2r",
"excluded_parsers": ["mp4"],
"chunk_enrichment_settings": ChunkEnrichmentSettings(),
"extra_parsers": {},
"audio_transcription_model": "openai/whisper-1",
"vision_img_prompt_name": "vision_img",
"vision_img_model": "openai/gpt-4o",
"vision_pdf_prompt_name": "vision_pdf",
"vision_pdf_model": "openai/gpt-4o",
"skip_document_summary": False,
"document_summary_system_prompt": "default_system",
"document_summary_task_prompt": "default_summary",
"chunks_for_document_summary": 128,
"document_summary_model": "openai/gpt-4o-mini",
"parser_overrides": {},
"extra_fields": {}
}

# Field defaults are looked up in ``IngestionConfig._defaults`` each time an
# instance is created, so values installed through ``set_default`` apply to
# every instance built afterwards.
#
# List and dict defaults are copied per instance. The object returned by a
# ``default_factory`` is used as the field value directly, so handing out the
# stored container would let an in-place edit on one instance (for example
# ``config.excluded_parsers.append(...)``) silently change the class-wide
# defaults and every later instance. The containers hold only strings, so a
# shallow copy is sufficient.
provider: str = Field(
    default_factory=lambda: IngestionConfig._defaults["provider"]
)
excluded_parsers: list[str] = Field(
    default_factory=lambda: list(
        IngestionConfig._defaults["excluded_parsers"]
    )
)
# NOTE(review): this still hands out the single stored settings object; if
# ChunkEnrichmentSettings is mutable it should be copied too — confirm.
chunk_enrichment_settings: ChunkEnrichmentSettings = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "chunk_enrichment_settings"
    ]
)
extra_parsers: dict[str, str] = Field(
    default_factory=lambda: dict(IngestionConfig._defaults["extra_parsers"])
)
audio_transcription_model: str = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "audio_transcription_model"
    ]
)
vision_img_prompt_name: str = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "vision_img_prompt_name"
    ]
)
vision_img_model: str = Field(
    default_factory=lambda: IngestionConfig._defaults["vision_img_model"]
)
vision_pdf_prompt_name: str = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "vision_pdf_prompt_name"
    ]
)
vision_pdf_model: str = Field(
    default_factory=lambda: IngestionConfig._defaults["vision_pdf_model"]
)
skip_document_summary: bool = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "skip_document_summary"
    ]
)
document_summary_system_prompt: str = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "document_summary_system_prompt"
    ]
)
document_summary_task_prompt: str = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "document_summary_task_prompt"
    ]
)
chunks_for_document_summary: int = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "chunks_for_document_summary"
    ]
)
document_summary_model: str = Field(
    default_factory=lambda: IngestionConfig._defaults[
        "document_summary_model"
    ]
)
parser_overrides: dict[str, str] = Field(
    default_factory=lambda: dict(
        IngestionConfig._defaults["parser_overrides"]
    )
)
extra_parsers: dict[str, str] = {}

audio_transcription_model: str = "openai/whisper-1"

vision_img_prompt_name: str = "vision_img"
vision_img_model: str = "openai/gpt-4o"

vision_pdf_prompt_name: str = "vision_pdf"
vision_pdf_model: str = "openai/gpt-4o"

skip_document_summary: bool = False
document_summary_system_prompt: str = "default_system"
document_summary_task_prompt: str = "default_summary"
chunks_for_document_summary: int = 128
document_summary_model: str = "openai/gpt-4o-mini"

parser_overrides: dict[str, str] = {}
@classmethod
def set_default(cls, **kwargs):
    """Override class-level default values used for new instances.

    Args:
        **kwargs: New values keyed by default name. Every name must
            already be a key of ``cls._defaults``.

    Raises:
        AttributeError: If any name is not a known default. All names are
            checked before anything is written, so a failed call leaves
            ``cls._defaults`` untouched instead of half-updated.
    """
    for key in kwargs:
        if key not in cls._defaults:
            raise AttributeError(
                f"No default attribute '{key}' in IngestionConfig"
            )
    cls._defaults.update(kwargs)

@property
def supported_providers(self) -> list[str]:
Expand All @@ -43,6 +98,14 @@ def validate_config(self) -> None:
if self.provider not in self.supported_providers:
raise ValueError(f"Provider {self.provider} is not supported.")

@classmethod
def get_default(cls, mode: str, app) -> "IngestionConfig":
    """Build the default ingestion configuration for ``mode``.

    For ``"hi-res"`` the instance is created with ``parser_overrides`` set
    to ``{"pdf": "zerox"}``; every other mode gets the plain defaults.

    NOTE(review): the visible diff shows another ``get_default`` with the
    same signature further down; if both sit in the same class body, the
    later definition is the one that takes effect — confirm and dedupe.
    """
    mode_overrides = {}
    if mode == "hi-res":
        mode_overrides["parser_overrides"] = {"pdf": "zerox"}
    return cls(app=app, **mode_overrides)

@classmethod
def get_default(cls, mode: str, app) -> "IngestionConfig":
"""Return default ingestion configuration for a given mode."""
Expand All @@ -59,6 +122,36 @@ def get_default(cls, mode: str, app) -> "IngestionConfig":
return cls(app=app)


@classmethod
def set_default(cls, **kwargs):
    """Override class-level default values used for new instances.

    Args:
        **kwargs: New values keyed by default name. Every name must
            already be a key of ``cls._defaults``.

    Raises:
        AttributeError: If any name is not a known default. All names are
            checked before anything is written, so a failed call leaves
            ``cls._defaults`` untouched instead of half-updated.
    """
    for key in kwargs:
        if key not in cls._defaults:
            # The message names this class; it previously said
            # "GenerationConfig", which pointed at the wrong config.
            raise AttributeError(
                f"No default attribute '{key}' in IngestionConfig"
            )
    cls._defaults.update(kwargs)

class Config:
    # Model configuration for the enclosing config class.
    populate_by_name = True
    # Extra JSON-schema content listing one value per ingestion setting.
    # This dict literal is evaluated once, when the class body runs, so it
    # does not follow later changes made to the class-level defaults.
    # NOTE(review): `.dict()` is deprecated in pydantic v2 in favour of
    # `model_dump()` — confirm which pydantic major version is pinned.
    json_schema_extra = {
        "provider": "r2r",
        "excluded_parsers": ["mp4"],
        "chunk_enrichment_settings": ChunkEnrichmentSettings().dict(),
        "extra_parsers": {},
        "audio_transcription_model": "openai/whisper-1",
        "vision_img_prompt_name": "vision_img",
        "vision_img_model": "openai/gpt-4o",
        "vision_pdf_prompt_name": "vision_pdf",
        "vision_pdf_model": "openai/gpt-4o",
        "skip_document_summary": False,
        "document_summary_system_prompt": "default_system",
        "document_summary_task_prompt": "default_summary",
        "chunks_for_document_summary": 128,
        "document_summary_model": "openai/gpt-4o-mini",
        "parser_overrides": {},
    }

class IngestionProvider(Provider, ABC):

config: IngestionConfig
Expand Down
1 change: 1 addition & 0 deletions py/core/main/api/v3/documents_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ async def create_document(
ingestion_mode=ingestion_mode,
ingestion_config=ingestion_config,
)
print('effective_ingestion_config = ', effective_ingestion_config)
if not file and not raw_text and not chunks:
raise R2RException(
status_code=422,
Expand Down
5 changes: 5 additions & 0 deletions py/core/main/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ def __init__(self, config_data: dict[str, Any]):
self.agent = AgentConfig.create(**self.agent, app=self.app) # type: ignore
self.orchestration = OrchestrationConfig.create(**self.orchestration, app=self.app) # type: ignore


IngestionConfig.set_default(
**self.ingestion.dict()
)

# override GenerationConfig defaults
GenerationConfig.set_default(
**self.completion.generation_config.dict()
Expand Down
2 changes: 1 addition & 1 deletion py/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "r2r"
readme = "README.md"
version = "3.3.3"
version = "3.3.4"

description = "SciPhi R2R"
authors = ["Owen Colegrove <[email protected]>"]
Expand Down

0 comments on commit 9735323

Please sign in to comment.