From 3ac57b77107e3217a4879f69cb4fe478c8f9c2e8 Mon Sep 17 00:00:00 2001 From: Javier <160494147+javier-cohere@users.noreply.github.com> Date: Fri, 8 Nov 2024 11:32:20 +0100 Subject: [PATCH] Add parameters for the new pptx parser (#35) This PR introduces a new class, `PresentationParsingStrategy`, which extends the `StrEnum` class. This class provides two parsing strategies for presentations: `Unstructured` and `ImageToMarkdown`. The `Unstructured` strategy is set as the default. The `ParserConfig` class is also updated to include a new parameter, `presentation_parsing_strategy`, which is set to the `Unstructured` strategy by default. ## Changes: - Added the `PresentationParsingStrategy` class. - Added the `presentation_parsing_strategy` parameter to the `ParserConfig` class. --- compass_sdk/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/compass_sdk/__init__.py b/compass_sdk/__init__.py index d0e0ed6..abcd4bf 100644 --- a/compass_sdk/__init__.py +++ b/compass_sdk/__init__.py @@ -266,6 +266,15 @@ def _missing_(cls, value): return cls.QuickText +class PresentationParsingStrategy(StrEnum): + Unstructured = "Unstructured" + ImageToMarkdown = "ImageToMarkdown" + + @classmethod + def _missing_(cls, value): + return cls.Unstructured + + class ParserConfig(BaseModel): """ CompassParser configuration. Important parameters: @@ -319,6 +328,7 @@ class ParserConfig(BaseModel): horizontal_table_crop_margin: int = 100 pdf_parsing_strategy: PDFParsingStrategy = PDFParsingStrategy.QuickText + presentation_parsing_strategy: PresentationParsingStrategy = PresentationParsingStrategy.Unstructured ### Document indexing