Skip to content

Commit

Permalink
Refactor config
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 14, 2024
1 parent 92b9a2c commit d27b874
Show file tree
Hide file tree
Showing 16 changed files with 50 additions and 57 deletions.
3 changes: 3 additions & 0 deletions marker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
def flush_cuda_memory():
    """Call ``torch.cuda.empty_cache()`` when the model device is CUDA.

    Does nothing when ``settings.TORCH_DEVICE_MODEL`` is anything other than
    the string ``"cuda"``.
    """
    if settings.TORCH_DEVICE_MODEL != "cuda":
        return
    torch.cuda.empty_cache()



6 changes: 3 additions & 3 deletions marker/v2/builders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

from pydantic import BaseModel

from marker.v2.util import assign_config


class BaseBuilder:
def __init__(self, config: Optional[BaseModel] = None):
if config:
for k in config.model_fields:
setattr(self, k, config[k])
assign_config(self, config)

def __call__(self, data, *args, **kwargs):
raise NotImplementedError
4 changes: 2 additions & 2 deletions marker/v2/builders/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder):
return document

def build_document(self, provider: PdfProvider):
if provider.config.page_range is None:
if provider.page_range is None:
page_range = range(len(provider))
else:
page_range = provider.config.page_range
page_range = provider.page_range
assert max(page_range) < len(provider) and min(page_range) >= 0, "Invalid page range"

initial_pages = [
Expand Down
6 changes: 3 additions & 3 deletions marker/v2/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

from pydantic import BaseModel

from marker.v2.util import assign_config


class BaseConverter:
def __init__(self, config: Optional[BaseModel] = None):
if config:
for k in config.model_fields:
setattr(self, k, config[k])
assign_config(self, config)

def __call__(self):
raise NotImplementedError
3 changes: 1 addition & 2 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,14 @@
from marker.v2.builders.structure import StructureBuilder
from marker.v2.converters import BaseConverter
from marker.v2.providers.pdf import PdfProvider
from marker.v2.schema.config.pdf import PdfProviderConfig


class PdfConverter(BaseConverter):
filepath: str
page_range: List[int] | None = None

def __call__(self):
pdf_provider = PdfProvider(self.config.filepath, PdfProviderConfig())
pdf_provider = PdfProvider(self.filepath)

layout_model = load_model()
layout_model.processor = load_processor()
Expand Down
6 changes: 3 additions & 3 deletions marker/v2/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from pydantic import BaseModel

from marker.v2.util import assign_config


class BaseProcessor:
def __init__(self, config: Optional[BaseModel] = None):
if config:
for k in config.model_fields:
setattr(self, k, config[k])
assign_config(self, config)
15 changes: 6 additions & 9 deletions marker/v2/providers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
from typing import List
from typing import List, Optional

from pydantic import BaseModel

from marker.v2.schema.config.provider import ProviderConfig
from marker.v2.schema.text.line import Line
from marker.v2.util import assign_config


class BaseProvider:
def __init__(self, filepath: str, config: ProviderConfig):
def __init__(self, filepath: str, config: Optional[BaseModel] = None):
assign_config(self, config)
self.filepath = filepath
self.config = config

self.setup()

def __len__(self):
pass

def setup(self):
pass

def get_image(self, idx: int, dpi: int):
pass

Expand Down
21 changes: 12 additions & 9 deletions marker/v2/providers/pdf.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
import functools
from typing import Dict, List
from typing import Dict, List, Optional

import pypdfium2 as pdfium
from pdftext.extraction import dictionary_output
from PIL import Image
from pydantic import BaseModel

from marker.v2.providers import BaseProvider
from marker.v2.schema.config.pdf import PdfProviderConfig
from marker.v2.schema.polygon import PolygonBox
from marker.v2.schema.text.line import Line, Span


class PdfProvider(BaseProvider):
def __init__(self, filepath: str, config: PdfProviderConfig):
self.filepath = filepath
self.config = config
self.page_lines: Dict[int, List[Line]] = {}
page_range: List[int] | None = None
pdftext_workers: int = 4
flatten_pdf: bool = True

def __init__(self, filepath: str, config: Optional[BaseModel] = None):
super().__init__(filepath, config)

self.page_lines: Dict[int, List[Line]] = {}
self.doc: pdfium.PdfDocument

self.setup()
Expand Down Expand Up @@ -68,10 +71,10 @@ def setup(self):
self.doc = pdfium.PdfDocument(self.filepath)
page_char_blocks = dictionary_output(
self.filepath,
page_range=self.config.page_range,
page_range=self.page_range,
keep_chars=False,
workers=self.config.pdftext_workers,
flatten_pdf=self.config.flatten_pdf
workers=self.pdftext_workers,
flatten_pdf=self.flatten_pdf
)
for page in page_char_blocks:
page_id = page["page"]
Expand Down
10 changes: 9 additions & 1 deletion marker/v2/renderers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
from typing import Optional

from pydantic import BaseModel


class BaseRenderer:
    """Base class for renderers; copies config fields onto the instance."""

    def __init__(self, config: Optional[BaseModel] = None):
        """Copy every declared field of ``config`` onto this renderer.

        Args:
            config: Optional pydantic model. A falsy value (e.g. ``None``)
                leaves the instance untouched.
        """
        if not config:
            return
        # Pydantic models expose their fields as attributes and are not
        # subscriptable, so ``config[k]`` would raise TypeError; read each
        # value with getattr instead.
        for name in config.model_fields:
            setattr(self, name, getattr(config, name))
8 changes: 0 additions & 8 deletions marker/v2/schema/config/pdf.py

This file was deleted.

9 changes: 0 additions & 9 deletions marker/v2/schema/config/provider.py

This file was deleted.

4 changes: 4 additions & 0 deletions marker/v2/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
def assign_config(cls, config):
    """Copy every declared field of ``config`` onto ``cls`` as attributes.

    Args:
        cls: Object that receives the attributes.
        config: Optional pydantic-style model exposing ``model_fields``. A
            falsy value (e.g. ``None``) leaves ``cls`` untouched.
    """
    if not config:
        return
    # Pydantic models expose their fields as attributes and are not
    # subscriptable, so ``config[k]`` would raise TypeError; read each value
    # with getattr instead.
    for name in config.model_fields:
        setattr(cls, name, getattr(config, name))
3 changes: 1 addition & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from marker.v2.builders.document import DocumentBuilder
from marker.v2.builders.layout import LayoutBuilder
from marker.v2.providers.pdf import PdfProvider
from marker.v2.schema.config.pdf import PdfProviderConfig
from marker.v2.schema.document import Document


Expand All @@ -34,7 +33,7 @@ def pdf_document(request, layout_model) -> Document:
temp_pdf.write(dataset['pdf'][idx])
temp_pdf.flush()

provider = PdfProvider(temp_pdf.name, PdfProviderConfig())
provider = PdfProvider(temp_pdf.name)
layout_builder = LayoutBuilder(layout_model)
builder = DocumentBuilder()
document = builder(provider, layout_builder)
Expand Down
3 changes: 1 addition & 2 deletions tests/test_document_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from marker.v2.builders.document import DocumentBuilder
from marker.v2.builders.layout import LayoutBuilder
from marker.v2.schema.config.pdf import PdfProviderConfig


def test_document_builder(layout_model):
Expand All @@ -17,7 +16,7 @@ def test_document_builder(layout_model):
temp_pdf.write(dataset['pdf'][idx])
temp_pdf.flush()

provider = PdfProvider(temp_pdf.name, PdfProviderConfig())
provider = PdfProvider(temp_pdf.name)
layout_builer = LayoutBuilder(layout_model)
builder = DocumentBuilder()
document = builder(provider, layout_builer)
Expand Down
3 changes: 1 addition & 2 deletions tests/test_pdf_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import datasets

from marker.v2.providers.pdf import PdfProvider
from marker.v2.schema.config.pdf import PdfProviderConfig


def test_pdf_provider():
Expand All @@ -14,7 +13,7 @@ def test_pdf_provider():
temp_pdf.write(dataset['pdf'][idx])
temp_pdf.flush()

provider = PdfProvider(temp_pdf.name, PdfProviderConfig())
provider = PdfProvider(temp_pdf.name)
assert len(provider) == 12
assert provider.get_image(0, 72).size == (612, 792)
assert provider.get_image(0, 96).size == (816, 1056)
Expand Down
3 changes: 1 addition & 2 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from marker.v2.builders.document import DocumentBuilder
from marker.v2.builders.layout import LayoutBuilder
from marker.v2.providers.pdf import PdfProvider
from marker.v2.schema.config.pdf import PdfProviderConfig
from marker.v2.schema.document import Document


Expand All @@ -21,7 +20,7 @@ def setup_pdf_document(filename: str) -> Document:
layout_model = load_model()
layout_model.processor = load_processor()

provider = PdfProvider(temp_pdf.name, PdfProviderConfig())
provider = PdfProvider(temp_pdf.name)
layout_builder = LayoutBuilder(layout_model)
builder = DocumentBuilder()
document = builder(provider, layout_builder)
Expand Down

0 comments on commit d27b874

Please sign in to comment.