Skip to content

Commit

Permalink
Merge pull request #15 from explosion/feature/doc-markdown
Browse files Browse the repository at this point in the history
Add Doc._.markdown
  • Loading branch information
ines authored Dec 9, 2024
2 parents 9647ce6 + 0d06079 commit 64c6f4a
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 2 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ print(doc.text)
print(doc._.layout)
# Tables in the document and their extracted data
print(doc._.tables)
# Markdown representation of the document
print(doc._.markdown)

# Layout spans for different sections
for span in doc.spans["layout"]:
Expand Down Expand Up @@ -114,6 +116,7 @@ for span in doc.spans["layout"]:
| `Doc._.layout` | `DocLayout` | Layout features of the document. |
| `Doc._.pages` | `list[tuple[PageLayout, list[Span]]]` | Pages in the document and the spans they contain. |
| `Doc._.tables` | `list[Span]` | All tables in the document. |
| `Doc._.markdown` | `str` | Markdown representation of the document. |
| `Doc.spans["layout"]` | `spacy.tokens.SpanGroup` | The layout spans in the document. |
| `Span.label_` | `str` | The type of the extracted layout span, e.g. `"text"` or `"section_header"`. [See here](https://github.com/DS4SD/docling-core/blob/14cad33ae7f8dc011a79dd364361d2647c635466/docling_core/types/doc/labels.py) for options. |
| `Span.label` | `int` | The integer ID of the span label. |
Expand Down Expand Up @@ -161,7 +164,7 @@ layout = spaCyLayout(nlp)
| --- | --- | --- |
| `nlp` | `spacy.language.Language` | The initialized `nlp` object to use for tokenization. |
| `separator` | `str \| None` | Token used to separate sections in the created `Doc` object. The separator won't be part of the layout span. If `None`, no separator will be added. Defaults to `"\n\n"`. |
| `attrs` | `dict[str, str]` | Override the custom spaCy attributes. Can include `"doc_layout"`, `"doc_pages"`, `"doc_tables"`, `"span_layout"`, `"span_data"`, `"span_heading"` and `"span_group"`. |
| `attrs` | `dict[str, str]` | Override the custom spaCy attributes. Can include `"doc_layout"`, `"doc_pages"`, `"doc_tables"`, `"doc_markdown"`, `"span_layout"`, `"span_data"`, `"span_heading"` and `"span_group"`. |
| `headings` | `list[str]` | Labels of headings to consider for `Span._.heading` detection. Defaults to `["section_header", "page_header", "title"]`. |
| `display_table` | `Callable[[pandas.DataFrame], str] \| str` | Function that generates the text-based representation of a table in `Doc.text`, or a fixed placeholder string to use instead. Defaults to `"TABLE"`. |
| `docling_options` | `dict[InputFormat, FormatOption]` | [Format options](https://ds4sd.github.io/docling/usage/#advanced-options) passed to Docling's `DocumentConverter`. |
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[metadata]
version = 0.0.8
version = 0.0.9
description = Use spaCy with PDFs, Word docs and other documents
url = https://github.com/explosion/spacy-layout
author = Explosion
Expand Down
3 changes: 3 additions & 0 deletions spacy_layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
doc_layout=attrs.get("doc_layout", "layout"),
doc_pages=attrs.get("doc_pages", "pages"),
doc_tables=attrs.get("doc_tables", "tables"),
doc_markdown=attrs.get("doc_markdown", "markdown"),
span_layout=attrs.get("span_layout", "layout"),
span_heading=attrs.get("span_heading", "heading"),
span_data=attrs.get("span_data", "data"),
Expand All @@ -60,6 +61,7 @@ def __init__(
Doc.set_extension(self.attrs.doc_layout, default=None, force=True)
Doc.set_extension(self.attrs.doc_pages, getter=self.get_pages, force=True)
Doc.set_extension(self.attrs.doc_tables, getter=self.get_tables, force=True)
Doc.set_extension(self.attrs.doc_markdown, default=None, force=True)
Span.set_extension(self.attrs.span_layout, default=None, force=True)
Span.set_extension(self.attrs.span_data, default=None, force=True)
Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True)
Expand Down Expand Up @@ -109,6 +111,7 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc:
inputs.append((table_text, item))
doc = self._texts_to_doc(inputs, pages)
doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()]))
doc._.set(self.attrs.doc_markdown, result.document.export_to_markdown())
return doc

def _texts_to_doc(
Expand Down
1 change: 1 addition & 0 deletions spacy_layout/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class Attrs:
doc_layout: str
doc_pages: str
doc_tables: str
doc_markdown: str
span_layout: str
span_data: str
span_heading: str
Expand Down
9 changes: 9 additions & 0 deletions tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ def test_table(nlp):
"Chernihiv, Ukraine",
],
}
markdown = (
"| Name | Type | Place of birth |\n"
"|------------------|--------|--------------------|\n"
"| Ines | human | Cologne, Germany |\n"
"| Matt | human | Sydney, Australia |\n"
"| Baikal | cat | Berlin, Germany |\n"
"| Stanislav Petrov | cat | Chernihiv, Ukraine |\n"
)
assert markdown in doc._.get(layout.attrs.doc_markdown)


def test_table_placeholder(nlp):
Expand Down

0 comments on commit 64c6f4a

Please sign in to comment.