-
Notifications
You must be signed in to change notification settings - Fork 0
/
docx_dto.py
76 lines (61 loc) · 2.01 KB
/
docx_dto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from typing import List
class Metadata:
    """Plain container for the descriptive fields of a document.

    The ``doc_id`` constructor argument is stored on the instance as
    ``id``; every other argument is stored under its own name.
    """

    def __init__(
        self,
        doc_id: str,
        title: str,
        date: str,
        location: str,
        type: str = '',
        category: str = '',
        img: str = ''
    ):
        # Required fields.
        self.id, self.title = doc_id, title
        self.date, self.location = date, location
        # Optional fields; each defaults to the empty string.
        self.type, self.category, self.img = type, category, img
class TextSpan:
    """A piece of text paired with a style string."""

    def __init__(self, text: str, text_style: str = ''):
        # ``text_style`` defaults to the empty string when not supplied.
        self.text, self.text_style = text, text_style
class Paragraph:
    """An ordered list of text spans."""

    def __init__(self, text_spans: List[TextSpan] = None):
        # Without an argument every instance gets its own new list; a list
        # passed in by the caller is stored as-is (not copied).
        self.text_spans = [] if text_spans is None else text_spans

    def append_span(self, span: TextSpan):
        """Add ``span`` to the end of this paragraph."""
        self.text_spans.append(span)

    def to_text(self):
        """Return the ``text`` of all spans concatenated in order."""
        pieces = [piece.text for piece in self.text_spans]
        return ''.join(pieces)
class DocxDto:
    """A document made of optional metadata and a list of content paragraphs."""

    # Number of leading paragraphs that form the metadata header: intro
    # line, title, "location, date", code, type, category and image.
    _HEADER_SIZE = 7

    def __init__(self, metadata: Metadata = None, content: List[Paragraph] = None):
        # Without an argument every instance gets its own new list; a list
        # passed in by the caller is stored as-is (not copied).
        if content is None:
            content = []
        self.metadata = metadata
        self.content = content

    def append_paragraph(self, paragraph: Paragraph) -> None:
        """Add ``paragraph`` to the end of the content."""
        self.content.append(paragraph)

    def extract_metadata_from_content(self) -> None:
        """Turn the leading header paragraphs into ``self.metadata``.

        The first seven paragraphs of ``self.content`` are read as:

        0. an intro line that is ignored,
        1. the title,
        2. ``"<location>, <date>"`` (split on ``', '``),
        3. the document id, with the ``Code:`` label removed,
        4. the type, with the ``Typ:`` label removed,
        5. the category, with the ``Kategorie:`` label removed,
        6. markup containing ``src="<image url>"``.

        On success those seven paragraphs are removed from ``self.content``
        and ``self.metadata`` is replaced.  On failure neither attribute is
        modified.

        Raises:
            IndexError: if there are fewer than seven paragraphs, if the
                location/date paragraph has no ``', '`` separator, or if
                the image paragraph has no ``src="`` attribute.
        """
        header = self.content[:self._HEADER_SIZE]
        if len(header) < self._HEADER_SIZE:
            raise IndexError(
                'expected at least {} header paragraphs, found {}'.format(
                    self._HEADER_SIZE, len(header)
                )
            )

        # first item is irrelevant (`Originalskript des Vortrags`)
        title, loc_dat, code, kind, category, image = (
            paragraph.to_text() for paragraph in header[1:]
        )

        # NOTE: only the first two comma-separated parts are used; any
        # further parts are ignored.
        place_and_date = loc_dat.split(', ')
        if len(place_and_date) < 2:
            raise IndexError(
                "expected '<location>, <date>' but got {!r}".format(loc_dat)
            )

        # Take everything between `src="` and the closing quote, regardless
        # of which attribute (if any) follows it.
        _, marker, after_src = image.partition('src="')
        if not marker:
            raise IndexError(
                'no src="..." attribute found in {!r}'.format(image)
            )
        img = after_src.partition('"')[0]

        metadata = Metadata(
            doc_id=code.replace('Code:', '').strip(),
            title=title,
            location=place_and_date[0],
            date=place_and_date[1],
            type=kind.replace('Typ:', '').strip(),
            category=category.replace('Kategorie:', '').strip(),
            img=img,
        )

        # Consume the header only after everything parsed, so a malformed
        # document is left exactly as it was.
        del self.content[:self._HEADER_SIZE]
        self.metadata = metadata