-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc2json.py
181 lines (163 loc) · 5.82 KB
/
doc2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from collections import defaultdict
import json
import zipfile
from lxml import etree
# Font names treated as defaults. etree_to_dict omits the font attributes
# (ascii / hAnsi / cs / eastAsia) whose value is one of these names.
common_fonts = {
    'Arial',
    'Calibri',
    'Times New Roman',
    # Add any other common fonts here
}
# Local (namespace-stripped) element names that are dropped entirely, both
# when pruning the XML tree and when converting it to a dictionary.
ignored_elements = {
    'bookmarkEnd',
    'bookmarkStart',
    'ind',
    'jc',
    'lastRenderedPageBreak',
    'numPr',
    'pBdr',
    'pgMar',
    'proofErr',
    'sectPr',
    'spacing',
    'tabs',
    'webHidden',
    # Add any other elements to ignore here
}
# Local (namespace-stripped) attribute names that are dropped from every
# element. Each name is listed exactly once; the code that consumes this set
# additionally drops any attribute whose name starts with 'rsid'.
ignored_attributes = {
    'paraId',
    'rsidDel',
    'rsidP',
    'rsidR',
    'rsidRDefault',
    'rsidRPr',
    'rsidTr',
    'textId',
    # Add any other attributes to ignore here
}
# Local (namespace-stripped) property names that extract_metadata leaves out
# of the metadata dictionary.
# NOTE(review): extract_metadata only reads docProps/core.xml and compares
# tag names case-sensitively; confirm that these names can actually occur
# there, otherwise this filter never matches anything.
ignored_metadata_elements = {
    'application',
    'characters',
    'charactersWithSpaces',
    'company',
    'docSecurity',
    'hiddenSlides',
    'lines',
    'linksUpToDate',
    'mmClips',
    'notes',
    'pages',
    'paragraphs',
    'scaleCrop',
    'template',
    'words',
    # Add any other metadata elements to ignore here
}
def remove_ignored_elements(tree):
    """Prune ignored elements and attributes from an lxml tree, in place.

    NOTE: this module defines ``remove_ignored_elements`` a second time
    further down; that later definition replaces this one when the module is
    imported, so this version is effectively unused.

    For every descendant element of ``tree``:

    * elements whose local name is in ``ignored_elements`` are removed;
    * ``w:rPr`` elements are removed unless they contain a ``highlight``
      child;
    * every other element keeps its place but loses the attributes listed in
      ``ignored_attributes`` and any attribute whose local name starts with
      ``rsid``.

    Args:
        tree: An lxml element; it is modified in place.

    Returns:
        The same ``tree`` object, for call chaining.
    """
    run_properties_tag = (
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'
    )

    # xpath() returns a list, so detaching elements while looping is safe.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]

        if local_name in ignored_elements:
            # lxml's remove() also discards the element's tail text.
            elem.getparent().remove(elem)
        elif elem.tag == run_properties_tag:
            # Keep run properties only when they carry a highlight. Comment
            # and processing-instruction children have a non-string tag.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Copy the attribute names first: the mapping is mutated below.
            for attr in list(elem.attrib):
                attr_name = attr.split('}')[-1]
                if attr_name in ignored_attributes or attr_name.startswith('rsid'):
                    del elem.attrib[attr]
    return tree
def etree_to_dict(t):
    """Convert an element and its subtree to a nested ``{tag: value}`` dict.

    Namespace URIs are stripped from tag and attribute names. Child elements
    are grouped by tag: a tag that occurs once maps to its value, a tag that
    occurs several times maps to a list of values. Attributes are merged into
    the same dictionary as the children, and non-empty text of an element
    that also has children or attributes is stored under ``'#text'``. An
    element with neither children nor attributes maps directly to its
    stripped text.

    Font attributes (``ascii``, ``hAnsi``, ``cs``, ``eastAsia``) are omitted
    when their value is in ``common_fonts``; attributes in
    ``ignored_attributes`` or starting with ``rsid`` are always omitted.

    Args:
        t: An element exposing ``tag``, ``attrib``, ``text`` and iteration
            over its children (the lxml / ElementTree element interface).

    Returns:
        A single-key dictionary ``{tag: value}``, or ``None`` when the node
        is a comment or processing instruction, its tag is in
        ``ignored_elements``, or it has no attributes, children or text.
    """
    if not isinstance(t.tag, str):
        # Comments and processing instructions expose a callable as ``tag``;
        # skip them instead of failing on ``tag.split``.
        return None

    tag = t.tag.split('}')[-1]  # Remove namespace URI
    if tag in ignored_elements:
        return None

    d = {tag: {} if t.attrib else None}
    children = list(t)

    if children:
        grouped = defaultdict(list)
        # filter(None, ...) discards children that converted to None.
        for child_dict in filter(None, map(etree_to_dict, children)):
            for k, v in child_dict.items():
                grouped[k].append(v)
        d = {tag: {k: v[0] if len(v) == 1 else v for k, v in grouped.items()}}

    if t.attrib:
        # Filter out common fonts and ignored attributes
        filtered_attribs = {}
        for k, v in t.attrib.items():
            k = k.split('}')[-1]  # Remove namespace URI
            if k in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
                if v not in common_fonts:
                    filtered_attribs[k] = v
            elif k not in ignored_attributes and not k.startswith('rsid'):
                filtered_attribs[k] = v
        d[tag].update(filtered_attribs)

    if t.text:
        text = t.text.strip()
        # Drop characters that cannot be represented in UTF-8 (lone
        # surrogates). The error handler has to sit on the encode step:
        # decoding freshly encoded bytes can never fail.
        text = text.encode('utf-8', 'ignore').decode('utf-8')
        if children or t.attrib:
            if text:
                d[tag]['#text'] = text
        else:
            d[tag] = text

    if not t.attrib and not children and not t.text:
        return None
    return d
# This definition replaces the earlier remove_ignored_elements in this module
# and additionally normalizes the text content of the remaining elements.
def remove_ignored_elements(tree):
    """Prune ignored elements/attributes and normalize text, in place.

    First pass, over every descendant element of ``tree``:

    * elements whose local name is in ``ignored_elements`` are removed;
    * ``w:rPr`` elements are removed unless they contain a ``highlight``
      child;
    * every other element loses the attributes listed in
      ``ignored_attributes`` and any attribute whose local name starts with
      ``rsid``.

    Second pass: every text node still in the tree is stripped of
    surrounding whitespace and written back to the slot it came from, i.e.
    ``.text`` for element text and ``.tail`` for tail text.

    Args:
        tree: An lxml element; it is modified in place.

    Returns:
        The same ``tree`` object, for call chaining.
    """
    run_properties_tag = (
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'
    )

    # xpath() returns a list, so detaching elements while looping is safe.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]

        if local_name in ignored_elements:
            # lxml's remove() also discards the element's tail text.
            elem.getparent().remove(elem)
        elif elem.tag == run_properties_tag:
            # Keep run properties only when they carry a highlight. Comment
            # and processing-instruction children have a non-string tag.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Copy the attribute names first: the mapping is mutated below.
            for attr in list(elem.attrib):
                attr_name = attr.split('}')[-1]
                if attr_name in ignored_attributes or attr_name.startswith('rsid'):
                    del elem.attrib[attr]

    # ".//text()" yields element text AND tail text. For a tail string,
    # getparent() is the element the tail follows, so writing it to ``.text``
    # would overwrite that element's real text (for example with the empty
    # string left after stripping indentation whitespace).
    for text_node in tree.xpath(".//text()"):
        owner = text_node.getparent()
        if owner is None:
            continue
        # Drop characters that cannot be represented in UTF-8; the error
        # handler belongs on the encode step, decoding cannot fail here.
        cleaned = text_node.strip().encode('utf-8', 'ignore').decode('utf-8')
        if text_node.is_text:
            owner.text = cleaned
        elif text_node.is_tail:
            owner.tail = cleaned
    return tree
def extract_metadata(docx):
    """Extract core document properties from an opened .docx archive.

    Reads ``docProps/core.xml`` and maps the namespace-stripped tag of each
    direct child of its root element to that child's text, skipping tags
    listed in ``ignored_metadata_elements``.

    Args:
        docx: An open ``zipfile.ZipFile`` for the .docx package.

    Returns:
        A dict of ``{property name: text or None}``. The dict is empty when
        the archive has no ``docProps/core.xml`` member.
    """
    try:
        with docx.open('docProps/core.xml') as core_xml:
            xml_content = core_xml.read()
    except KeyError:
        # ZipFile.open raises KeyError for a missing member; a package
        # without core properties simply has no metadata to report.
        return {}

    core_tree = etree.XML(xml_content)
    metadata = {}
    for child in core_tree:
        if not isinstance(child.tag, str):
            # Comments / processing instructions have a non-string tag.
            continue
        tag = child.tag.split('}')[-1]  # Get tag without namespace
        if tag not in ignored_metadata_elements:
            metadata[tag] = child.text
    return metadata
def process_docx(file_path):
    """Convert a .docx file to a JSON string.

    Reads the core metadata and ``word/document.xml`` from the archive,
    prunes the document tree with ``remove_ignored_elements``, converts it
    with ``etree_to_dict`` and adds the metadata under the ``'metadata'``
    key.

    Args:
        file_path: Path (or file-like object) accepted by
            ``zipfile.ZipFile``.

    Returns:
        The document as a JSON string, indented by two spaces and with
        non-ASCII characters left unescaped.
    """
    # Only the raw bytes are needed from the archive; read them and close it
    # before doing the parsing and conversion work.
    with zipfile.ZipFile(file_path) as docx:
        metadata = extract_metadata(docx)
        with docx.open('word/document.xml') as document_xml:
            xml_content = document_xml.read()

    # Remove the ignored elements
    document_tree = remove_ignored_elements(etree.XML(xml_content))

    # etree_to_dict returns None for an empty or ignored root element; fall
    # back to an empty dict so the metadata can still be attached.
    document_dict = etree_to_dict(document_tree) or {}
    document_dict['metadata'] = metadata  # Add metadata to the document dictionary

    return json.dumps(document_dict, ensure_ascii=False, indent=2)