-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess.py
110 lines (103 loc) · 3.12 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
def parse():
    """Parse the 'HOOF STAMBOOM' section of the family-tree Word document.

    Opens ``data/VAN_DER_LINDE_20161031.docx`` with python-docx and walks its
    paragraphs in order.  A paragraph containing one of the section markers
    ('HOOF STAMBOOM', 'ANDER IMMIGRANTE', 'ONGEKOPPELDES') replaces the
    current section heading with that paragraph's full text; paragraphs are
    only processed while the current heading is exactly ``'HOOF STAMBOOM'``.

    Within that section every non-empty line is handled as follows:

    * a line of at most three characters consisting of digits sets the
      current page number;
    * any other line that starts with a digit is tested as a footnote
      (1-3 digits ``1``-``9``, optional space, then text starting with a
      letter); a match is printed and recorded;
    * every remaining line is appended to the text-line list and, when it
      starts with an index such as ``b1``, ``[b1]`` or ``(b1)`` followed by a
      space, is handed to ``split_parts`` (once per index when the line
      holds several indexes).

    Returns:
        tuple: ``(footnotes, tl)``.  ``footnotes`` is a list of dicts with
        the keys ``'page'``, ``'ref'`` and ``'text'``; ``tl`` is the list of
        cleaned lines that did not start with a digit.
    """
    import re
    import docx

    section_markers = ('HOOF STAMBOOM', 'ANDER IMMIGRANTE', 'ONGEKOPPELDES')

    # Tabs and typographic quotes are normalised in one pass before matching.
    normalise = str.maketrans({'\t': ' ', '‘': "'", '“': '"', '’': "'"})

    # Patterns are compiled once instead of being looked up per paragraph.
    starts_with_digit = re.compile(r'^[0-9]')
    page_number = re.compile(r'([0-9]+)\s*')
    footnote = re.compile(r'^([1-9]{1,3})[ ]?([A-Za-z].*)')
    # Index at the start of the line, which must be followed by a space.
    leading_index = re.compile(r'^([\[\(]?[A-Za-z][0-9]+[\]\)]?)( .*)')
    embedded_index = re.compile(r' [A-Za-z][0-9]{1,2} ')
    # NOTE(review): this pattern has no word boundary and allows only two
    # digits, unlike leading_index -- it can split inside ordinary words and
    # inside three-digit indexes.  Kept as in the original; confirm intent.
    index_splitter = re.compile(r'([\[\(]?[A-Za-z][0-9]{1,2}[\]\)]?)')

    afd = None  # full text of the most recent section-heading paragraph
    footnotes = []
    tl = []
    pnum = 1

    doc = docx.Document('data/VAN_DER_LINDE_20161031.docx')
    for par in doc.paragraphs:
        txt = par.text
        if any(marker in txt for marker in section_markers):
            afd = txt

        # Exact comparison: a heading with any extra text does not activate
        # the section.
        if afd != 'HOOF STAMBOOM':
            continue

        txt = txt.translate(normalise)
        if not txt:
            continue

        if starts_with_digit.match(txt):
            # Only a purely numeric short line is a page number; previously
            # int() raised ValueError on short lines such as "1a".
            page_match = page_number.fullmatch(txt) if len(txt) <= 3 else None
            if page_match:
                pnum = int(page_match.group(1))
                continue

            footnote_match = footnote.match(txt)
            if footnote_match:
                footnoteVal, footnoteText = footnote_match.groups()
                print(f'{pnum} {footnoteVal}, {footnoteText}')
                footnotes.append(
                    {'page': pnum, 'ref': footnoteVal, 'text': footnoteText}
                )
            continue

        tl.append(txt)
        index_match = leading_index.match(txt)
        if index_match is None:
            continue  # plain text line without a leading index

        k = index_match.group(1)
        v = index_match.group(2).strip()
        if embedded_index.search(v):
            # Several indexes on one line: splitting on a capturing group
            # yields [prefix, index, text, index, text, ...]; drop the prefix
            # and walk the (index, text) pairs.
            pieces = index_splitter.split(txt.strip())[1:]
            for idx, body in zip(pieces[0::2], pieces[1::2]):
                split_parts(pnum, idx, body.strip())
        else:
            split_parts(pnum, k, v)

    return footnotes, tl
def split_parts(pnum, index, datstr):
    """Classify one indexed entry (work in progress; returns nothing).

    The classifications below are computed but not yet stored or returned,
    so the function currently always returns ``None``.

    Args:
        pnum: Page number supplied by the caller; not used yet.
        index: Index token, optionally wrapped in ``[...]`` or ``(...)``.
        datstr: Text that follows the index.
    """
    import re  # only referenced by the disabled re.split experiment
    global c   # only referenced by the disabled debug counters

    # The wrapping bracket decides the index type: '[' -> 'other',
    # '(' -> 'adopted', no bracket -> 'vdl'.
    idx_type = 'other' if '[' in index else ('adopted' if '(' in index else 'vdl')

    # Then strip the classifier tokens off the index, one token at a time.
    for token in ('[', ']', '(', ')'):
        index = index.strip(token)

    # Explicit gender markers are looked for near the start of the text.
    gender = (
        'female' if '(v)' in datstr[:40].lower()
        else 'male' if '(m)' in datstr[:35].lower()
        else None
    )

    # (x) - not married
    not_married_marker = '(x)' in datstr[:100].lower()
    if not_married_marker:
        married = False

    # '*' marks a birth; nothing is extracted from it yet.
    has_birth_marker = '*' in datstr.lower()
if __name__ == '__main__':
    # Script entry point: reset the module-level counter, print a separator
    # line, then run the parser and keep its result in ``d``.
    c = 0
    print('=' * 200)
    d = parse()