-
Notifications
You must be signed in to change notification settings - Fork 0
/
fixBrokenFiles.py
56 lines (53 loc) · 1.88 KB
/
fixBrokenFiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import argparse
import xml.etree.ElementTree as ET
from copy import deepcopy
def _merge_sentences(root):
    """Collapse all ``SENTENCE`` children of *root* into a single one.

    The merged sentence starts as a copy of the first ``SENTENCE`` (keeping
    its attributes), holds one ``TEXT`` child with the concatenated text of
    every ``TEXT`` element, followed by all non-``TEXT`` children of all
    sentences sorted by ``(tag, id)``.

    Args:
        root: The document root element; it is modified in place.

    Returns:
        True if *root* was rewritten, False if it had at most one
        ``SENTENCE`` and was left untouched.
    """
    sentences = root.findall('SENTENCE')
    if len(sentences) <= 1:
        return False

    # Work on a copy of the first sentence so its attributes survive, then
    # strip it down to just its TEXT child.
    merged = deepcopy(sentences[0])
    text_elem = merged.find('TEXT')
    for child in list(merged):
        merged.remove(child)
    if text_elem is None:
        # The first sentence carried no TEXT; create one to hold the result.
        text_elem = ET.Element('TEXT')
    merged.append(text_elem)

    text = ""
    end = None
    others = []
    for sentence in sentences:
        for child in sentence:
            if child.tag != 'TEXT':
                others.append(child)
                continue
            piece = child.text or ""  # an empty <TEXT/> has text None
            # When the next piece starts with the character the text so far
            # ends with, that character is only kept once.
            if text and piece and text[-1] == piece[0]:
                piece = piece[1:]
            text += piece
            # The merged sentence ends where the last sentence with a TEXT
            # ends; sentences without an 'end' attribute are ignored here.
            sentence_end = sentence.get('end')
            if sentence_end is not None:
                end = sentence_end

    if end is not None:
        merged.set('end', end)
    text_elem.text = text

    # A missing 'id' sorts as the empty string so that elements with and
    # without an id can be ordered against each other.
    others.sort(key=lambda child: (child.tag, child.get('id') or ''))
    merged.extend(others)

    # NOTE: clear() drops every child of the root as well as the root's own
    # attributes and text; only the merged sentence remains afterwards.
    root.clear()
    root.append(merged)
    return True


def run(fileDir):
    """Merge split sentences in every ``.xml`` file directly inside *fileDir*.

    Files whose root has more than one ``SENTENCE`` child are rewritten with
    a single merged sentence and saved under the same name in
    ``<fileDir>/out``, which is created if necessary. Files with at most one
    ``SENTENCE`` are skipped and not copied.

    Args:
        fileDir: Path of the directory containing the input files.
    """
    out_dir = os.path.join(fileDir, "out")
    for name in os.listdir(fileDir):
        if not name.endswith('.xml'):
            continue
        source_path = os.path.join(fileDir, name)
        if not os.path.isfile(source_path):
            continue
        tree = ET.parse(source_path)
        if not _merge_sentences(tree.getroot()):
            continue
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        tree.write(os.path.join(out_dir, name))
if __name__ == '__main__':
    # Command-line entry point: the only option is the directory to process.
    parser = argparse.ArgumentParser(
        description='Merge the SENTENCE elements of every XML file in a '
                    'directory into a single SENTENCE and write the result '
                    'to its "out" subdirectory.')
    parser.add_argument(
        "--path",
        dest="path",
        required=True,
        help='Path to the input files')
    args = parser.parse_args()
    run(args.path)