forked from vnmssnhv/NeuTralRewriter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpostprocess.py
36 lines (31 loc) · 1.16 KB
/
postprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import codecs
import argparse
import stanza
def postprocess(inFile,outFile):
    """Run the Stanza pipeline over ``inFile`` and write the tokens to ``outFile``.

    Every input line is processed on its own and produces exactly one output
    line (the word texts of all its sentences joined by single spaces), so
    the output stays line-aligned with the input. Blank input lines are
    passed through as empty output lines without invoking the pipeline.

    Args:
        inFile: Path of the text file to read, one segment per line.
        outFile: Path of the text file to write.
    """
    # NOTE(review): the processor list is kept exactly as the original had it;
    # confirm that 'truecase' is a processor the installed Stanza provides.
    nlp = stanza.Pipeline(lang='en', processors='truecase')

    processed_lines = []
    for raw_line in read_file(inFile):
        text = raw_line.strip()
        if not text:
            # Keep the line count identical to the input.
            processed_lines.append('')
            continue
        doc = nlp(text)
        tokens = []
        for sent in doc.sentences:
            tokens.extend(word.text for word in sent.words)
        # Collect the joined string (not the token list) so that
        # write_output can '\n'.join the result.
        processed_lines.append(' '.join(tokens))

    write_output(outFile, processed_lines)
def read_file(input):
    """Return the lines of the text file at ``input`` as a list of strings.

    The file is decoded as UTF-8 regardless of the platform's locale, and
    each returned line keeps its trailing newline (as ``readlines`` does).

    Args:
        input: Path of the file to read. (The name shadows the ``input``
            builtin; it is kept so existing keyword callers keep working.)

    Returns:
        A list with one string per line; empty for an empty file.
    """
    # Built-in open() with an explicit encoding replaces codecs.open(), which
    # silently used the locale's default encoding when none was given.
    with open(input, 'r', encoding='utf-8') as inF:
        return inF.readlines()
def write_output(outputfile,output):
    """Write the strings in ``output`` to ``outputfile``, one per line.

    The items are joined with ``'\\n'``; no newline is added after the last
    item. The file is always encoded as UTF-8, independent of the locale.

    Args:
        outputfile: Path of the file to create or overwrite.
        output: Iterable of strings to write.
    """
    # Built-in open() with an explicit encoding replaces codecs.open(), which
    # silently used the locale's default encoding when none was given.
    with open(outputfile, 'w', encoding='utf-8') as outF:
        outF.write('\n'.join(output))
if __name__ == '__main__':
    # USAGE: python postprocess.py -i INPUT_FILE -o OUTPUT_FILE
    cli = argparse.ArgumentParser(description='tokenize sentences using Stanza')
    # Both the input and the output path are mandatory options.
    for short_flag, long_flag in (("-i", "--input_file"), ("-o", "--output_file")):
        cli.add_argument(short_flag, long_flag, required=True)
    options = cli.parse_args()
    postprocess(options.input_file, options.output_file)