-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenize_duc.py
58 lines (48 loc) · 1.88 KB
/
tokenize_duc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from os.path import join
import json
from nltk.parse import CoreNLPParser
import os
import random
import argparse
from collections import Counter, defaultdict
import pickle as pkl
import re
from bs4 import BeautifulSoup
import nltk
corenlp_parser = CoreNLPParser(url='http://localhost:9000')
def _count_data(path):
""" count number of data in the given path"""
matcher = re.compile(r'[0-9]+\.json')
match = lambda name: bool(matcher.match(name))
names = os.listdir(path)
n_data = len(list(filter(match, names)))
return n_data
def main(data_dir, out_dir):
    """Tokenize the DUC test split with a local CoreNLP server.

    Reads ``<data_dir>/test/<i>.json`` for every numbered file present,
    tokenizes each sentence of the article and of every reference abstract
    (joining tokens with single spaces), and writes the updated records to
    ``<out_dir>/test/<i>.json``.

    Args:
        data_dir: Directory containing a ``test`` subdirectory of
            numbered JSON files with 'article' (list of sentences) and
            'abstract' (list of summaries, each a list of sentences) keys.
        out_dir: Output root; ``<out_dir>/test`` is created and must not
            already exist (``os.makedirs`` raises FileExistsError).

    Raises:
        OSError: if the output directory already exists or cannot be created.

    Note: requires a CoreNLP server reachable at the module-level
    ``corenlp_parser`` URL (http://localhost:9000).
    """
    split_dir = join(data_dir, 'test')
    out_test_dir = join(out_dir, 'test')
    # makedirs creates intermediate directories, so one call covers
    # both out_dir and out_dir/test (the original made two calls).
    os.makedirs(out_test_dir)
    n_data = _count_data(split_dir)
    for i in range(n_data):
        # Use context managers so file handles are closed promptly
        # (the original leaked the read handle via json.load(open(...))).
        with open(join(split_dir, '{}.json'.format(i)), encoding='utf-8') as f:
            js = json.load(f)
        document = js['article']
        tokenized_document = [' '.join(corenlp_parser.tokenize(doc_sent.strip()))
                              for doc_sent in document]
        summary_list = js['abstract']
        tokenized_summary_list = []
        for summary in summary_list:
            tokenized_summary = [' '.join(corenlp_parser.tokenize(summary_sent.strip()))
                                 for summary_sent in summary]
            tokenized_summary_list.append(tokenized_summary)
        js['article'] = tokenized_document
        js['abstract'] = tokenized_summary_list
        with open(join(out_test_dir, '{}.json'.format(i)), 'w', encoding='utf-8') as f:
            json.dump(js, f, indent=4)
if __name__ == "__main__":
    # CLI entry point: parse the data/output directories and run the
    # tokenization pipeline against the local CoreNLP server.
    arg_parser = argparse.ArgumentParser(description='Preprocess duc data')
    arg_parser.add_argument('--data_dir', type=str, action='store',
                            help='The directory of the data.')
    arg_parser.add_argument('--out_dir', type=str, action='store',
                            help='The output directory of the data.')
    cli_args = arg_parser.parse_args()
    main(cli_args.data_dir, cli_args.out_dir)