-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathformat_instances.py
40 lines (37 loc) · 1.41 KB
/
format_instances.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env python3
import argparse
import datahandler
"""
Script to format classification files by filtering instances and adding labels
"""
def build_parser():
    """Create the command-line parser for the formatting script.

    Returns:
        An ``argparse.ArgumentParser`` that accepts the input files (-i),
        optional per-file labels (-l), output files (-o), optional
        blacklist words (-b) and the --punctuation / --us / --ur switches.
    """
    parser = argparse.ArgumentParser(
        description="Script to format classification files by filtering "
                    "instances and adding labels")
    parser.add_argument('-i', action='store', required=True, nargs='+',
                        help="The input files")
    parser.add_argument('-l', action='store', required=False, nargs='+',
                        help="The label for each file")
    parser.add_argument('-o', action='store', required=True, nargs='+',
                        help="The output files")
    parser.add_argument('-b', action='store', required=False, nargs='+',
                        help="Remove instances if they contain one of the "
                             "given words")
    parser.add_argument('--punctuation', action='store_true',
                        help="Choose to filter punctuation")
    parser.add_argument('--us', action='store_true',
                        help="Choose to normalize usernames")
    parser.add_argument('--ur', action='store_true',
                        help="Choose to normalize urls")
    return parser


def pair_jobs(inputs, labels, outputs):
    """Match every input file with its label and its output file.

    Args:
        inputs: list of input file paths.
        labels: list with one label per input file, or None when the
            optional -l argument was not given.
        outputs: list of output file paths.

    Returns:
        A list of ``(infile, label, outfile)`` tuples; ``label`` is None
        for every entry when ``labels`` is None.

    Raises:
        ValueError: if the lists do not all have the same length.
    """
    if labels is None:
        # -l is optional; a bare zip() over None would raise TypeError.
        labels = [None] * len(inputs)
    if not len(inputs) == len(labels) == len(outputs):
        # zip() would silently drop the surplus files, so refuse instead.
        raise ValueError(
            "expected the same number of input files (-i), labels (-l) and "
            "output files (-o); got {0}, {1} and {2}".format(
                len(inputs), len(labels), len(outputs)))
    return list(zip(inputs, labels, outputs))


def main(argv=None):
    """Format each input file and write the result to its output file.

    Args:
        argv: argument list to parse; None makes argparse read the
            process command line.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    try:
        jobs = pair_jobs(args.i, args.l, args.o)
    except ValueError as err:
        # Report through argparse so the user gets the usage text and a
        # non-zero exit status instead of a traceback.
        parser.error(str(err))

    for infile, label, outfile in jobs:
        dh = datahandler.Datahandler()
        dh.set(infile)
        if label is not None:
            # NOTE(review): labelling is skipped when -l is omitted; confirm
            # that Datahandler.write_csv copes with an unlabelled file.
            dh.set_label(label)
        if args.b:
            dh.filter_instances(args.b)
        if args.punctuation:
            dh.filter_punctuation()
        if args.us:
            dh.normalize_usernames()
        if args.ur:
            dh.normalize_urls()
        dh.write_csv(outfile)


if __name__ == "__main__":
    main()