# run.py
import numpy as np
import pandas as pd
import json
import os
import sys
import time
import argparse
import http.server
import socketserver
from collections import OrderedDict
def read_data(df_list=False, df_topic_doc=False, df_topic_word=False, df_topic_tf=False):
"""
Choose only one set of data to return as one or a number of pandas.DataFrame
object(s).
It is important to set only one argument to True and call this function
when needed.
Args:
df_list -- if set to True, returns a list containing the metadata of
Full Disclosure emails of every month in a year
df_topic_doc -- if set to True, returns a list containing the Topic-Document
matrixes of every month in a year
df_topic_word -- if set to True, returns a list containing the Topic-Word
(or Topic-Term) matrixes of every month in a year
df_topic_tf -- if set to True, returns a pandas.DataFrame object showing
the similarity scores of some topics between every two
months
Returns:
Depending on which one argument is set to True, the function returns either
a list of 12 pandas.DataFrame objects representing the relevent information
of 12 months in a year, or one pandas.DataFrame object representing the
similarity scores of a year.
"""
month_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_index_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    if df_list:
df_list = []
for month_index in month_index_list:
for folder in os.listdir(path_meta):
if folder.endswith('_' + month_index):
path_folder = os.path.join(path_meta, folder)
file_csv = [x for x in os.listdir(path_folder) if x.endswith('.csv')][0]
path_csv = os.path.join(path_folder, file_csv)
df_list.append(pd.read_csv(path_csv))
return df_list
    # Load a list of data frames, one table per month from the dtm folder
    if df_topic_doc:
df_topic_doc = []
for month in month_list:
filename = month + ".csv"
path_file = os.path.join(path_dtm, filename)
df_topic_doc.append(
pd.read_csv(path_file,
index_col= 0)
)
        # Append '.txt' to the index (row names) of each document-topic matrix
df_topic_doc_modified = []
for month_ix in range(len(month_list)):
df_tmp = df_topic_doc[month_ix].copy()
if not df_tmp.index.tolist()[0].endswith('.txt'):
df_tmp.index = [x + '.txt' for x in df_tmp.index.tolist()]
df_topic_doc_modified.append(df_tmp)
del df_tmp
return df_topic_doc_modified
    if df_topic_word:
df_topic_word = []
for month in month_list:
filename = month + ".csv"
path_file = os.path.join(path_ttm, filename)
df_topic_word.append(
pd.read_csv(path_file,
index_col= 0)
)
return df_topic_word
    if df_topic_tf:
df_topic_tf = pd.read_csv(path_topic_tf)
return df_topic_tf
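# A minimal sketch of how read_data is typically called (hypothetical; the
# module-level path variables path_meta, path_dtm, path_ttm and path_topic_tf
# must already be set, as done in the __main__ block below):
#   df_list      = read_data(df_list=True)        # 12 metadata DataFrames, Jan..Dec
#   df_topic_doc = read_data(df_topic_doc=True)   # 12 Topic-Document matrices
#   df_topic_tf  = read_data(df_topic_tf=True)    # one similarity DataFrame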
def transform_doc(project_name, path_doc, path_meta, doc_extension):
"""
Transform Full Disclosure email documents from .txt formats into
JavaScript format that TopicFlow can read.
Args:
project_name -- name of the new project
path_doc -- path of documents directory
Returns:
a dictionary that maps document id with .txt file name that will be
used in transform_bins
Outcome:
"Doc.js"
"""
### READ METADATA and INITIATE MONTH INDEX LIST
df_list = read_data(df_list=True)
month_index_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
### DATA TRANSFORMATION
    # initialize the main dictionary for Doc.js and a dictionary that maps
    # document ids to .txt file names
    tweet_data = {}    # holds every document entry written to Doc.js
    tweet_id_txt = {}  # passed on to transform_bins
    # find documents
    id_pointer = 1  # tweet_id starts at 1
    # collect the month sub-folders in calendar order, skipping anything that does
    # not match (a stray file such as .DS_Store would otherwise break the pipeline)
path_docs = []
for subfolder_ix in month_index_list:
for subfolder in os.listdir(path_doc):
if subfolder.endswith('_' + subfolder_ix):
path_docs.append(subfolder)
for month_ix, folder in enumerate(path_docs):
tweet_id_txt[str(month_ix)] = {}
tweet_id_txt[str(month_ix)]['id'] = []
tweet_id_txt[str(month_ix)]['txt'] = []
path_folder = os.path.join(path_doc, folder)
# read .txt files with the user-specified extension
txt_list = [x for x in os.listdir(path_folder) if x.endswith(doc_extension)]
# find .txt files that match their metadata entries
for txt in txt_list:
txt_entry_elements = txt.split('.')[0].split('_') # looks like ['2005', 'Jan', '0']
#txt_entry_elements[1] = folder[-2:] # looks like ['2005', '01', '0']
txt_entry = '_'.join(txt_entry_elements) # looks like '2005_Jan_0', use this to find document metadata in .csv file
# only record an entry if there's a match between .txt file and metadata,
# and the file is readable.
#print(txt)
try:
row = df_list[month_ix][df_list[month_ix]['id'] == txt_entry] # the row of one text file in metadata
author = row['author'].values[0]
date = pd.to_datetime(row['date']).apply(lambda x: str(x.month) + '/' + str(x.day) + '/' + str(x.year) + ' ' + str(x.hour) + ':' + str(x.minute)).values[0]
with open(os.path.join(path_folder, txt), 'r',
encoding='latin1') as textfile: # notice the encoding
                    text = textfile.read().replace('"','').replace('http://','').replace('\\','').replace('\n','')  # strip characters that would break the generated JS
# populate content
tweet_data[str(id_pointer)] = {}
tweet_id_txt[str(month_ix)]['id'].append(id_pointer)
tweet_id_txt[str(month_ix)]['txt'].append(txt.split('.')[0] + '.txt')
tweet_data[str(id_pointer)]['tweet_id'] = id_pointer
tweet_data[str(id_pointer)]['author'] = author
tweet_data[str(id_pointer)]['tweet_date'] = date
tweet_data[str(id_pointer)]['text'] = text
id_pointer += 1
# if for any reason the above "try" fails, we don't record
            except Exception:
                # here you could log the files that cannot be parsed,
                # e.g. print('Unable to parse ' + txt)
                pass
# transform body into .json format
json_tmp = json.dumps(tweet_data)
# transform into .js format that TopicFlow can read
    prefix = 'function populate_tweets_' + project_name + '(){\nvar tweet_data ='
    suffix = ';\nreadTweetJSON(tweet_data);\n}'
    doc_js = prefix + json_tmp + suffix
### WRITE
# make a directory named after the project name
    if not os.path.isdir(os.path.join(path_tf, 'data', project_name)):
        os.mkdir(os.path.join(path_tf, 'data', project_name))
# write
with open(os.path.join(path_tf, 'data', project_name, 'Doc.js'), 'w') as file:
file.write(doc_js)
print('\nDoc.js created, 20% complete.')
return tweet_id_txt
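# A hypothetical end-to-end call of transform_doc (the paths and project name are
# illustrative only, and the module-level variables path_tf and path_meta are
# assumed to be set as in the __main__ block):
#   tweet_id_txt = transform_doc('FD2014',
#                                os.path.expanduser('~/2014.parsed'),
#                                os.path.expanduser('~/2014.metadata'),
#                                '.reply.body.txt')
#   # writes data/FD2014/Doc.js and returns a mapping such as
#   # tweet_id_txt['0'] == {'id': [1, 2, ...], 'txt': ['2014_Jan_0.txt', ...]}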
def transform_bins(project_name, path_doc, path_meta, path_dtm, path_ttm, path_topic_tf, tweet_id_txt):
"""
Transform LDA-genereted Topic-document matrixes and Topic-word matrixes
into JavaScript format that TopicFlow can read.
Args:
project_name -- name of the new project
path_doc -- path of documents directory
path_meta -- path of documents metadata directory
path_dtm -- path of Document_Topic_Matrix directory
path_ttm -- path of Topic_Term_Matrix directory
path_topic_tf -- path of topicflow similarity file
tweet_id_txt -- a dictionary that maps document id with .txt file name
generated by transform_doc
Outcome:
"Bins.js"
"""
### DEFINE month_list, READ DATA
month_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# read df_list
df_list = read_data(df_list=True)
    # read topic-doc data sets
df_topic_doc = read_data(df_topic_doc=True)
# read topic-word data sets
df_topic_word = read_data(df_topic_word=True)
### DATA TRANSFORMATION - 1
# initiate bins, each month is one bin, each bin is also a dictionary
bin_dict = {}
for month_ix in range(len(month_list)):
bin_dict[str(month_ix)] = {}
# populate bin_id
for month_ix in range(len(month_list)):
bin_dict[str(month_ix)]['bin_id'] = month_ix
# populate tweet_ids
for month_ix in range(len(month_list)):
bin_dict[str(month_ix)]['tweet_Ids'] = tweet_id_txt[str(month_ix)]['id']
    # populate start_time & end_time
    # using the dates in df_list, find the earliest and latest tweet in each month
    # and format them as "month/day/year hour:minute" strings
for month_ix in range(len(month_list)):
bin_dict[str(month_ix)]['start_time'] = pd.to_datetime(df_list[month_ix].date).sort_values().apply(lambda x: str(x.month) + '/' + str(x.day) + '/' + str(x.year) + ' ' + str(x.hour) + ':' + str(x.minute)).tolist()[0]
for month_ix in range(len(month_list)):
bin_dict[str(month_ix)]['end_time'] = pd.to_datetime(df_list[month_ix].date).sort_values().apply(lambda x: str(x.month) + '/' + str(x.day) + '/' + str(x.year) + ' ' + str(x.hour) + ':' + str(x.minute)).tolist()[-1]
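    # (The manual month/day/hour/minute concatenation above yields un-padded values
    # such as '1/3/2014 9:5'; an equivalent strftime call would need non-portable
    # padding flags like '%-m' (POSIX) or '%#m' (Windows), so the explicit string
    # build is kept.)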
# initiate topic_model
for month_ix in range(len(month_list)):
bin_dict[str(month_ix)]['topic_model'] = {}
# add 4 sub dictionaries
bin_dict[str(month_ix)]['topic_model']['topic_doc'] = {}
bin_dict[str(month_ix)]['topic_model']['doc_topic'] = {}
bin_dict[str(month_ix)]['topic_model']['topic_word'] = {}
bin_dict[str(month_ix)]['topic_model']['topic_prob'] = []
### DATA TRANSFORMATION - 2: POPULATE topic_model
    # topic_model is the hardest part: it has to be populated month by month,
    # entry by entry.
for month_ix in range(len(month_list)):
overlap = set(df_topic_doc[month_ix].index.tolist()) & set(tweet_id_txt[str(month_ix)]['txt'])
overlap = list(overlap)
df_topic_doc_overlap = df_topic_doc[month_ix].copy().loc[overlap, :]
# topic_prob & topic_doc
for prob in range(10):
bin_dict[str(month_ix)]['topic_model']['topic_prob'].append(str(month_ix) + '_' + str(prob))
# initiate topic_doc
bin_dict[str(month_ix)]['topic_model']['topic_doc'][str(month_ix) + '_' + str(prob)] = {}
overlap_id = [tweet_id_txt[str(month_ix)]['txt'].index(index_txtfile) for index_txtfile in df_topic_doc_overlap.index.tolist()]
overlap_id = [tweet_id_txt[str(month_ix)]['id'][index_tweetid] for index_tweetid in overlap_id]
for overlap_ix in range(len(overlap_id)):
bin_dict[str(month_ix)]['topic_model']['topic_doc'][str(month_ix) + '_' + str(prob)][str(overlap_id[overlap_ix])] = df_topic_doc_overlap[str(int(prob + 1))].tolist()[overlap_ix]
# doc_topic
overlap_id = [tweet_id_txt[str(month_ix)]['txt'].index(index_txtfile) for index_txtfile in df_topic_doc_overlap.index.tolist()]
overlap_id = [tweet_id_txt[str(month_ix)]['id'][index_tweetid] for index_tweetid in overlap_id]
for overlap_ix2 in range(len(overlap_id)):
row = df_topic_doc_overlap.iloc[overlap_ix2, :].tolist()
bin_dict[str(month_ix)]['topic_model']['doc_topic'][str(overlap_id[overlap_ix2])] = {}
for row_ix in range(len(row)):
bin_dict[str(month_ix)]['topic_model']['doc_topic'][str(overlap_id[overlap_ix2])][str(month_ix) + '_' + str(row_ix)] = row[row_ix]
# topic_word
for topic_word_ix in range(10):
name = str(month_ix) + '_' + str(topic_word_ix)
bin_dict[str(month_ix)]['topic_model']['topic_word'][name] = {}
topwords = df_topic_word[month_ix].iloc[topic_word_ix].sort_values(ascending=False)[:10]
topwords = np.around(topwords, 17)
            # keep the 10 highest-weighted words for each topic, hence range(10)
for topword_ix in range(10):
bin_dict[str(month_ix)]['topic_model']['topic_word'][name][topwords.index[topword_ix]] = topwords.values[topword_ix]
        # delete df_topic_doc_overlap to avoid carrying it over to the next month and to save memory
del df_topic_doc_overlap
### TRANSFORM INTO JS FORMAT
# transform bin_dict into an ordered dictionary
bin_dict_ordered = {}
key_order = ('tweet_Ids','start_time','bin_id','topic_model','end_time')
for month_ix in range(len(month_list)):
tmp = OrderedDict()
for k in key_order:
tmp[k] = bin_dict[str(month_ix)][k]
bin_dict_ordered[str(month_ix)] = tmp
# transform body into .json format
json_tmp = json.dumps(bin_dict_ordered)
# transform into .js format that TopicFlow can read
    prefix = 'function populate_bins_' + project_name + '(){\nvar bin_data = '
    suffix = ';\nreadBinJSON(bin_data);\n}'
    bins_js = prefix + json_tmp + suffix
### WRITE
with open(os.path.join(path_tf, 'data', project_name, 'Bins.js'), 'w') as file:
file.write(bins_js)
print('Bins.js created, 40% complete.')
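# Roughly, each bin written to Bins.js has the following shape (the values shown
# are made up; 10 topics per month as hard-coded above):
#   {"tweet_Ids": [1, 2, ...], "start_time": "1/1/2014 0:12", "bin_id": 0,
#    "topic_model": {"topic_doc": {"0_0": {"1": 0.12, ...}, ...},
#                    "doc_topic": {"1": {"0_0": 0.12, ...}, ...},
#                    "topic_word": {"0_0": {"exploit": 0.031, ...}, ...},
#                    "topic_prob": ["0_0", "0_1", ..., "0_9"]},
#    "end_time": "1/31/2014 23:59"}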
def transform_topicSimilarity(project_name, path_topic_tf):
"""
Transform topic similarity matrix into JavaScript format
that TopicFlow can read.
Args:
project_name -- name of the new project
path_topic_tf -- path of topicflow similarity file
Outcome:
"TopicSimilarity.js"
"""
### DEFINE month_list, READ DATA
month_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
df_topic_tf = read_data(df_topic_tf=True)
### DATA TRANSFORMATION
# initiate a dictionary
sim_dict = {}
# populate nodes
# put topics into nodes, record their orders
nodes = []
for i in range(len(month_list)):
for j in range(10):
tmp = {}
name = str(i) + '_' + str(j)
            # the paper does not clearly define how the value of a topic should be
            # computed, so a random placeholder is used here
value = np.random.randint(1,100)
tmp['name'], tmp['value'] = name, value
nodes.append(tmp)
# populate links
# put source, target, value into links
links = []
for month_ix in range(len(month_list) - 1):
        # get unique pairs between each two consecutive months; 11 pairs in total
mm1, mm2 = month_list[month_ix], month_list[month_ix + 1]
sim = mm1 + '_' + mm2 + '_similarity'
df_tmp = df_topic_tf[[mm1, mm2, sim]].dropna(axis=0).drop_duplicates()
for row_ix in range(len(df_tmp)):
source = month_ix*10 + int(df_tmp[mm1].values[row_ix]) - 1
target = (month_ix+1)*10 + int(df_tmp[mm2].values[row_ix]) - 1
score = df_tmp[sim].values[row_ix] * 100 # 100 makes it neither too thin nor too thick
link_tmp = {}
link_tmp['source'], link_tmp['target'], link_tmp['value'] = source, target, score
links.append(link_tmp)
# put two lists into sim_dict
sim_dict['nodes'], sim_dict['links'] = nodes, links
### TRANSFORM INTO JS FORMAT
json_tmp = json.dumps(sim_dict)
# finally, transform into .js format that TopicFlow can read
    prefix = 'function populate_similarity_' + project_name + '(){\nvar sim_data = '
    suffix = ';\nreadSimilarityJSON(sim_data);\n}'
    topicSimilarity_js = prefix + json_tmp + suffix
### WRITE
with open(os.path.join(path_tf, 'data', project_name, 'TopicSimilarity.js'), 'w') as file:
file.write(topicSimilarity_js)
print('TopicSimilarity.js created, 60% complete.')
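# An illustrative fragment of the sim_dict written to TopicSimilarity.js (values
# are made up): topic nodes are named "<month>_<topic>" and links connect node
# indices of consecutive months, weighted by 100 * similarity:
#   {"nodes": [{"name": "0_0", "value": 42}, ...],
#    "links": [{"source": 3, "target": 14, "value": 65.2}, ...]}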
def modify_html(project_name, path_tf):
"""
Modify the content of \topicflow\index.html.
Two hand-added comments are used to locate the lines where new content can be
added. Executing the function would replace the existing index.html.
Args:
project_name -- name of the new project
path_tf -- path of topicflow directory
Outcome:
a modified "index.html" that includes a new project
"""
    # read the existing index.html and split it into lines
with open(os.path.join(path_tf, 'index.html'), 'r') as file:
html = file.read()
html_parse = html.split('\n')
# add new section after '<!-- add new section after this line -->'
ix = html_parse.index('<!-- add new section after this line -->')
new_section = '<script src="data/SHA/Doc.js"></script>\n<script src="data/SHA/Bins.js"></script>\n<script src="data/SHA/TopicSimilarity.js"></script>\n'.replace('SHA',project_name)
html_parse.insert(ix+1, new_section)
# add new selector after '<!-- add new dataset selector after this line -->'
ix = html_parse.index('\t\t\t<!-- add new dataset selector after this line -->')
new_selector = '\t\t\t<li id="{}"><a href="#">{}</a></li>'.format(project_name, project_name.replace('_', ' '))
html_parse.insert(ix+1, new_selector)
# replace existing index.html
html_combine = '\n'.join(html_parse)
os.remove(os.path.join(path_tf, 'index.html'))
with open(os.path.join(path_tf, 'index.html'), 'w') as file:
file.write(html_combine)
print('index.html modified, 80% complete.')
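# For a hypothetical project named FD2014, modify_html inserts a block like this
# right after the locator comment in index.html:
#   <script src="data/FD2014/Doc.js"></script>
#   <script src="data/FD2014/Bins.js"></script>
#   <script src="data/FD2014/TopicSimilarity.js"></script>
# plus one dataset selector entry:
#   <li id="FD2014"><a href="#">FD2014</a></li>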
def modify_controller(project_name, path_tf):
"""
Modify the content of \topicflow\scripts\controller.js.
Two hand-added comments are used to locate the lines where new content can be
added. Executing the function would replace the existing controller.js.
Args:
project_name -- name of the new project
path_tf -- path of topicflow directory
Outcome:
a modified "controller.js" that includes a new project
"""
    # read the existing controller.js and split it into lines
with open(os.path.join(path_tf, 'scripts', 'controller.js'), 'r') as file:
controller = file.read()
controller_parse = controller.split('\n')
# add idToName after '// add new idToName'
ix = controller_parse.index('\t\t\t\t\t// add new idToName')
new_idToName = '\t\t\t\t\t"{}":"{}",'.format(project_name, project_name.replace('_', ' '))
controller_parse.insert(ix+1, new_idToName)
# add selected dataset after '// add new selected dataset here'
ix = controller_parse.index('\t// add new selected dataset here')
new_selectedDataset = '\tif (selected_data==="SHA") {\n\t\tpopulate_tweets_SHA();\n\t\tpopulate_bins_SHA();\n\t\tpopulate_similarity_SHA();\n\t}'.replace('SHA', project_name)
controller_parse.insert(ix+1, new_selectedDataset)
# replace existing controller.js
controller_combine = '\n'.join(controller_parse)
os.remove(os.path.join(path_tf, 'scripts', 'controller.js'))
with open(os.path.join(path_tf, 'scripts', 'controller.js'), 'w') as file:
file.write(controller_combine)
print('controller.js modified, 100% complete.')
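# For the same hypothetical FD2014 project, modify_controller adds an idToName
# entry
#   "FD2014":"FD2014",
# and a dataset dispatch block of the form
#   if (selected_data==="FD2014") {
#       populate_tweets_FD2014();
#       populate_bins_FD2014();
#       populate_similarity_FD2014();
#   }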
def del_project(project_name_delete):
"""
Delete an existing project. Content of the project in index.html,
controller.js, and data/<project> folder will be deleted. The base project
"Full_Disclosure_2012" should not be deleted.
Args:
project_name_delete -- name of the project that should be deleted
Outcome:
Removal of an existing project or multiple existing projects.
"""
### DELETE CONTENT IN index.html
    # read the existing index.html and split it into lines
with open(os.path.join(path_tf, 'index.html'), 'r') as file:
html = file.read()
html_parse = html.split('\n')
# delete section after '<!-- add new section after this line -->'
ix_1 = html_parse.index('<!-- add new section after this line -->')
ix_2 = html_parse.index('<!-- end of adding new datasets. -->')
delete_ix = 0
for i_1 in range(ix_1, ix_2):
        # make sure only the exact project is deleted; other projects whose names
        # merely contain this name must be left alone (hence the exact length check
        # against the '<script src="data/<name>/Doc.js"></script>' line)
if project_name_delete in html_parse[i_1] and 'Doc.js' in html_parse[i_1] and len(html_parse[i_1]) == 36+len(project_name_delete):
delete_ix = i_1
for i_2 in range(4): # there are 4 lines for each project section, and we don't want to delete the end line
if not delete_ix == 0:
html_parse.pop(delete_ix)
# delete dataset selector after '<!-- add new dataset selector after this line -->'
ix_1 = html_parse.index('\t\t\t<!-- add new dataset selector after this line -->')
ix_2 = html_parse.index('\t\t\t<!-- end of adding new dataset selector -->')
    for i_3 in range(ix_1, ix_2):
        if 'id="' + project_name_delete + '"' in html_parse[i_3]:
            html_parse.pop(i_3)
            break  # stop here: popping while iterating would otherwise shift the remaining indices
# replace existing index.html
html_combine = '\n'.join(html_parse)
os.remove(os.path.join(path_tf, 'index.html'))
with open(os.path.join(path_tf, 'index.html'), 'w') as file:
file.write(html_combine)
### DELETE CONTENT IN controller.js
    # read the existing controller.js and split it into lines
with open(os.path.join(path_tf, 'scripts', 'controller.js'), 'r') as file:
controller = file.read()
controller_parse = controller.split('\n')
# delete idToName after '// add new idToName'
ix_1 = controller_parse.index('\t\t\t\t\t// add new idToName')
ix_2 = controller_parse.index('\t\t\t\t\t"Full_Disclosure_2012":"Full Disclosure 2012"')
    for i_4 in range(ix_1, ix_2):
        if '"' + project_name_delete + '"' in controller_parse[i_4]:
            controller_parse.pop(i_4)
            break  # stop here: popping while iterating would otherwise shift the remaining indices
# delete selected dataset after '// add new selected dataset here'
ix_1 = controller_parse.index('\t// add new selected dataset here')
ix_2 = controller_parse.index('\t// end of adding new selected datasets')
delete_ix = 0
for i_5 in range(ix_1, ix_2):
if '"' + project_name_delete + '"' in controller_parse[i_5]:
delete_ix = i_5
for i_6 in range(5): # there are 5 lines for each selected dataset, and we don't want to delete the end line
if not delete_ix == 0:
controller_parse.pop(delete_ix)
# replace existing controller.js
controller_combine = '\n'.join(controller_parse)
os.remove(os.path.join(path_tf, 'scripts', 'controller.js'))
with open(os.path.join(path_tf, 'scripts', 'controller.js'), 'w') as file:
file.write(controller_combine)
    ### DELETE THE data/<project_name_delete> FOLDER
    # delete the three .js files
for js_file in os.listdir(os.path.join(path_tf, 'data', project_name_delete)):
os.remove(os.path.join(path_tf, 'data', project_name_delete, js_file))
# delete project folder
os.rmdir(os.path.join(path_tf, 'data', project_name_delete))
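    # Note: an equivalent cleanup would be a single
    # shutil.rmtree(os.path.join(path_tf, 'data', project_name_delete)) call (at the
    # cost of importing shutil); the explicit remove/rmdir pair above assumes the
    # project folder only ever contains the three generated .js files.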
if __name__ == "__main__":
    # record the path of the topicflow directory (the directory containing run.py)
    path_tf = os.path.dirname(sys.argv[0])
    if len(path_tf) == 0:
        path_tf = '.'
### ARGPARSE
    parser = argparse.ArgumentParser(prog='run.py',
                                     description='This script converts PERCEIVE\'s topicflow R package output into the topicflowviz format and visualizes it on localhost. Added projects can later be visualized again by run.py, unless explicitly deleted.',
                                     epilog='Example of adding a new project: python run.py -a "FD2014" "/**/2014.parsed" "/**/2014.metadata" ".reply.body.txt" "/**/dtm" "/**/ttm" "/**/topic_flow.csv"')
    parser.add_argument('-a', '--add', type=str, nargs='+',
                        help='Add a new project. Specify all of the following items (an example is given for each): [project name - "FD2014", path of document folder - "/**/2014.parsed", path of document metadata folder - "/**/2014.metadata", document extension - ".reply.body.txt", path of Document Topic Matrix folder - "/**/dtm", path of Topic Term Matrix folder - "/**/ttm", path of Topic Flow Similarity file - "/**/topic_flow.csv"], 7 items in total.')
    parser.add_argument('-d', '--delete', type=str, nargs='+',
                        help='Delete one or more existing projects. Give the name(s) of the project(s) to delete in double quotes. The base project "Full_Disclosure_2012" should not be deleted. Single deletion example: python run.py -d "FD2014". Multiple deletion example: python run.py -d "FD2014" "FD2015".')
parser.add_argument('-s', '--show', help='Show existing projects',
action="store_true")
args = parser.parse_args()
# show existing projects
if args.show:
existing_projects = []
data_dir = os.path.join(path_tf, 'data')
for existing_project in os.listdir(data_dir):
if os.path.isdir(os.path.join(path_tf, 'data', existing_project)):
existing_projects.append(existing_project.replace('_', ' '))
print('Existing projects:\n', existing_projects)
    # delete one or more existing projects
elif args.delete:
for arg_del in args.delete:
if os.path.isdir(os.path.join(path_tf, 'data', arg_del.replace(' ', '_'))):
project_name_delete = arg_del.replace(' ', '_')
del_project(project_name_delete)
if len(args.delete) == 1:
print('Project successfully deleted.')
        elif len(args.delete) > 1:
print('Projects successfully deleted.')
# add a new project
elif args.add:
project_name = args.add[0]
path_doc = args.add[1]
path_meta = args.add[2]
doc_extension = args.add[3]
path_dtm = args.add[4]
path_ttm = args.add[5]
path_topic_tf = args.add[6]
        # replace spaces in the project name with underscores
project_name = project_name.replace(' ', '_')
        # expand '~' to the user's home directory in every path
path_doc = os.path.expanduser(path_doc)
path_meta = os.path.expanduser(path_meta)
path_dtm = os.path.expanduser(path_dtm)
path_ttm = os.path.expanduser(path_ttm)
path_topic_tf = os.path.expanduser(path_topic_tf)
time_start = time.time()
if os.path.isdir(path_doc) and os.path.isdir(path_meta) and os.path.isdir(path_dtm) and os.path.isdir(path_ttm) and os.path.isfile(path_topic_tf):
print('\nData transformation started...')
tweet_id_txt = transform_doc(project_name, path_doc, path_meta, doc_extension)
transform_bins(project_name, path_doc, path_meta, path_dtm, path_ttm, path_topic_tf, tweet_id_txt)
transform_topicSimilarity(project_name, path_topic_tf)
modify_html(project_name, path_tf)
modify_controller(project_name, path_tf)
print('\nTotal time taken:', str(round(time.time() - time_start, 2)), 'seconds.\n')
else:
            print('\nData transformation failed because one or more of the paths in the arguments do not exist; serving the existing projects instead...')
### INVOKE SERVER
PORT = np.random.randint(9000, 10000)
# change the working directory to topicflow
os.chdir(path_tf)
Handler = http.server.SimpleHTTPRequestHandler
with socketserver.TCPServer(("", PORT), Handler) as httpd:
print("serving at port", PORT)
httpd.serve_forever()
else:
### INVOKE SERVER
PORT = np.random.randint(9000, 10000)
# change the working directory to topicflow
os.chdir(path_tf)
Handler = http.server.SimpleHTTPRequestHandler
with socketserver.TCPServer(("", PORT), Handler) as httpd:
print("serving at port", PORT)
httpd.serve_forever()
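# Illustrative command lines (the paths are placeholders taken from the argparse
# help text above):
#   python run.py -s
#   python run.py -a "FD2014" "/**/2014.parsed" "/**/2014.metadata" ".reply.body.txt" "/**/dtm" "/**/ttm" "/**/topic_flow.csv"
#   python run.py -d "FD2014"
# After adding a project (or when called with no arguments) the script serves the
# TopicFlow UI on a random localhost port between 9000 and 9999.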