
Commit f2b8417

added scripts for data preprocessing

1 parent 05065a6 · commit f2b8417

File tree

4 files changed, +1198 -0 lines


README.md (+2)

@@ -20,6 +20,8 @@ The pre-fitted embeddings can be found below:
 
 https://bitbucket.org/diengadji/embeddings/src
 
+All the scripts to pre-process a dataset can be found in the folder 'scripts'.
+
 ## Example
 
 To run the DETM on the ACL dataset you can run the command below. You can specify different values for other arguments; peek at the arguments list in main.py.

scripts/data_acl.py (+335)

@@ -0,0 +1,335 @@
import csv
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle
import random
from scipy import sparse
import itertools
from scipy.io import savemat, loadmat
import string
import os

# Maximum / minimum document frequency
max_df = 0.7
min_df = 10  # choose desired value for min_df
# Read meta-data
print('reading meta-data...')
all_pids = []
all_timestamps = []

with open('raw/acl_abstracts/acl_data-combined/paper_metadata.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    line_count = 0
    for row in csv_reader:
        if line_count > 0:  # skip the header row
            all_pids.append(row[0])
            all_timestamps.append(row[2][0:4])  # keep only the year (first 4 characters)
        line_count += 1

def remove_not_printable(in_str):
    return "".join([c for c in in_str if c in string.printable])
# Read raw data
print('reading raw data...')
docs = []
not_found = []
timestamps = []
for (pid, tt) in zip(all_pids, all_timestamps):
    path_read = 'raw/acl_abstracts/acl_data-combined/all_papers'
    path_read = os.path.join(path_read, pid + '.txt')
    if not os.path.isfile(path_read):
        not_found.append(pid)
    else:
        with open(path_read, 'rb') as f:
            doc = f.read().decode('utf-8', 'ignore')
            doc = (doc.lower()
                   .replace('\n', ' ')
                   .replace("’", " ")
                   .replace("'", " ")
                   .translate(str.maketrans(string.punctuation + "0123456789",
                                            ' ' * len(string.punctuation + "0123456789")))
                   .split())
            doc = [remove_not_printable(w) for w in doc if len(w) > 1]
        if len(doc) > 1:
            doc = " ".join(doc)
            docs.append(doc)
            timestamps.append(tt)
# Write as raw text
print('writing to text file...')
out_filename = './docs_processed.txt'
with open(out_filename, 'w') as f:
    for line in docs:
        f.write(line + '\n')

# Read stopwords
with open('stops.txt', 'r') as f:
    stops = f.read().split('\n')
# Create count vectorizer
print('counting document frequency of words...')
cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=None)
cvz = cvectorizer.fit_transform(docs).sign()
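# With min_df=10 and max_df=0.7, a word must appear in at least 10 documents
# and in no more than 70% of them to survive; .sign() clips the counts to 0/1,
# so the column sums of cvz below are document frequencies rather than raw
# term counts.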
# Get vocabulary
print('building the vocabulary...')
sum_counts = cvz.sum(axis=0)
v_size = sum_counts.shape[1]
sum_counts_np = np.zeros(v_size, dtype=int)
for v in range(v_size):
    sum_counts_np[v] = sum_counts[0, v]
word2id = dict([(w, cvectorizer.vocabulary_.get(w)) for w in cvectorizer.vocabulary_])
id2word = dict([(cvectorizer.vocabulary_.get(w), w) for w in cvectorizer.vocabulary_])
del cvectorizer
print('  initial vocabulary size: {}'.format(v_size))

# Sort elements in vocabulary
idx_sort = np.argsort(sum_counts_np)
vocab_aux = [id2word[idx_sort[cc]] for cc in range(v_size)]

# Filter out stopwords (if any)
vocab_aux = [w for w in vocab_aux if w not in stops]
print('  vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux)))

# Create dictionary and inverse dictionary
vocab = vocab_aux
del vocab_aux
word2id = dict([(w, j) for j, w in enumerate(vocab)])
id2word = dict([(j, w) for j, w in enumerate(vocab)])

# Create mapping of timestamps
all_times = sorted(set(timestamps))
time2id = dict([(t, i) for i, t in enumerate(all_times)])
id2time = dict([(i, t) for i, t in enumerate(all_times)])
time_list = [id2time[i] for i in range(len(all_times))]
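# For example, timestamps ['1999', '2001', '1999'] give all_times ['1999', '2001'],
# time2id {'1999': 0, '2001': 1} and time_list ['1999', '2001']; from here on each
# document carries the integer index of its time slice rather than the raw year.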
# Split in train/test/valid
print('tokenizing documents and splitting into train/test/valid...')
num_docs = cvz.shape[0]
trSize = int(np.floor(0.85*num_docs))
tsSize = int(np.floor(0.10*num_docs))
vaSize = int(num_docs - trSize - tsSize)
del cvz
idx_permute = np.random.permutation(num_docs).astype(int)

# Remove words not in train_data
vocab = list(set([w for idx_d in range(trSize) for w in docs[idx_permute[idx_d]].split() if w in word2id]))
word2id = dict([(w, j) for j, w in enumerate(vocab)])
id2word = dict([(j, w) for j, w in enumerate(vocab)])
print('  vocabulary after removing words not in train: {}'.format(len(vocab)))

docs_tr = [[word2id[w] for w in docs[idx_permute[idx_d]].split() if w in word2id] for idx_d in range(trSize)]
timestamps_tr = [time2id[timestamps[idx_permute[idx_d]]] for idx_d in range(trSize)]
docs_ts = [[word2id[w] for w in docs[idx_permute[idx_d+trSize]].split() if w in word2id] for idx_d in range(tsSize)]
timestamps_ts = [time2id[timestamps[idx_permute[idx_d+trSize]]] for idx_d in range(tsSize)]
docs_va = [[word2id[w] for w in docs[idx_permute[idx_d+trSize+tsSize]].split() if w in word2id] for idx_d in range(vaSize)]
timestamps_va = [time2id[timestamps[idx_permute[idx_d+trSize+tsSize]]] for idx_d in range(vaSize)]

print('  number of documents (train): {} [this should be equal to {} and {}]'.format(len(docs_tr), trSize, len(timestamps_tr)))
print('  number of documents (test): {} [this should be equal to {} and {}]'.format(len(docs_ts), tsSize, len(timestamps_ts)))
print('  number of documents (valid): {} [this should be equal to {} and {}]'.format(len(docs_va), vaSize, len(timestamps_va)))

# Remove empty documents
print('removing empty documents...')

def remove_empty(in_docs, in_timestamps):
    out_docs = []
    out_timestamps = []
    for ii, doc in enumerate(in_docs):
        if doc != []:
            out_docs.append(doc)
            out_timestamps.append(in_timestamps[ii])
    return out_docs, out_timestamps

def remove_by_threshold(in_docs, in_timestamps, thr):
    out_docs = []
    out_timestamps = []
    for ii, doc in enumerate(in_docs):
        if len(doc) > thr:
            out_docs.append(doc)
            out_timestamps.append(in_timestamps[ii])
    return out_docs, out_timestamps

docs_tr, timestamps_tr = remove_empty(docs_tr, timestamps_tr)
docs_ts, timestamps_ts = remove_empty(docs_ts, timestamps_ts)
docs_va, timestamps_va = remove_empty(docs_va, timestamps_va)

# Remove test documents with length=1
docs_ts, timestamps_ts = remove_by_threshold(docs_ts, timestamps_ts, 1)

# Split test set in 2 halves
print('splitting test documents in 2 halves...')
docs_ts_h1 = [[w for i, w in enumerate(doc) if i <= len(doc)/2.0-1] for doc in docs_ts]
docs_ts_h2 = [[w for i, w in enumerate(doc) if i > len(doc)/2.0-1] for doc in docs_ts]
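# Each test document is split by position: h1 keeps the first floor(len/2)
# tokens and h2 keeps the rest (e.g. a 5-token document puts indices 0-1 in h1
# and 2-4 in h2), presumably so that topic proportions can be inferred on one
# half and held-out likelihood evaluated on the other.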
# Getting lists of words and doc_indices
print('creating lists of words...')

def create_list_words(in_docs):
    return [x for y in in_docs for x in y]

words_tr = create_list_words(docs_tr)
words_ts = create_list_words(docs_ts)
words_ts_h1 = create_list_words(docs_ts_h1)
words_ts_h2 = create_list_words(docs_ts_h2)
words_va = create_list_words(docs_va)

print('  len(words_tr): ', len(words_tr))
print('  len(words_ts): ', len(words_ts))
print('  len(words_ts_h1): ', len(words_ts_h1))
print('  len(words_ts_h2): ', len(words_ts_h2))
print('  len(words_va): ', len(words_va))

# Get doc indices
print('getting doc indices...')

def create_doc_indices(in_docs):
    aux = [[j for i in range(len(doc))] for j, doc in enumerate(in_docs)]
    return [int(x) for y in aux for x in y]

doc_indices_tr = create_doc_indices(docs_tr)
doc_indices_ts = create_doc_indices(docs_ts)
doc_indices_ts_h1 = create_doc_indices(docs_ts_h1)
doc_indices_ts_h2 = create_doc_indices(docs_ts_h2)
doc_indices_va = create_doc_indices(docs_va)

print('  len(np.unique(doc_indices_tr)): {} [this should be {}]'.format(len(np.unique(doc_indices_tr)), len(docs_tr)))
print('  len(np.unique(doc_indices_ts)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts)), len(docs_ts)))
print('  len(np.unique(doc_indices_ts_h1)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1)))
print('  len(np.unique(doc_indices_ts_h2)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2)))
print('  len(np.unique(doc_indices_va)): {} [this should be {}]'.format(len(np.unique(doc_indices_va)), len(docs_va)))

# Number of documents in each set
n_docs_tr = len(docs_tr)
n_docs_ts = len(docs_ts)
n_docs_ts_h1 = len(docs_ts_h1)
n_docs_ts_h2 = len(docs_ts_h2)
n_docs_va = len(docs_va)

# Remove unused variables
del docs_tr
del docs_ts
del docs_ts_h1
del docs_ts_h2
del docs_va

# Create bow representation
print('creating bow representation...')

def create_bow(doc_indices, words, n_docs, vocab_size):
    return sparse.coo_matrix(([1]*len(doc_indices), (doc_indices, words)), shape=(n_docs, vocab_size)).tocsr()

bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab))
bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab))
bow_ts_h1 = create_bow(doc_indices_ts_h1, words_ts_h1, n_docs_ts_h1, len(vocab))
bow_ts_h2 = create_bow(doc_indices_ts_h2, words_ts_h2, n_docs_ts_h2, len(vocab))
bow_va = create_bow(doc_indices_va, words_va, n_docs_va, len(vocab))
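# Note that duplicate (doc, word) coordinates are summed when the COO matrix is
# converted to CSR, so repeated tokens become counts. For example,
#   create_bow([0, 0, 0, 1, 1], [2, 2, 5, 0, 5], 2, 6).toarray()
# gives
#   [[0, 0, 2, 0, 0, 1],
#    [1, 0, 0, 0, 0, 1]]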
del words_tr
del words_ts
del words_ts_h1
del words_ts_h2
del words_va
del doc_indices_tr
del doc_indices_ts
del doc_indices_ts_h1
del doc_indices_ts_h2
del doc_indices_va

# Write files for LDA C++ code
def write_lda_file(filename, timestamps_in, time_list_in, bow_in):
    idxSort = np.argsort(timestamps_in)

    # '-mult' file: one document per line, sorted by time slice, in the format
    # '<num_unique_terms> <term_id>:<count> <term_id>:<count> ...'
    with open(filename, "w") as f:
        for row in idxSort:
            x = bow_in.getrow(row)
            n_elems = x.count_nonzero()
            f.write(str(n_elems))
            if n_elems != len(x.indices) or n_elems != len(x.data):
                raise ValueError("[ERR] THIS SHOULD NOT HAPPEN")
            for ii, dd in zip(x.indices, x.data):
                f.write(' ' + str(ii) + ':' + str(dd))
            f.write('\n')

    # '-seq' file: number of time slices, then the number of documents in each slice
    with open(filename.replace("-mult", "-seq"), "w") as f:
        f.write(str(len(time_list_in)) + '\n')
        for idx_t, _ in enumerate(time_list_in):
            n_elem = len([t for t in timestamps_in if t == idx_t])
            f.write(str(n_elem) + '\n')


path_save = './min_df_' + str(min_df) + '/'
if not os.path.isdir(path_save):
    os.system('mkdir -p ' + path_save)
# Write files for LDA C++ code
print('saving LDA files for C++ code...')
write_lda_file(path_save + 'dtm_tr-mult.dat', timestamps_tr, time_list, bow_tr)
write_lda_file(path_save + 'dtm_ts-mult.dat', timestamps_ts, time_list, bow_ts)
write_lda_file(path_save + 'dtm_ts_h1-mult.dat', timestamps_ts, time_list, bow_ts_h1)
write_lda_file(path_save + 'dtm_ts_h2-mult.dat', timestamps_ts, time_list, bow_ts_h2)
write_lda_file(path_save + 'dtm_va-mult.dat', timestamps_va, time_list, bow_va)

# Also write the vocabulary and timestamps
with open(path_save + 'vocab.txt', "w") as f:
    for v in vocab:
        f.write(v + '\n')

with open(path_save + 'timestamps.txt', "w") as f:
    for t in time_list:
        f.write(t + '\n')

with open(path_save + 'vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
del vocab

with open(path_save + 'timestamps.pkl', 'wb') as f:
    pickle.dump(time_list, f)

# Save timestamps alone
savemat(path_save + 'bow_tr_timestamps', {'timestamps': timestamps_tr}, do_compression=True)
savemat(path_save + 'bow_ts_timestamps', {'timestamps': timestamps_ts}, do_compression=True)
savemat(path_save + 'bow_va_timestamps', {'timestamps': timestamps_va}, do_compression=True)

# Split bow into token/value pairs
print('splitting bow into token/value pairs and saving to disk...')
def split_bow(bow_in, n_docs):
    # per-document lists of vocabulary ids (CSR .indices) and their counts (CSR .data)
    indices = [[w for w in bow_in[doc, :].indices] for doc in range(n_docs)]
    counts = [[c for c in bow_in[doc, :].data] for doc in range(n_docs)]
    return indices, counts

bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr)
savemat(path_save + 'bow_tr_tokens', {'tokens': bow_tr_tokens}, do_compression=True)
savemat(path_save + 'bow_tr_counts', {'counts': bow_tr_counts}, do_compression=True)
del bow_tr
del bow_tr_tokens
del bow_tr_counts

bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts)
savemat(path_save + 'bow_ts_tokens', {'tokens': bow_ts_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_counts', {'counts': bow_ts_counts}, do_compression=True)
del bow_ts
del bow_ts_tokens
del bow_ts_counts

bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1)
savemat(path_save + 'bow_ts_h1_tokens', {'tokens': bow_ts_h1_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_h1_counts', {'counts': bow_ts_h1_counts}, do_compression=True)
del bow_ts_h1
del bow_ts_h1_tokens
del bow_ts_h1_counts

bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2)
savemat(path_save + 'bow_ts_h2_tokens', {'tokens': bow_ts_h2_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_h2_counts', {'counts': bow_ts_h2_counts}, do_compression=True)
del bow_ts_h2
del bow_ts_h2_tokens
del bow_ts_h2_counts

bow_va_tokens, bow_va_counts = split_bow(bow_va, n_docs_va)
savemat(path_save + 'bow_va_tokens', {'tokens': bow_va_tokens}, do_compression=True)
savemat(path_save + 'bow_va_counts', {'counts': bow_va_counts}, do_compression=True)
del bow_va
del bow_va_tokens
del bow_va_counts

print('Data ready !!')
print('*************')
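The saved token/count files are presumably what the DETM data loader reads at training time; a minimal sketch of loading them back with SciPy, assuming the script was run with the default min_df = 10 so its outputs sit under ./min_df_10/:

    from scipy.io import loadmat

    path = './min_df_10/'
    tokens = loadmat(path + 'bow_tr_tokens')['tokens'].squeeze()          # per-document vocabulary ids
    counts = loadmat(path + 'bow_tr_counts')['counts'].squeeze()          # matching per-document counts
    times = loadmat(path + 'bow_tr_timestamps')['timestamps'].squeeze()   # time-slice index per document
    print(len(tokens), len(counts), len(times))  # one entry per training document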

0 commit comments
