"""
Code for Information Retrieval course project @ University of Trieste, MSc in Data Science & Scientific Computing A.Y. 2023/2024.
Author: Michele Alessi
This file implements the IR class, which embeds the corpus in different latent spaces and computes the similarity between the queries in the .QRY file and the corpus in each latent space.
"""
import torch
import numpy as np
from datetime import date
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from src.utils import compute_pre_rec, custom_preprocessor
from src.autoencoder import AutoEncoder, train as train_ae, build_train_loader
from src.variational_autoencoder import VariationalAutoEncoder, train as train_vae
class IR:
def __init__(self, name, corpus, splitted_corpus, queries, splitted_queries, relevances):
"""
Args:
name: (str) Name of the IR object.
corpus: (list) List of strings. Each string is a document in the corpus.
splitted_corpus: (list) List of lists. Each list is a document in the corpus split into words.
queries: (list) List of strings. Each string is a query in the .QRY file.
splitted_queries: (list) List of lists. Each list is a query in the .QRY file split into words.
relevances: (list) List of lists. Each list is a list of relevant documents for the query in queries[i].
Attributes:
cname: (str) Name of the IR object.
file: (file) File where to write down the results.
len_corpus: (int) Number of documents in the corpus.
len_queries: (int) Number of queries in the .QRY file.
Embedding attributes:
tfidf_vectorizer: (sklearn.feature_extraction.text.TfidfVectorizer object) TfidfVectorizer object used to build the tfidf matrix.
tfidf_matrix: (scipy.sparse.csr.csr_matrix) TfidfVectorizer.fit_transform(corpus) output matrix.
wc_vectorizer: (sklearn.feature_extraction.text.CountVectorizer object) CountVectorizer object used to build the wordcount matrix.
wc_matrix: (scipy.sparse.csr.csr_matrix) CountVectorizer.fit_transform(corpus) output matrix.
matrix: (scipy.sparse.csr.csr_matrix) Updated by select_matrix_type(), it is the embedding matrix actually used to build the latent space.
matrix_type: (str) Updated by select_matrix_type(), it is the name of the matrix ('tfidf' or 'wc').
__security: (bool) Guard flag that prevents the user from changing the matrix once selected (avoids errors such as building the latent space with one matrix and computing the similarity with another).
SVD latent space attributes:
svd: (sklearn.decomposition._truncated_svd.TruncatedSVD object) Fitted TruncatedSVD transformer.
svd_latent_space: (numpy.ndarray) Output matrix representing the latent space of the corpus built with SVD.
svd_latent_space_computed: (bool) True if the latent space has been computed.
n_components: (int) Latent space dimension for SVD.
Autoencoder latent space attributes:
autoencoder: (src.autoencoder.AutoEncoder object) Model.
trained: (bool) True if the autoencoder has been trained.
autoencoder_latent_space: (numpy.ndarray) Output matrix representing the latent space of the corpus built with the trained autoencoder.
autoencoder_latent_space_computed: (bool) True if the latent space has been computed.
losses: (list) List of losses for each iteration during the autoencoder training.
z_dim: (int) Latent space dimension for Autoencoder.
Variational Autoencoder latent space attributes:
vautoencoder: (src.variational_autoencoder.VariationalAutoEncoder object) Model.
vtrained: (bool) True if the VAE has been trained.
vautoencoder_latent_space: (numpy.ndarray) Output matrix representing the latent space of the corpus built with the trained variational autoencoder.
vautoencoder_latent_space_computed: (bool) True if the latent space has been computed.
vlosses: (list) List of losses for each iteration during the variational autoencoder training.
vz_dim: (int) Latent space dimension for VAE.
Queries attributes:
similarity_matrix_svd: (numpy.ndarray) Output matrix representing the similarity between all the queries in .QRY file and the corpus in the SVD latent space.
similarity_matrix_autoencoder: (numpy.ndarray) Output matrix representing the similarity between all the queries in .QRY file and the corpus in the Autoencoder latent space.
similarity_matrix_vautoencoder: (numpy.ndarray) Output matrix representing the similarity between all the queries in .QRY file and the corpus in the Variational Autoencoder latent space.
Methods:
__str__: (str) Return a string with the general information about the IR object.
__getattr__: (str) Return an error message if an attribute is not found.
__getattribute__: Default attribute access (delegates to object).
Embedding methods:
build_tfidf_matrix: (None) Build the tfidf matrix.
build_wc_matrix: (None) Build the wordcount matrix.
select_matrix_type: (None) Select the matrix type to use.
SVD methods:
build_svd_latent_space: (None) Build the latent space using svd decomposition.
process_query_svd: (numpy.ndarray) Compute the similarity between the queries in .QRY file and the corpus in the SVD latent space.
Autoencoder methods:
build_autoencoder: (None) Declare the autoencoder architecture.
train_autoencoder: (None) Train the autoencoder.
autoencoder_purge: (None) Delete the autoencoder.
build_autoencoder_latent_space: (None) Build the latent space using the trained autoencoder.
process_query_autoencoder: (numpy.ndarray) Compute the similarity between the queries in .QRY file and the corpus in the Autoencoder latent space.
save_model: (None) Save the trained model.
Variational Autoencoder methods:
build_variational_autoencoder: (None) Declare the variational autoencoder architecture.
train_variational_autoencoder: (None) Train the variational autoencoder.
variational_autoencoder_purge: (None) Delete the variational autoencoder.
build_variational_autoencoder_latent_space: (None) Build the latent space using the trained variational autoencoder.
process_query_vautoencoder: (numpy.ndarray) Compute the similarity between the queries in .QRY file and the corpus in the Variational Autoencoder latent space.
complete_queries_info: (None) Print all the information about the queries in the .QRY file (works only on the full query set, not on a single free-text query).
"""
self.cname = name
self.file = open(f"./out/{self.cname}.txt", "a")
self.file.write(f"Author: Michele Alessi\n")
today = date.today()
self.file.write(f"Date: {today}\n\n")
# corpus attributes
self.corpus = corpus
self.len_corpus = len(corpus)
self.splitted_corpus = splitted_corpus
self.queries = queries
self.len_queries = len(queries)
self.splitted_queries = splitted_queries
self.relevances = relevances
############## corpus vectorization ##############
# matrix attributes
self.tfidf_vectorizer = None # sklearn.feature_extraction.text.TfidfVectorizer object
self.tfidf_matrix = None # TfidfVectorizer.fit_transform(corpus) output matrix
self.wc_vectorizer = None # sklearn.feature_extraction.text.CountVectorizer object
self.wc_matrix = None # CountVectorizer.fit_transform(corpus) output matrix
self.matrix = None # updated by select_matrix_type(), it is the actual matrix
self.matrix_type = None # updated by select_matrix_type(), it is the name of the matrix
self.__security = False # guard flag that prevents changing the matrix once selected
############# Latent space ##############
# latent space attributes: SVD
self.svd = None # functional
self.svd_latent_space = None # output matrix representing the latent space of the corpus
self.svd_latent_space_computed = False # True if the latent space has been computed
self.n_components = None # latent space dimension for SVD
# latent space attributes: Autoencoder
self.autoencoder = None # model
self.trained = False # True if the autoencoder has been trained
self.autoencoder_latent_space = None # output matrix representing the latent space of the corpus
self.autoencoder_latent_space_computed = False # True if the latent space has been computed
self.losses = None # list of losses recorded during training
self.z_dim = None # latent space dimension for Autoencoder
# latent space attributes: Variational Autoencoder
self.vautoencoder = None # model
self.vtrained = False # True if the VAE has been trained
self.vautoencoder_latent_space = None # output matrix representing the latent space of the corpus
self.vautoencoder_latent_space_computed = False # True if the latent space has been computed
self.vlosses = None # list of losses recorded during training
self.vz_dim = None # latent space dimension for VAE
############## Queries attributes ##############
self.similarity_matrix_svd = None # output matrix representing the similarity between all the queries in .QRY file and the corpus in the SVD latent space
self.similarity_matrix_autoencoder = None # output matrix representing the similarity between all the queries in .QRY file and the corpus in the Autoencoder latent space
self.similarity_matrix_vautoencoder = None # output matrix representing the similarity between all the queries in .QRY file and the corpus in the VAE latent space
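# A minimal construction sketch (hedged: load_corpus and the file names below are
# hypothetical, not part of this module):
#   corpus, splitted_corpus, queries, splitted_queries, relevances = load_corpus("MED.ALL", "MED.QRY", "MED.REL")
#   ir = IR("med_experiment", corpus, splitted_corpus, queries, splitted_queries, relevances)
#   print(ir)  # general status and the list of possible actions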
def __str__(self):
"""
Return a string with the general information about the IR object.
"""
status = (
f"General information:\n"
f"IR object with {self.len_corpus} documents and {self.len_queries} queries.\n")
mat_info = (f"Current matrix-type (i.e. text-embedding type: tfidf, wc) in use is: {self.matrix_type}.\n"
f"Matrix shape: {self.matrix.shape} ---> (# docs: {self.matrix.shape[0]}, # words: {self.matrix.shape[1]})\n\n"
if self.matrix is not None else ""
)
svd_info = (
f"SVD information:\n"
f"SVD latent space: {self.svd_latent_space_computed}\n"
f"SVD latent space dimension: {self.n_components}\n\n"
if self.svd_latent_space_computed else ""
)
ae_info = ("Autoencoder information:\n"
f"Autoencoder trained: {self.trained}\n"
f"Autoencoder latent space: {self.autoencoder_latent_space_computed}\n"
f"Autoencoder latent space dimension: {self.z_dim}\n"
f"Autoencoder architecture: {self.autoencoder}\n"
f"Losses: {self.lossess[-1]}\n"
if self.trained else ""
)
vae_info = ("Variational Autoencoder information:\n"
f"Variational Autoencoder trained: {self.vtrained}\n"
f"Variational Autoencoder latent space: {self.vautoencoder_latent_space_computed}\n"
f"Variational Autoencoder latent space dimension: {self.vz_dim}\n"
f"Variational Autoencoder architecture: {self.vautoencoder}\n"
f"Losses: {self.vlossess[-1]}\n"
if self.vtrained else ""
)
qinfo = (
f"\nQueries:\n"
f"Similiraty with .QRY queries computed in SVD latent space under {self.matrix_type} text embedding: {self.similarity_matrix_svd is not None}\n"
f"Similarity with .QRY queries computed in Autoencoder latent space under {self.matrix_type} text embedding: {self.similarity_matrix_autoencoder is not None}\n "
f"Similarity with .QRY queries computed in Variational Autoencoder latent space under {self.matrix_type} text embedding: {self.similarity_matrix_vautoencoder is not None}\n"
"To have complete information about .QRY queries, call complete_queries_info()\n"
)
possible_actions = (
f"\nPossible actions:\n"
f"1. Build tfidf matrix: build_tfidf_matrix()\n"
f"2. Build wc matrix: build_wc_matrix()\n"
f"3. Select matrix type: select_matrix_type('tfidf' or 'wc')\n"
f"4. Build SVD latent space: build_svd_latent_space(n_components)\n"
f"5. Build autoencoder: build_autoencoder(input_dim, h_dim_1, h_dim_2, h_dim_3, z_dim)\n"
f"6. Train autoencoder: train_autoencoder(BATCH_SIZE, NUM_EPOCHS, loss_fn, lr)\n"
f"7. Build autoencoder latent space: build_autoencoder_latent_space()\n"
f"8. Process query in SVD latent space: process_query_svd(query)\n"
f"9. Process query in Autoencoder latent space: process_query_autoencoder(query)\n"
f"10. Retrieve documents: retrieve_documents(similarity_matrix, n_docs)\n"
)
return status + mat_info + svd_info + ae_info + vae_info + qinfo + possible_actions
def complete_queries_info(self, write_down = False):
"""
Print all the information about the queries in the .QRY file (works only on the full query set, not on a single free-text query).
"""
q_info = ""
if self.similarity_matrix_svd is not None:
result_svd, precision_svd, recall_svd = self.retrieve_documents(self.similarity_matrix_svd, verbose=False)
q_info += "\nSVD info:\n"
q_info += "\n".join([f"Query {i}: P: {precision_svd[i]} R: {recall_svd[i]}" for i in range(len(self.similarity_matrix_svd))])
if write_down:
self.file.write("SVD queries info:\n")
self.file.write("for each query, the top 15 documents retrieved are listed in the form: doc_id, score\n\n")
self.file.write("query_id, doc_id, score\n")
for i, docs in enumerate(result_svd):
for j, score in docs.items():
self.file.write(f"{i+1}, {j}, {score}\n")
self.file.write("\n\n")
if self.similarity_matrix_autoencoder is not None:
result_autoencoder, precision_autoencoder, recall_autoencoder = self.retrieve_documents(self.similarity_matrix_autoencoder, verbose=False)
q_info += "\nAutoEncoder info:\n"
q_info += "\n".join([f"Query {i}: P: {precision_autoencoder[i]} R: {recall_autoencoder[i]}" for i in range(len(self.similarity_matrix_autoencoder))])
if write_down:
self.file.write("AutoEncoder queries info:\n")
self.file.write("for each query, the top 15 documents retrieved are listed in the form: doc_id, score\n\n")
self.file.write("query_id, doc_id, score\n")
for i, docs in enumerate(result_autoencoder):
for j, score in docs.items():
self.file.write(f"{i+1}, {j}, {score}\n")
self.file.write("\n\n")
if self.similarity_matrix_vautoencoder is not None:
result_vautoencoder, precision_vautoencoder, recall_vautoencoder = self.retrieve_documents(self.similarity_matrix_vautoencoder, verbose=False)
q_info += "\nVariationalAutoEncoder info:\n"
q_info += "\n".join([f"Query {i}: P: {precision_vautoencoder[i]} R: {recall_vautoencoder[i]}" for i in range(len(self.similarity_matrix_vautoencoder))])
if write_down:
self.file.write("VariationalAutoEncoder queries info:\n")
self.file.write("for each query, the top 15 documents retrieved are listed in the form: doc_id, score\n\n")
self.file.write("query_id, doc_id, score\n")
for i, docs in enumerate(result_vautoencoder):
for j, score in docs.items():
self.file.write(f"{i+1}, {j}, {score}\n")
self.file.write("\n\n")
if self.similarity_matrix_svd is None and self.similarity_matrix_autoencoder is None and self.similarity_matrix_vautoencoder is None:
raise ValueError("No similarity matrix computed yet. Call process_query_svd() or process_query_autoencoder() or process_query_vautoencoder() first.")
print("Queries information:\n", q_info)
print("\n\nAverage precision and recall:\n")
if self.similarity_matrix_svd is not None:
print(f"SVD: P: {np.mean(precision_svd)} R: {np.mean(recall_svd)}")
if self.similarity_matrix_autoencoder is not None:
print(f"AutoEncoder: P: {np.mean(precision_autoencoder)} R: {np.mean(recall_autoencoder)}")
if self.similarity_matrix_vautoencoder is not None:
print(f"VariationalAutoEncoder: P: {np.mean(precision_vautoencoder)} R: {np.mean(recall_vautoencoder)}")
def select_matrix_type(self, matrix_type):
"""
Select the matrix type to use (tfidf or wc).
Args:
matrix_type: (str) 'tfidf' or 'wc'
"""
if self.__security is False:
if matrix_type == "tfidf":
if self.tfidf_matrix is None:
raise ValueError("Tfidf matrix not built yet. Call build_tfidf_matrix() first.")
self.matrix = self.tfidf_matrix
self.matrix_type = "tfidf"
self.__security = True
print("Tfidf matrix selected.\n")
elif matrix_type == "wc":
if self.wc_matrix is None:
raise ValueError("Wordcount matrix not built yet. Call build_wc_matrix() first.")
self.matrix = self.wc_matrix
self.matrix_type = "wc"
self.__security = True
print("Wordcount matrix selected.\n")
else:
raise ValueError("Matrix type not supported.")
self.file.write(f"Current matrix-type (i.e. text-embedding type: tfidf, wc) in use is: {self.matrix_type}.\n")
self.file.write(f"Matrix shape: {self.matrix.shape} ---> (# docs: {self.matrix.shape[0]}, # words: {self.matrix.shape[1]})\n\n")
else:
raise ValueError(f"Matrix type already selected: {self.matrix_type}.\n"
f"For code-reliability reason it is not possible to change it. If you want to, create a new IR object.")
def __getattr__(self, name):
# called only when normal attribute lookup fails: return an explanatory message instead of raising AttributeError
return f"Attribute '{name}' not found."
def __getattribute__(self, name):
# default attribute access, kept explicit for symmetry with __getattr__
return super().__getattribute__(name)
############## tfidf and wordcount matrices ##############
def build_tfidf_matrix(self, vectorizer=None):
"""
Fit the tfidf vectorizer on the corpus and build the tfidf matrix.
Args:
vectorizer: (sklearn.feature_extraction.text.TfidfVectorizer object) TfidfVectorizer object used to build the tfidf matrix. If None, a default one is created.
"""
if vectorizer is None:
# create the vectorizer here rather than as a default argument, so each IR object gets a fresh, unfitted instance
vectorizer = TfidfVectorizer(preprocessor=custom_preprocessor, stop_words='english')
print("Building tfidf matrix...\n")
self.tfidf_matrix = vectorizer.fit_transform(self.corpus)
self.tfidf_vectorizer = vectorizer
print("Done.\n")
def build_wc_matrix(self, vectorizer=None):
"""
Fit the wordcount vectorizer on the corpus and build the wordcount matrix.
Args:
vectorizer: (sklearn.feature_extraction.text.CountVectorizer object) CountVectorizer object used to build the wordcount matrix. If None, a default one is created.
"""
if vectorizer is None:
# create the vectorizer here rather than as a default argument, so each IR object gets a fresh, unfitted instance
vectorizer = CountVectorizer(preprocessor=custom_preprocessor, stop_words='english')
print("Building wc matrix...\n")
self.wc_matrix = vectorizer.fit_transform(self.corpus)
self.wc_vectorizer = vectorizer
print("Done.\n")
############## SVD latent space ##############
def build_svd_latent_space(self, n_components=100):
"""
Fit the TruncatedSVD on the matrix and build the latent space.
Args:
n_components: (int) Latent space dimension for SVD.
"""
if self.matrix is None:
raise ValueError("Matrix not set yet. Call select_matrix_type() first.")
print("Building latent space using svd decomposition...\n")
print(f"Current matrix in use is: {self.matrix_type}.\n")
self.svd = TruncatedSVD(n_components=n_components)
self.svd_latent_space = self.svd.fit_transform(self.matrix)
self.n_components = n_components # latent space dimension for SVD
self.svd_latent_space_computed = True
print("Done.\n")
# Queries similarity in SVD space
def process_query_svd(self, query=None):
"""
Process the query in the SVD latent space.
Args:
query: single string, or None; if None, the similarity between all the queries and the corpus will be computed
Returns:
result: output matrix representing the similarity between the query (resp. all the queries) and the corpus in the SVD latent space
"""
if self.svd_latent_space is None:
raise ValueError("SVD latent space not built yet. Call build_svd_latent_space() first to find the projection matrix.\n"
"Then process_query_svd() will use that matrix to project the queries in the latent space.")
print(f"Computing query-corpus similarity using SVD latent space...\n"
f"Embedding type: {self.matrix_type}\n"
f"Latent SVD dimension: {self.n_components}\n")
if self.matrix_type == "tfidf":
if isinstance(query, str):
queries_transformed = self.tfidf_vectorizer.transform([query])
elif query is None:
queries_transformed = self.tfidf_vectorizer.transform(self.queries)
else:
raise ValueError("Query must be either a string or None (in this case the similarity with all the available queries will be computed).")
elif self.matrix_type == "wc":
if isinstance(query, str):
queries_transformed = self.wc_vectorizer.transform([query])
elif query is None:
queries_transformed = self.wc_vectorizer.transform(self.queries)
else:
raise ValueError("Query must be either a string or None (in this case the similarity with all the available queries will be computed).")
else:
raise ValueError("Matrix type not declared. Call select_matrix_type() first.")
queries_reduced = self.svd.transform(queries_transformed)
similarity_matrix_svd = cosine_similarity(queries_reduced, self.svd_latent_space)
if query is None:
self.similarity_matrix_svd = similarity_matrix_svd
print("Done.\n")
return similarity_matrix_svd
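# A minimal SVD retrieval sketch (hedged: the dimension and the free-text query are illustrative):
#   ir.build_svd_latent_space(n_components=100)
#   sims = ir.process_query_svd()                     # (n_queries, n_docs) cosine similarities
#   one = ir.process_query_svd("a single test query") # (1, n_docs) for a free-text query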
############## Autoencoder latent space ##############
def build_autoencoder(self, input_dim, h_dim_1, h_dim_2, z_dim):
"""
Call to the AutoEncoder class to build the autoencoder model.
Args:
input_dim: (int) Number of neurons in the input layer.
h_dim_1: (int) Number of neurons in the first hidden layer.
h_dim_2: (int) Number of neurons in the second hidden layer.
z_dim: (int) Number of neurons in the latent layer.
"""
print("Declaring autoencoder architecture...\n")
self.autoencoder = AutoEncoder(input_dim=input_dim, h_dim_1=h_dim_1, h_dim_2=h_dim_2, z_dim=z_dim)
self.z_dim = z_dim # latent space dimension for Autoencoder
print("Done.\n")
def train_autoencoder(self, BATCH_SIZE, NUM_EPOCHS, loss_fn, lr):
"""
Call to the build_train_loader() function to build the train_loader.
Call to the train() function to train the autoencoder model.
Args:
BATCH_SIZE: (int) Batch size for the train_loader.
NUM_EPOCHS: (int) Number of training epochs.
loss_fn: (torch.nn.modules.loss) Loss function to use.
lr: (float) Learning rate.
"""
if self.matrix is None:
raise ValueError("Matrix not set yet. Call select_matrix_type() first.")
if self.autoencoder is None:
raise ValueError("Autoencoder not built yet. Call build_autoencoder() first.")
print(f"Current matrix in use is: {self.matrix_type}.\n")
print("Training autoencoder ...\n")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_loader = build_train_loader(self.matrix.toarray(), BATCH_SIZE)
self.losses = train_ae(NUM_EPOCHS=NUM_EPOCHS, train_loader=train_loader, model=self.autoencoder, loss_fn=loss_fn, lr=lr, DEVICE=DEVICE, INPUT_DIM=self.matrix.shape[1], scheduler=None)
self.trained = True
print("Done.\n")
def autoencoder_purge(self):
"""
Delete the autoencoder model.
"""
if self.autoencoder is not None:
del self.autoencoder
self.autoencoder = None
self.trained = False
self.autoencoder_latent_space = None
self.autoencoder_latent_space_computed = False
self.losses = None
self.z_dim = None
print("Model purged successfully.\n")
else:
print("Model not present.\n")
def save_model(self, path):
"""
Save the trained autoencoder model.
Args:
path: (str) Path where to save the model.
"""
if self.trained is False:
raise ValueError("Autoencoder not trained yet, cannot save a model not trained. Call train_autoencoder() first.")
torch.save(self.autoencoder.state_dict(), path)
print("Model saved successfully.")
def build_autoencoder_latent_space(self):
"""
Build the latent space using the trained autoencoder on the matrix.
This function iterates over the matrix; for each document, it applies the autoencoder to the document vector and takes the output of the latent layer as the latent representation of the document.
"""
if self.autoencoder is None:
raise ValueError("Autoencoder not built yet. Call build_autoencoder() first.")
if not self.trained:
raise ValueError("Autoencoder not trained yet. Call train_autoencoder() first.")
print("Building latent space using autoencoder...\n")
print(f"Current matrix in use is: {self.matrix_type}.\n")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.autoencoder.to(DEVICE)
self.autoencoder.eval()
# for each document in the corpus, project it into the latent space using the trained autoencoder
torchmat = torch.tensor(self.matrix.toarray()).float()
latent_space = np.zeros((self.len_corpus, self.z_dim))
for i, vec in enumerate(torchmat):
print("Progress: {:.2f}%".format((i+1)/self.len_corpus*100), end="\r")
vec = vec.to(DEVICE)
_, z = self.autoencoder(vec)
latent_space[i] = z.detach().cpu().numpy()
self.autoencoder_latent_space = latent_space
self.autoencoder_latent_space_computed = True
print("Done.\n")
# Queries similarity in Autoencoder space
def process_query_autoencoder(self, query=None):
"""
Iterate over the queries; for each query, apply the autoencoder to the query vector and take the output of the latent layer as the latent representation of the query.
Then compute the similarity between the queries and the corpus in the Autoencoder latent space.
Args:
query: single string, or None; if None, the similarity between all the queries and the corpus will be computed
Returns:
result: output matrix representing the similarity between the query (resp. all the queries) and the corpus in the Autoencoder latent space
"""
if self.trained is False:
raise ValueError("Autoencoder not trained yet. Call train_autoencoder() first, then use the model to project the queries in the latent space.")
if not self.autoencoder_latent_space_computed:
raise ValueError("Autoencoder latent space not built yet. Call build_autoencoder_latent_space() first.")
print(f"Computing query-corpus similarity using AutoEncoder latent space...\n"
f"Embedding type: {self.matrix_type}\n"
f"Latent AutoEncoder dimension: {self.z_dim}\n")
if self.matrix_type == "tfidf":
if isinstance(query, str):
queries_transformed = self.tfidf_vectorizer.transform([query])
elif query is None:
queries_transformed = self.tfidf_vectorizer.transform(self.queries)
else:
raise ValueError("Query must be either a string or None (in this case the similarity with all the available queries will be computed).")
elif self.matrix_type == "wc":
if isinstance(query, str):
queries_transformed = self.wc_vectorizer.transform([query])
elif query is None:
queries_transformed = self.wc_vectorizer.transform(self.queries)
else:
raise ValueError("Query must be either a string or None (in this case the similarity with all the available queries will be computed).")
else:
raise ValueError("Matrix type not declared. Call select_matrix_type() first.")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.autoencoder.to(DEVICE)
queries_transformed = queries_transformed.toarray()
queries_reduced = np.zeros((queries_transformed.shape[0], self.z_dim))
self.autoencoder.eval()
# for each query, project it into the latent space
for i, vec in enumerate(queries_transformed):
vec = torch.tensor(vec).float().to(DEVICE)
_, z = self.autoencoder(vec)
queries_reduced[i] = z.detach().cpu().numpy()
similarity_matrix_autoencoder = cosine_similarity(queries_reduced, self.autoencoder_latent_space)
if query is None:
self.similarity_matrix_autoencoder = similarity_matrix_autoencoder
print("Done.\n")
return similarity_matrix_autoencoder
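# A minimal autoencoder retrieval sketch (hedged: continues the illustrative pipeline above):
#   ir.build_autoencoder_latent_space()
#   sims_ae = ir.process_query_autoencoder()  # (n_queries, n_docs) cosine similarities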
############## Variational Autoencoder latent space ##############
def build_variational_autoencoder(self, input_dim, h_dim_1, h_dim_2, z_dim):
"""
Call to the VariationalAutoEncoder class to build the variational autoencoder model.
Args:
input_dim: (int) Number of neurons in the input layer.
h_dim_1: (int) Number of neurons in the first hidden layer.
h_dim_2: (int) Number of neurons in the second hidden layer.
z_dim: (int) Number of neurons in the latent layer.
"""
print("Declaring variational autoencoder architecture...\n")
self.vautoencoder = VariationalAutoEncoder(input_dim=input_dim, h_dim_1=h_dim_1, h_dim_2=h_dim_2, z_dim=z_dim)
self.vz_dim = z_dim # latent space dimension for VAE
print("Done.\n")
def train_variational_autoencoder(self, BATCH_SIZE, NUM_EPOCHS, loss_fn, lr, scheduler = None):
"""
Call to the build_train_loader() function to build the train_loader.
Call to the train() function to train the variational autoencoder model.
Args:
BATCH_SIZE: (int) Batch size for the train_loader.
NUM_EPOCHS: (int) Number of training epochs.
loss_fn: (torch.nn.modules.loss) Loss function to use.
lr: (float) Learning rate.
scheduler: (torch.optim.lr_scheduler) Learning rate scheduler.
"""
if self.matrix is None:
raise ValueError("Matrix not set yet. Call select_matrix_type() first.")
if self.vautoencoder is None:
raise ValueError("Variational Autoencoder not built yet. Call build_variational_autoencoder() first.")
print(f"Current matrix in use is: {self.matrix_type}.\n")
print("Training variational autoencoder ...\n")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_loader = build_train_loader(self.matrix.toarray(), BATCH_SIZE)
self.vlosses = train_vae(NUM_EPOCHS=NUM_EPOCHS, train_loader=train_loader, model=self.vautoencoder, loss_fn=loss_fn, lr=lr, DEVICE=DEVICE, INPUT_DIM=self.matrix.shape[1], scheduler=scheduler)
self.vtrained = True
print("Done.\n")
def variational_autoencoder_purge(self):
"""
Delete the variational autoencoder model.
"""
if self.vautoencoder is not None:
del self.vautoencoder
self.vautoencoder = None
self.vtrained = False
self.vautoencoder_latent_space = None
self.vautoencoder_latent_space_computed = False
self.vlosses = None
self.vz_dim = None
print("Model purged successfully.\n")
else:
print("Model not present.\n")
def build_variational_autoencoder_latent_space(self):
"""
Build the latent space using the trained variational autoencoder on the matrix.
This function iterates over the matrix; for each document, it applies the variational autoencoder to the document vector and takes the output of the latent layer as the latent representation of the document.
Note: the latent space is built by taking the mean of the latent representation of the document over 10 iterations.
"""
if self.vautoencoder is None:
raise ValueError("Variational Autoencoder not built yet. Call build_variational_autoencoder() first.")
if not self.vtrained:
raise ValueError("Variational Autoencoder not trained yet. Call train_variational_autoencoder() first.")
print("Building latent space using variational autoencoder...\n")
print(f"Current matrix in use is: {self.matrix_type}.\n")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.vautoencoder.to(DEVICE)
self.vautoencoder.eval()
torchmat = torch.tensor(self.matrix.toarray()).float()
latent_space = np.zeros((self.len_corpus, self.vz_dim))
for i, vec in enumerate(torchmat):
print("Progress: {:.2f}%".format((i+1)/self.len_corpus*100), end="\r")
vec = vec.to(DEVICE)
# average the stochastic latent samples over 10 forward passes
for _ in range(10):
_, _, _, z = self.vautoencoder(vec)
latent_space[i] += z.detach().cpu().numpy()
latent_space[i] /= 10
self.vautoencoder_latent_space = latent_space
self.vautoencoder_latent_space_computed = True
print("Done.\n")
# Queries similarity in Variational Autoencoder space
def process_query_vautoencoder(self, query=None):
"""
Iterate over the queries; for each query, apply the variational autoencoder to the query vector and take the output of the latent layer as the latent representation of the query.
Then compute the similarity between the queries and the corpus in the Variational Autoencoder latent space.
Args:
query: single string, or None; if None, the similarity between all the queries and the corpus will be computed
Returns:
result: output matrix representing the similarity between the query (resp. all the queries) and the corpus in the Variational Autoencoder latent space
"""
if self.vtrained is False:
raise ValueError("Variational Autoencoder not trained yet. Call train_variational_autoencoder() first, then use the model to project the queries in the latent space.")
if not self.vautoencoder_latent_space_computed:
raise ValueError("Variational Autoencoder latent space not built yet. Call build_variational_autoencoder_latent_space() first.")
print(f"Computing query-corpus similarity using Variational Autoencoder latent space...\n"
f"Embedding type: {self.matrix_type}\n"
f"Latent Variational AutoEncoder dimension: {self.vz_dim}\n")
if self.matrix_type == "tfidf":
if isinstance(query, str):
queries_transformed = self.tfidf_vectorizer.transform([query])
elif query is None:
queries_transformed = self.tfidf_vectorizer.transform(self.queries)
else:
raise ValueError("Query must be either a string or None (in this case the similarity with all the available queries will be computed).")
elif self.matrix_type == "wc":
if isinstance(query, str):
queries_transformed = self.wc_vectorizer.transform([query])
elif query is None:
queries_transformed = self.wc_vectorizer.transform(self.queries)
else:
raise ValueError("Query must be either a string or None (in this case the similarity with all the available queries will be computed).")
else:
raise ValueError("Matrix type not declared. Call select_matrix_type() first.")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.vautoencoder.to(DEVICE)
queries_transformed = queries_transformed.toarray()
queries_reduced = np.zeros((queries_transformed.shape[0], self.vz_dim))
self.vautoencoder.eval()
# for each query, project it into the latent space (single stochastic sample)
for i, vec in enumerate(queries_transformed):
vec = torch.tensor(vec).float().to(DEVICE)
_, _, _, z = self.vautoencoder(vec)
queries_reduced[i] = z.detach().cpu().numpy()
similarity_matrix_vautoencoder = cosine_similarity(queries_reduced, self.vautoencoder_latent_space)
if query is None:
self.similarity_matrix_vautoencoder = similarity_matrix_vautoencoder
print("Done.\n")
return similarity_matrix_vautoencoder
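# A minimal VAE retrieval sketch (hedged: mirrors the autoencoder pipeline; layer sizes
# and hyperparameters are illustrative, and the reconstruction loss may differ):
#   ir.build_variational_autoencoder(input_dim=ir.matrix.shape[1], h_dim_1=512, h_dim_2=256, z_dim=100)
#   ir.train_variational_autoencoder(BATCH_SIZE=64, NUM_EPOCHS=20, loss_fn=torch.nn.MSELoss(), lr=1e-3)
#   ir.build_variational_autoencoder_latent_space()
#   sims_vae = ir.process_query_vautoencoder()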
############## Retrieve docs given a similarity matrix ##############
def retrieve_documents(self, similarity_matrix, n_docs=15, verbose = True):
"""
Retrieve the top n_docs documents for each query.
Args:
similarity_matrix: output matrix representing the similarity between all the queries (or the query) and the corpus in the latent space
n_docs: (int) Number of documents to retrieve for each query.
verbose: (bool) If True, print the precision and recall for each query.
Returns:
result: list of dictionaries, each dictionary contains the top n_docs documents for a query
precision: list of precision values, one per query (available only if len(similarity_matrix) > 1, i.e. the similarity between all the queries and the corpus has been computed)
recall: list of recall values, one per query (available only if len(similarity_matrix) > 1, i.e. the similarity between all the queries and the corpus has been computed)
Note: if the query is a string, only the top n_docs documents for that query will be retrieved. Since the query does not belong to the .QRY file, relevances are not available and precision and recall will not be computed.
"""
result =[]
for i in range(len(similarity_matrix)):
tmp = {}
# tmp = {doc_id: similarity_score of doc_id with query i}
for j in similarity_matrix[i].argsort()[-n_docs:][::-1]:
tmp[j+1] = similarity_matrix[i, j] # j+1 because doc_id starts from 1 in relevances
result.append(tmp)
if len(similarity_matrix)>1:
precision, recall = compute_pre_rec(result, self.relevances)
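# compute_pre_rec (src.utils) is assumed to use the standard per-query definitions:
#   precision_i = |retrieved_i ∩ relevant_i| / |retrieved_i|
#   recall_i    = |retrieved_i ∩ relevant_i| / |relevant_i|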
count = 0
for i in range(len(precision)):
if recall[i] != 0:
if verbose:
print(f"Query {i}: P: {precision[i]} R: {recall[i]}\n")
count += 1
if verbose:
print(f"Query at least with positive precision: {count} out of {len(precision)}")
return result, precision, recall
else:
print(f"Retrived docs for the given query:\n",
"!WARNING! doc_id starts from 1\n")
for doc_id, score in result[0].items(): # doc_id starts from 1
print(f"Doc id: {doc_id} Score: {score}\n")
return result, None, None
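# A minimal end-to-end sketch (hedged: load_corpus and every value below are hypothetical;
# the project's real entry point may differ):
#
# if __name__ == "__main__":
#     corpus, splitted_corpus, queries, splitted_queries, relevances = load_corpus("MED.ALL", "MED.QRY", "MED.REL")
#     ir = IR("demo", corpus, splitted_corpus, queries, splitted_queries, relevances)
#     ir.build_tfidf_matrix()
#     ir.select_matrix_type('tfidf')
#     ir.build_svd_latent_space(n_components=100)
#     sims = ir.process_query_svd()
#     result, precision, recall = ir.retrieve_documents(sims, n_docs=15, verbose=False)
#     ir.complete_queries_info(write_down=True)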