import numpy as np
# This script explores the generated data alongside the existing data to see how much similarity they share.
# It prints some stats and qualitative results.
new_data_x_file = "../datasets/Gen_data_Z_dim-200_mb_size-100_h_dim-600_preproc-1_beta-1.01_final_ly-Sigmoid_loss-BCELoss/new_x.npy"
new_data_y_file = "../datasets/Gen_data_Z_dim-200_mb_size-100_h_dim-600_preproc-1_beta-1.01_final_ly-Sigmoid_loss-BCELoss/new_y.npy"
actual_data_x_file = "../datasets/Eurlex/eurlex_docs/x_tr.npy"
actual_data_y_file = "../datasets/Eurlex/eurlex_docs/y_tr.npy"
indx2word_file = "../datasets/Eurlex/eurlex_docs/feature_names.txt"
indx2label = "../datasets/Eurlex/eurlex_docs/label_set.txt"
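# Number of generated data points to inspect qualitatively.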
K = 10
# ----------------------------------------------------------------------------
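# Load the generated ("new") and original ("actual") feature/label matrices.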
new_data_x = np.load(new_data_x_file)
new_data_y = np.load(new_data_y_file)
actual_data_x = np.load(actual_data_x_file)
actual_data_y = np.load(actual_data_y_file)
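# Build index -> name lookup tables; each line of the two files is of the form "index:name".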
with open(indx2label, 'r') as f:
    labels = [line.split(":")[1] for line in f.read().splitlines()]
with open(indx2word_file, 'r') as f:
    words = [line.split(":")[1] for line in f.read().splitlines()]
print("Shapes: new_x: {}; new_y: {}; original_x: {}; original_y: {};".format(new_data_x.shape, \
new_data_y.shape, actual_data_x.shape, actual_data_y.shape))
print("Num Words: {}; Num Labels: {};".format(len(labels), len(words)))
for data_pt_num in range(K):
    data_pt_labels = np.argwhere(new_data_y[data_pt_num] == 1)
    label_names = []
    for label in data_pt_labels.tolist():
        label_names.append(labels[label[0]])
    print("Labels in the data point : {}".format(label_names))
    data_pt_words = np.argsort(new_data_x[data_pt_num])[-10:]
    word_names = []
    for word in data_pt_words.tolist():
        word_names.append(words[word])
    print("Top 10 words in the data point : {}".format(word_names))
    # Find the original data point whose label set overlaps most with this generated
    # point (distance = negative overlap, so smaller means a better match).
    indx = -1
    closest = 1e10
    for i in range(len(actual_data_y)):
        dist = -len(np.intersect1d(np.argwhere(actual_data_y[i] == 1),
                                   np.argwhere(new_data_y[data_pt_num] == 1)))
        if dist < closest:
            closest = dist
            indx = i
    print("Best label overlap: {} (original data point {})".format(-closest, indx))
    data_pt_labels = np.argwhere(actual_data_y[indx] == 1)
    label_names = []
    for label in data_pt_labels.tolist():
        label_names.append(labels[label[0]])
    print("Closest Label Set in the original data set has labels: {}".format(label_names))
    data_pt_words = np.argsort(actual_data_x[indx])[-10:]
    word_names = []
    for word in data_pt_words.tolist():
        word_names.append(words[word])
    print("Top 10 words in the data point with the above label: {}".format(word_names))
    print("=" * 50)