Skip to content

Commit ea008f8

Browse files
committed
fixes
1 parent 5d38d07 commit ea008f8

File tree

2 files changed

+22
-26
lines changed

2 files changed

+22
-26
lines changed

research/classification_research.ipynb

+1-18
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@
3838
}
3939
],
4040
"source": [
41+
"# Source: https://github.com/amandacurry/convabuse\n",
4142
"full_df = pd.read_csv(config.input_file('ConvAbuseEMNLPfull.csv'))\n",
4243
"full_df.describe()"
4344
],
@@ -68,24 +69,6 @@
6869
"collapsed": false
6970
}
7071
},
71-
{
72-
"cell_type": "code",
73-
"execution_count": 54,
74-
"outputs": [],
75-
"source": [],
76-
"metadata": {
77-
"collapsed": false
78-
}
79-
},
80-
{
81-
"cell_type": "code",
82-
"execution_count": 54,
83-
"outputs": [],
84-
"source": [],
85-
"metadata": {
86-
"collapsed": false
87-
}
88-
},
8972
{
9073
"cell_type": "code",
9174
"execution_count": 55,

research/data_analysis_runner.py

+21-8
Original file line number | Diff line number | Diff line change
@@ -2,18 +2,15 @@
22

33
from data_analysis import word_cloud_generator
44
from data_analysis.ngram_plotter import NgramPlotter
5-
from data_model.south_park_data import SouthParkData
5+
from data_model.toxic_comment_data import ToxicCommentData
66
from model.tokenizer import NltkTokenizer
77

88
if __name__ == '__main__':
9-
dataset = SouthParkData()
10-
data = dataset.get_data()
11-
labels = dataset.get_label()
9+
dataset = ToxicCommentData()
10+
X = dataset.get_data()
11+
Y = dataset.get_label()
1212
label_names = dataset.get_label_names()
1313

14-
X = data.to_numpy()
15-
Y = labels.to_numpy()
16-
1714
classes = len(np.unique(Y))
1815
print(X.shape, Y.shape, classes)
1916

@@ -22,8 +19,24 @@
2219

2320
ngram_plotter = NgramPlotter(ngram_size=2)
2421

22+
data_set_length = {}
23+
2524
for i in range(classes):
26-
print(f'Label {i}: {len(X[Y == i])}')
25+
print(f'Label {label_names[i]}: {len(X[Y == i])}')
26+
data_set_length[label_names[i]] = len(X[Y == i])
2727
all_words = [text for subtext in X[Y == i] for text in subtext]
2828
word_cloud_generator.generate(' '.join(all_words), f'word_cloud_{i}_{label_names[i]}.pdf')
2929
ngram_plotter.plot_histogram(all_words, f'ngram_{i}_{label_names[i]}.pdf')
30+
31+
data_set_length = {k: v for k, v in sorted(data_set_length.items(), key=lambda item: item[1])}
32+
import matplotlib.pyplot as plt
33+
plt.figure(figsize=(5, 4))
34+
plt.grid(axis='x', linestyle='--')
35+
plt.barh(range(len(data_set_length)), list(data_set_length.values()), align='center')
36+
plt.yticks(range(len(data_set_length)), list(data_set_length.keys()))
37+
for i, v in enumerate(data_set_length.values()):
38+
plt.text(v + 300, i, str(v), color='blue', fontweight='bold')
39+
plt.savefig('data_set_length.pdf', bbox_inches='tight')
40+
plt.show()
41+
42+

0 commit comments

Comments (0)