|
2 | 2 |
|
3 | 3 | from data_analysis import word_cloud_generator
|
4 | 4 | from data_analysis.ngram_plotter import NgramPlotter
|
5 |
| -from data_model.south_park_data import SouthParkData |
| 5 | +from data_model.toxic_comment_data import ToxicCommentData |
6 | 6 | from model.tokenizer import NltkTokenizer
|
7 | 7 |
|
if __name__ == '__main__':
    # Exploratory analysis of the toxic-comment data set: for every class,
    # write a word cloud and a bigram histogram, then save and show a
    # horizontal bar chart of the number of samples per class.

    # Imported at the top of the entry point (rather than between the loop
    # and the plotting code) so a missing matplotlib fails before any of
    # the per-class output files are generated.
    import matplotlib.pyplot as plt

    dataset = ToxicCommentData()
    X = dataset.get_data()
    Y = dataset.get_label()
    label_names = dataset.get_label_names()

    classes = len(np.unique(Y))
    print(X.shape, Y.shape, classes)

    # NOTE(review): the diff view this script was taken from does not show
    # new-file lines 17-18 at this point. The loop below flattens every
    # entry of X as an iterable of tokens, and NltkTokenizer is imported but
    # unused in the visible code -- presumably the tokenization step lives
    # here. Confirm against the full file before relying on this block.

    ngram_plotter = NgramPlotter(ngram_size=2)

    data_set_length = {}

    # Assumes labels are the integers 0..classes-1 and index label_names
    # directly -- TODO confirm against ToxicCommentData.
    for i in range(classes):
        name = label_names[i]
        # Evaluate the boolean mask once per class instead of three times.
        samples = X[Y == i]
        sample_count = len(samples)

        print(f'Label {name}: {sample_count}')
        data_set_length[name] = sample_count

        all_words = [text for subtext in samples for text in subtext]
        word_cloud_generator.generate(' '.join(all_words), f'word_cloud_{i}_{name}.pdf')
        ngram_plotter.plot_histogram(all_words, f'ngram_{i}_{name}.pdf')

    # Order the classes by ascending sample count for the bar chart.
    data_set_length = dict(sorted(data_set_length.items(), key=lambda item: item[1]))
    names = list(data_set_length)
    counts = list(data_set_length.values())
    positions = range(len(counts))

    plt.figure(figsize=(5, 4))
    plt.grid(axis='x', linestyle='--')
    plt.barh(positions, counts, align='center')
    plt.yticks(positions, names)

    # Place each value label just past the end of its bar. The gap scales
    # with the largest bar so it stays proportionate whether classes hold
    # dozens or hundreds of thousands of samples (a fixed offset in data
    # units only looks right for one particular data-set size).
    label_offset = 0.01 * max(counts, default=0)
    for position, count in zip(positions, counts):
        plt.text(count + label_offset, position, str(count), color='blue', fontweight='bold')

    plt.savefig('data_set_length.pdf', bbox_inches='tight')
    plt.show()
| 41 | + |
| 42 | + |
0 commit comments