# -*- coding: utf-8 -*-
"""Racism Detection - angela.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Q9tfCjBCSK45JQ5xi1IX2p2nkAf1XDxB
"""
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from itertools import chain
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import logging
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SpatialDropout1D, LSTM, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
"""#Read Dataset"""
df = pd.read_csv('dataset_racism.csv', sep=',')
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
print(df)
"""#Data Preprocessing"""
def preprocessing(text):
    # lowercase the string
    text = text.lower()
    # remove RT markers, mentions and links
    text = re.sub(r'rt |@[a-z]*|http([a-z]|[0-9]|/|:|\.)*|pic\.twitter\.com/([a-z]|[0-9])*', '', text)
    # remove punctuation and emoticons
    text = re.sub('[^a-z0-9]+', ' ', text)
    # remove extra white spaces
    text = ' '.join(text.split())
    # tokenization (kept from the original notebook, currently disabled)
    # text = text.split()
    # if text == []:
    #     return float('NaN')
    return text
df['preprocessed'] = df.tweets.apply(preprocessing)
df.preprocessed = df.preprocessed.apply(str)
print(df.head())
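# Quick sanity check of the cleaning step on a made-up tweet (illustrative
# text only, not taken from the dataset): the RT marker, mention, link and
# punctuation should all be stripped and the text lowercased.
sample_tweet = 'RT @user: Check this out http://t.co/abc123 #example!!'
print(preprocessing(sample_tweet))  # expected: 'check this out example'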
"""#Data Analysis"""
df_R = df[df.label == 'R']
df_NonR = df[df.label == 'Non_R']
label_count = df['label'].value_counts()
print('Class Non-Racist :', label_count['Non_R'])
print('Class Racist :', label_count['R'])
label_count.plot(kind='bar', title='Count (label)', rot=0)
plt.show()
# inspect one racist-labelled tweet after preprocessing
print(df['preprocessed'][df.label == 'R'].iloc[0])
def generateWordCloud(df_tweets):
    # split texts by whitespace and turn them into a list of token lists
    tweets = df_tweets.str.split(" ").tolist()
    # flatten the 2d list to a 1d list
    tweets = list(chain.from_iterable(tweets))
    # count the 20 most common words
    common_words = dict(Counter(tweets).most_common(20))
    # print(common_words)
    # build the wordcloud from the word frequencies
    wordcloud = WordCloud(background_color="white", width=1500, height=1500,
                          relative_scaling=0.5, min_font_size=10).generate_from_frequencies(common_words)
    return wordcloud
# generate and plot the wordcloud for racist tweets
wc_R = generateWordCloud(df[df['label'] == 'R']['preprocessed'])
plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wc_R)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
# generate and plot the wordcloud for non-racist tweets
wc_NonR = generateWordCloud(df[df['label'] == 'Non_R']['preprocessed'])
plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wc_NonR)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
"""# LSTM"""
from tensorflow.keras.preprocessing.text import Tokenizer
# The maximum number of words to keep in the vocabulary (most frequent).
MAX_NB_WORDS = 50000
# Max number of tokens in each tweet after padding/truncation.
MAX_SEQUENCE_LENGTH = 250
# Dimensionality of the embedding vectors.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['preprocessed'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# get key from word index dictionary
def get_key(val):
    for key, value in word_index.items():
        if val == value:
            return key
    return "key doesn't exist"
print(get_key(922))
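# Note: the Keras Tokenizer also exposes the reverse mapping directly, so the
# same lookup can be done without the helper above (assuming the index exists
# in the fitted vocabulary).
print(tokenizer.index_word.get(922, "key doesn't exist"))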
X = tokenizer.texts_to_sequences(df['preprocessed'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)
Y = pd.get_dummies(df['label']).values
print('Shape of label tensor:', Y.shape)
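# Record the one-hot column order so predictions can later be mapped back to
# label names (pd.get_dummies sorts the class labels alphabetically).
print(pd.get_dummies(df['label']).columns.tolist())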
# Split dataset
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
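# Optional: print the layer stack and parameter counts before training.
model.summary()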
epochs = 8
batch_size = 64
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
accr = model.evaluate(X_test, Y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]))
plt.title('Model train vs validation loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()
plt.title('Model train vs validation accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend()
plt.show()
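# A minimal inference sketch on unseen text (the sample tweet is illustrative
# only; the label order assumes the alphabetical column order produced by
# pd.get_dummies above, i.e. column 0 = 'Non_R' and column 1 = 'R').
new_tweets = ['example tweet text to classify']
seqs = tokenizer.texts_to_sequences([preprocessing(t) for t in new_tweets])
padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['Non_R', 'R']
print(pred, '->', labels[np.argmax(pred, axis=1)[0]])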