# -*- coding: utf-8 -*-
"""Racism Detection - angela.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Q9tfCjBCSK45JQ5xi1IX2p2nkAf1XDxB
"""
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from itertools import chain
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import logging
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SpatialDropout1D, LSTM, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
"""#Read Dataset"""
df = pd.read_csv('dataset_racism.csv', sep=',')
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
print(df)
"""#Data Preprocessing"""
def preprocessing(text):
    # lowercase the string
    text = text.lower()
    # remove RT markers, mentions and links
    text = re.sub(r'rt |@[a-z]*|http([a-z]|[0-9]|/|:|\.)*|pic\.twitter\.com/([a-z]|[0-9])*', '', text)
    # remove punctuation and emoticons
    text = re.sub('[^a-z0-9]+', ' ', text)
    # remove extra white spaces
    text = ' '.join(text.split())
    # tokenization (kept from the original notebook, currently disabled)
    # text = text.split()
    # if text == []:
    #     return float('NaN')
    return text
df['preprocessed'] = df.tweets.apply(preprocessing)
df.preprocessed = df.preprocessed.apply(str)
print(df.head())
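# Quick sanity check of the cleaning step on a made-up tweet (illustrative
# text only, not taken from the dataset): the RT marker, mention, link and
# punctuation should all be stripped and the text lowercased.
sample_tweet = 'RT @user: Check this out http://t.co/abc123 #example!!'
print(preprocessing(sample_tweet))  # expected: 'check this out example'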
"""#Data Analysis"""
df_R = df[df.label == 'R']
df_NonR = df[df.label == 'Non_R']
label_count = df['label'].value_counts()
print('Class Non-Racist :', label_count['Non_R'])
print('Class Racist :', label_count['R'])
label_count.plot(kind='bar', title='Count (label)', rot=0)
plt.show()
# inspect one racist-labelled tweet after preprocessing
print(df['preprocessed'][df.label == 'R'].iloc[0])
def generateWordCloud(df_tweets):
    # split texts by whitespace and turn them into a list of token lists
    tweets = df_tweets.str.split(" ").tolist()
    # flatten the 2d list to a 1d list
    tweets = list(chain.from_iterable(tweets))
    # count the 20 most common words
    common_words = dict(Counter(tweets).most_common(20))
    # print(common_words)
    # build the wordcloud from the word frequencies
    wordcloud = WordCloud(background_color="white", width=1500, height=1500,
                          relative_scaling=0.5, min_font_size=10).generate_from_frequencies(common_words)
    return wordcloud
# generate and plot the wordcloud for racist tweets
wc_R = generateWordCloud(df[df['label'] == 'R']['preprocessed'])
plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wc_R)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
# generate and plot the wordcloud for non-racist tweets
wc_NonR = generateWordCloud(df[df['label'] == 'Non_R']['preprocessed'])
plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wc_NonR)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
"""# LSTM"""
from tensorflow.keras.preprocessing.text import Tokenizer
# The maximum number of words to keep in the vocabulary (most frequent).
MAX_NB_WORDS = 50000
# Max number of tokens in each tweet after padding/truncation.
MAX_SEQUENCE_LENGTH = 250
# Dimensionality of the embedding vectors.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['preprocessed'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# get key from word index dictionary
def get_key(val):
    for key, value in word_index.items():
        if val == value:
            return key
    return "key doesn't exist"
print(get_key(922))
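# Note: the Keras Tokenizer also exposes the reverse mapping directly, so the
# same lookup can be done without the helper above (assuming the index exists
# in the fitted vocabulary).
print(tokenizer.index_word.get(922, "key doesn't exist"))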
X = tokenizer.texts_to_sequences(df['preprocessed'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)
Y = pd.get_dummies(df['label']).values
print('Shape of label tensor:', Y.shape)
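# Record the one-hot column order so predictions can later be mapped back to
# label names (pd.get_dummies sorts the class labels alphabetically).
print(pd.get_dummies(df['label']).columns.tolist())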
# Split dataset
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
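# Optional: print the layer stack and parameter counts before training.
model.summary()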
epochs = 8
batch_size = 64
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
accr = model.evaluate(X_test, Y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]))
plt.title('Model train vs validation loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()
plt.title('Model train vs validation accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend()
plt.show()
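# A minimal inference sketch on unseen text (the sample tweet is illustrative
# only; the label order assumes the alphabetical column order produced by
# pd.get_dummies above, i.e. column 0 = 'Non_R' and column 1 = 'R').
new_tweets = ['example tweet text to classify']
seqs = tokenizer.texts_to_sequences([preprocessing(t) for t in new_tweets])
padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['Non_R', 'R']
print(pred, '->', labels[np.argmax(pred, axis=1)[0]])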