Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Michael-feb18 #9

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 122 additions & 46 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,41 @@
# COMP 472
# WINTER 2021

# Michael Arabian - 40095854
# Thomas Le - 40096120
# Andre Saad - 40076579
# Michael Arabian - 40095854
# Thomas Le - 40096120
# Andre Saad - 40076579


# Sklearn Imports
from sklearn.feature_extraction.text import *
from sklearn.datasets import load_files
from sklearn import *
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import *

from collections import Counter
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy


# ---------------- Task 0 --------------------- #
# -------------------------------------- Task 0 --------------------------------------- #
#
# You first need to remove the document identifier, and also the topic label, which you don't need.
# Then, split the data into a training and an evaluation part. For instance, we may use 80% for training and the
# remainder for evaluation.
#


def read_documents(docName):
    """
    Read the sentiment dataset file and separate documents from labels.

    :param docName: name of the input file (opened as UTF-8)
    :return: presumably the pair (docs, label) -- the caller unpacks two
             values; body continues past this view, TODO confirm
    """

    docs = []   # one entry per document
    label = []  # sentiment label for the document at the same index
    with open(docName, encoding="utf8") as f:
Expand All @@ -31,42 +47,61 @@ def read_documents(docName):


all_docs, all_labels = read_documents('all_sentiment_shuffled.txt')
split_point = int(0.80 * len(all_docs))
train_docs = all_docs[:split_point]
train_labels = all_labels[:split_point]
eval_docs = all_docs[split_point:]
eval_labels = all_labels[split_point:]

# ---------------- Task 1 --------------------- #

frequency = Counter()
split_point = int(0.80 * len(all_docs)) # Uses 80% of data set for training.

# Distribution for words
# Takes too long to load and plot

# Distribution for positive/negative
train_docs = all_docs[:split_point] # Training Documents
train_labels = all_labels[:split_point] # Training Labels

eval_docs = all_docs[split_point:] # Testing Documents
eval_labels = all_labels[split_point:] # Testing Labels



# ------------------------------ Task 1 ------------------------------- #
#
# Plot the distribution of the number of the instances in each class.
#
#

frequency = Counter()  # Counter Initialization for frequency

# Count how many documents carry each class label (iterates the labels,
# one increment per document)
for doc in all_labels:
    frequency[doc] += 1

# Bar chart: one bar per class label, bar height = number of instances
plt.bar(frequency.keys(), frequency.values())
plt.title("Distribution Plot")
plt.xlabel("Label")
plt.ylabel("Frequency")
plt.show()

# ---------------- Task 2 --------------------- #

# Naive Bayes
gnb = MultinomialNB()
# ------------------------------ Task 2 -------------------------------- #
# Run 3 different ML models.


# a)
# Naive Bayes

gnb = MultinomialNB()  # Use of Multinomial Naive Bayes Classifier

# Bag-of-words features: accents folded to ASCII; tokens are word-like
# strings containing at least one letter ((?ui) = unicode, ignore-case);
# unigrams only (ngram_range=(1,1)); English stop words removed.
cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',ngram_range = (1,1), stop_words='english')
train_docs_Vec = cv.fit_transform(train_docs)  # learn vocabulary from training docs
gnb.fit(train_docs_Vec, train_labels)

eval_docs_Vec = cv.transform(eval_docs)  # reuse the same vocabulary for evaluation
predictedGnb = gnb.predict(eval_docs_Vec)


# Task 3 :
# Part a)
# Writing the row number of the instance, followed by a comma, followed by the index of the predicted class of that instance to a file

row = len(train_docs) + 1  # first evaluation row, numbered after the training rows

f = open("NaiveBayes-all_sentiment_shuffled.txt", "w")
for result in predictedGnb:
index = 1
Expand All @@ -75,11 +110,16 @@ def read_documents(docName):
f.write(str(row) + ", " + str(index) + "\n")
row += 1

# part 3c) d)
precisionGnb = metrics.precision_score(eval_labels, predictedGnb, average = None)
recallGnb = metrics.recall_score(eval_labels, predictedGnb, average = None)
f1ScoreGnb = metrics.f1_score(eval_labels, predictedGnb, average=None)
accuracyGnb = metrics.accuracy_score(eval_labels, predictedGnb)

# Task 3
# Part c) & Part d)

precisionGnb = metrics.precision_score(eval_labels, predictedGnb, average = None) # per-class precision (average=None -> one score per label)
recallGnb = metrics.recall_score(eval_labels, predictedGnb, average = None) # per-class recall
f1ScoreGnb = metrics.f1_score(eval_labels, predictedGnb, average=None) # per-class F1-measure
accuracyGnb = metrics.accuracy_score(eval_labels, predictedGnb) # overall accuracy in [0, 1]


print('Naives Bayes Precision: ' + str(precisionGnb))
print('Naives Bayes Recall: ' + str(recallGnb))
print('Naives Bayes f1_score: ' + str(f1ScoreGnb))
Expand All @@ -89,20 +129,28 @@ def read_documents(docName):
'\nNaives Bayes f1_score: ' + str(f1ScoreGnb) +
'\nNaives Bayes Accuracy: ' + str(accuracyGnb*100))

cmGnb = numpy.array2string(metrics.confusion_matrix(eval_labels, predictedGnb))

cmGnb = numpy.array2string(metrics.confusion_matrix(eval_labels, predictedGnb)) # Generating Confusion Matrix
print(cmGnb)
f.write('\n Confusion Matrix: \n' + cmGnb )

f.close()
f.write('\n Confusion Matrix: \n' + cmGnb ) # Writing to generated file.
f.close() # Closing the file.



# Decision tree -------------------------- #
# b )
# Decision tree

decisionTree = tree.DecisionTreeClassifier(criterion= 'entropy')
decisionTree = tree.DecisionTreeClassifier(criterion= 'entropy') # Use of Decision Tree Classifier with Criterion set to 'entropy'
decisionTree.fit(train_docs_Vec, train_labels)
predictedDt = decisionTree.predict(eval_docs_Vec)

# part 3a)

# Task 3
# Part a)

row = len(train_docs) + 1

f = open("DecisionTree-all_sentiment_shuffled.txt", "w")
for result in predictedDt:
index = 1
Expand All @@ -111,11 +159,15 @@ def read_documents(docName):
f.write(str(row) + ", " + str(index) + "\n")
row += 1

# part 3c) d)
precisionDt = metrics.precision_score(eval_labels, predictedDt, average=None)
recallDt = metrics.recall_score(eval_labels, predictedDt, average=None)
f1ScoreDt = metrics.f1_score(eval_labels, predictedDt, average=None)
accuracyDt = metrics.accuracy_score(predictedDt, eval_labels)

# Task 3
# Part c) & Part d)

precisionDt = metrics.precision_score(eval_labels, predictedDt, average=None) # per-class precision
recallDt = metrics.recall_score(eval_labels, predictedDt, average=None) # per-class recall
f1ScoreDt = metrics.f1_score(eval_labels, predictedDt, average=None) # per-class F1-measure
# NOTE(review): arguments below are (y_pred, y_true) -- reversed relative to
# every other metrics call in this file; accuracy is symmetric so the result
# is the same, but this should be made consistent.
accuracyDt = metrics.accuracy_score(predictedDt, eval_labels) # overall accuracy

print('Decision Tree Precision: ' + str(precisionDt))
print('Decision Tree Recall: ' + str(recallDt))
print('Decision Tree f1_score: ' + str(f1ScoreDt))
Expand All @@ -126,19 +178,29 @@ def read_documents(docName):
'\nDecision Tree f1_score: ' + str(f1ScoreDt) +
'\nDecision Tree Accuracy: ' + str(accuracyDt*100))

cmDt = numpy.array2string(metrics.confusion_matrix(eval_labels, predictedDt))
cmDt = numpy.array2string(metrics.confusion_matrix(eval_labels, predictedDt)) # Generating Confusion Matrix
print(cmDt)
f.write('\n Confusion Matrix: \n' + cmDt )
f.close()

# Better Decision tree -------------------------- #

betterDecisionTree = tree.DecisionTreeClassifier(splitter= 'random')
f.write('\n Confusion Matrix: \n' + cmDt ) # Writing to generated file.
f.close() # Closing File.





# c )
# Better Decision Tree

betterDecisionTree = tree.DecisionTreeClassifier(splitter= 'random') # Decision Tree with Splitter set to 'random' (random best split per feature)
betterDecisionTree.fit(train_docs_Vec, train_labels)     # train on the same vectorized training docs
predictedBdt = betterDecisionTree.predict(eval_docs_Vec) # predict labels for the evaluation set

# part 3a)

# Task
# Part 3a)
row2 = len(train_docs) + 1  # first evaluation row number

f = open("BetterDecisionTree-all_sentiment_shuffled.txt", "w")
for result in predictedBdt:
index2 = 1
Expand All @@ -147,11 +209,15 @@ def read_documents(docName):
# NOTE(review): this writes 'row' (left over from the Naive Bayes section)
# but the loop increments 'row2' -- every output line repeats the same stale
# row number; looks like a bug, should be str(row2). TODO confirm and fix.
f.write(str(row) + ", " + str(index2) + "\n")
row2 += 1

# part 3c) d)

# Task 3
# Part c) & Part d)
precisionBdt = metrics.precision_score(eval_labels, predictedBdt, average=None) # per-class precision
recallBdt = metrics.recall_score(eval_labels, predictedBdt, average=None) # per-class recall
f1ScoreBdt = metrics.f1_score(eval_labels, predictedBdt, average=None) # per-class F1-measure
accuracyBdt = metrics.accuracy_score(eval_labels, predictedBdt) # overall accuracy in [0, 1]


print('Better Decision Tree Precision: ' + str(precisionBdt))
print('Better Decision Tree Recall: ' + str(recallBdt))
print('Better Decision Tree f1_score: ' + str(f1ScoreBdt))
Expand All @@ -162,12 +228,22 @@ def read_documents(docName):
'\nBetter Decision Tree f1_score: ' + str(f1ScoreBdt) +
'\nBetter Decision Tree Accuracy: ' + str(accuracyBdt*100))

cmBdt = numpy.array2string(metrics.confusion_matrix(eval_labels, predictedBdt))
cmBdt = numpy.array2string(metrics.confusion_matrix(eval_labels, predictedBdt)) # Generating Confusion Matrix
print(cmBdt)
f.write('\n Confusion Matrix: \n' + cmBdt)
f.close()

# ---------------- Task 4 --------------------- #
f.write('\n Confusion Matrix: \n' + cmBdt) # Writing to generated file.
f.close() # Closing File.



# ------------------------------ Task 4 -------------------------------- #
#
# Error Analysis
#
# Find the few misclassified documents and comment on why you think they were hard to classify. For
# instance, you may select a few short documents where the probabilities were particularly high in the wrong
# direction.


# index = 0
# listOfString = []
Expand Down