-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathfill_in.py
203 lines (161 loc) · 4.79 KB
/
fill_in.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import sys
import re
import math
from Vector import Vector
# ==== Cleaning Text ===
# Load stopwords
from stopwords import stopwords
# Tweets often drop apostrophes ("dont" for "don't"), so extend the
# stopword set with apostrophe-stripped variants of each entry.
stopwords = stopwords.union(
    {word.replace("'", "") for word in stopwords if "'" in word}
)
# Sanitizer
def sanitize(text):
    """
    Clean up a tweet's text:
        1. split the string into a list of words
        2. remove all @ handles
        3. remove all hash tags
        4. remove all links
        5. lowercase everything and remove all stopwords
        6. throw away punctuation, except smiley faces
        7. return the final collection of words as a set
    @args:
        text --> a string
    """
    words = text.split()
    words = [word for word in words if "@" not in word]
    words = [word for word in words if "#" not in word]
    # Drop links before lowercasing (startswith check is on the raw word);
    # everything surviving this line is lowercase from here on.
    words = [word.lower() for word in words if not word.startswith("http")]
    words = [word for word in words if word not in stopwords]
    text = " ".join(words)
    # Raw strings: \w and \- are regex escapes, not string escapes.
    words = re.findall(r"\w+", text)
    # Keep runs of 2+ punctuation chars so smileys like :) or =( survive.
    words.extend(re.findall(r"['\-/()=:;]['\-/()=:;]+", text))
    # Words are already lowercased, so a plain != "rt" comparison suffices.
    words = {word for word in words if
             len(word) > 1
             and word != "rt"}
    return words
# ===============
# ==== Feature Selection ====
# Info gain formula
def info_bernuilli(p):
    """
    Computes the entropy (in nats) of a Bernuilli distr.
    @args:
        p --> probability p in Bernuilli(p)
    @returns:
        -p*log(p) - (1-p)*log(1-p); 0.0 at the degenerate points p=0
        and p=1 (the limit value), instead of raising a math domain
        error from log(0).
    """
    if p <= 0.0 or p >= 1.0:
        # lim_{p->0} p*log(p) = 0, so a deterministic variable has
        # zero entropy; without this guard math.log(0) raises.
        return 0.0
    return -p*math.log(p)-(1-p)*math.log(1-p)
# 250 features with highest info gain
def select_features():
    """
    Selects the 250 words with the highest information gain over the
    positive/negative training sets and prints them, one per line.
    Refer to slides for details.
    """
    positives = Vector()
    negatives = Vector()
    both = Vector()
    documents = 0.0
    p = 0.0
    n = 0.0
    # Count, per word, how many positive / negative tweets contain it.
    with open("data/train_pos.txt") as f:
        for line in f:
            for word in sanitize(line):
                positives[word] += 1
                both[word] += 1
            documents += 1
            p += 1
    with open("data/train_neg.txt") as f:
        for line in f:
            for word in sanitize(line):
                negatives[word] += 1
                both[word] += 1
            documents += 1
            n += 1
    features = []
    for word in both:
        p_both = both[word] / documents
        # `or` swaps in a tiny pseudo-count when a word never occurs in
        # a class, keeping log() inside info_bernuilli well-defined.
        p_pos = positives[word] / p or 0.001/p
        p_neg = negatives[word] / n or 0.001/n
        gain = info_bernuilli(p_both) \
            - p/documents * info_bernuilli(p_pos) \
            - n/documents * info_bernuilli(p_neg)
        features.append((word, gain))
    # Highest gain first; print the top 250 words.
    for word, _gain in sorted(features, key=lambda x: -x[1])[:250]:
        print(word)
#==================
#
# HEY LOOK AT ME I AM THE MOST INTERESTING PART
#
class NB(object):
    """
    Naive Bayes sentiment classifier over sanitized tweet word sets.

    Classes are "+" (positive) and "-" (negative); classify() returns
    "~" (neutral) when neither class is confidently ahead.
    """
    def __init__(self):
        """
        1. define the possible classes
        2. define features (word presence, populated by learn_cpd)
        3. define CPDs (per-class word -> tweet-count maps)
        4. define priors (per-class tweet counts)
        """
        self.classes = ("+", "-")
        # counts[cls][word] = number of cls-tweets containing word
        self.counts = {c: {} for c in self.classes}
        # totals[cls] = number of training tweets seen for cls
        self.totals = {c: 0.0 for c in self.classes}
    def learn_cpd(self, cls, tweets):
        """
        Learn the CPD for a given class by counting, per word, how many
        tweets of that class contain it (sanitize() returns a set, so
        each tweet contributes at most 1 per word).
        @args:
            cls --> a string, "+" or "-"
            tweets --> an iterable of tweets (a file object, list, etc)
        """
        counts = self.counts[cls]
        for tweet in tweets:
            self.totals[cls] += 1
            for word in sanitize(tweet):
                counts[word] = counts.get(word, 0) + 1
    def posterior(self, cls, sanitized_tweet):
        """
        Computes the (log, unnormalized) posterior of a sanitized tweet:
        log P(C) + sum_w log P(w|C), with add-one (Laplace) smoothing
        so unseen words do not zero the product.
        @args:
            cls --> a string, "+" or "-". determines CPD to use
            sanitized_tweet --> a set of words in the tweet
        """
        total = self.totals[cls]
        all_docs = sum(self.totals.values()) or 1.0
        # Smoothed prior and per-word likelihood denominators.
        logp = math.log((total + 1.0) / (all_docs + len(self.classes)))
        denom = total + 2.0
        counts = self.counts[cls]
        for word in sanitized_tweet:
            logp += math.log((counts.get(word, 0) + 1.0) / denom)
        return logp
    def classify(self, tweet):
        """
        Given a text, classify its sentiment. Picks the class with the largest posterior.
        However, if we are not confident, ie if not P(best|tweet) >= 2*P(other|tweet),
        then we refuse to classify, and return neutral, "~".
        @args:
            tweet --> a string, text of the tweet
        """
        words = sanitize(tweet)
        scored = sorted(
            ((self.posterior(c, words), c) for c in self.classes),
            reverse=True,
        )
        (best_lp, best_c), (other_lp, _other_c) = scored
        # P(best) >= 2*P(other)  <=>  best_lp - other_lp >= log(2)
        # (posteriors share the same normalizing constant).
        if best_lp - other_lp < math.log(2):
            return "~"
        return best_c
# ===================
def eval_performance(n):
    """
    Measure classification error on the three verification sets and
    print it as a fraction of tweets misclassified.
    @args:
        n --> a trained classifier exposing classify(tweet)
    """
    w = 0
    t = 0
    # (path, expected label) for each verification file.
    cases = [
        ("data/verify_pos.txt", "+"),
        ("data/verify_neg.txt", "-"),
        ("data/verify_neutral.txt", "~"),
    ]
    for path, expected in cases:
        # `with` ensures the handle is closed (the original leaked all three).
        with open(path) as f:
            for tweet in f:
                t += 1.0
                if expected != n.classify(tweet):
                    w += 1.0
    print("Error: %s" % (w/t))
def classify_text(n, txt):
    """
    Classify a single text and print the resulting label.
    @args:
        n --> a trained classifier exposing classify(tweet)
        txt --> the text to classify
    """
    # print() with a single argument is valid Python 2 and 3.
    print("That text is: %s" % n.classify(txt))
def main():
    """
    Entry point: train the classifier on both training sets, then run
    the mode selected by command-line flags:
        --verify   --> report error on the verification sets
        --features --> print the 250 highest-info-gain words
        (default)  --> classify the text given as argv[1]
    @returns: -1 on missing argument, None otherwise
    """
    n = NB()
    # `with` closes the training files (the original never closed them).
    with open("data/train_pos.txt") as f:
        n.learn_cpd("+", f)
    with open("data/train_neg.txt") as f:
        n.learn_cpd("-", f)
    if "--verify" in sys.argv:
        eval_performance(n)
    elif "--features" in sys.argv:
        select_features()
    else:
        if len(sys.argv) < 2:
            print("Not enough args. Provide text as argument")
            return -1
        classify_text(n, sys.argv[1])
if __name__ == "__main__":
    main()