Commit daa684b: final version
ayushoriginal committed Oct 21, 2016
1 parent 9aa99f0
Showing 9 changed files with 1,541 additions and 0 deletions.
153 changes: 153 additions & 0 deletions preprocessing/__init__.py
@@ -0,0 +1,153 @@
# This Python file uses the following encoding: utf-8
import re

# Hashtags
hash_regex = re.compile(r"#(\w+)")
def hash_repl(match):
    return '__HASH_' + match.group(1).upper()

# Handles
hndl_regex = re.compile(r"@(\w+)")
def hndl_repl(match):
    # The user name is deliberately discarded; to keep it, return
    # '__HNDL_' + match.group(1).upper() instead.
    return '__HNDL'

# URLs
url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")

# Splitting by word boundaries
word_bound_regex = re.compile(r"\W+")

# Repeated characters, as in words like hurrrryyyyyy
rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)
def rpt_repl(match):
    return match.group(1) + match.group(1)
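# e.g. re.sub(rpt_regex, rpt_repl, 'hurrrryyyyyy') -> 'hurryy'
#      (any run of one repeated character collapses to exactly two)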

# Emoticons
emoticons = [
    ('__EMOT_SMILEY', [':-)', ':)', '(:', '(-:']),
    ('__EMOT_LAUGH',  [':-D', ':D', 'X-D', 'XD', 'xD']),
    ('__EMOT_LOVE',   ['<3', ':\*']),
    ('__EMOT_WINK',   [';-)', ';)', ';-D', ';D', '(;', '(-;']),
    ('__EMOT_FROWN',  [':-(', ':(', '):', ')-:']),
    ('__EMOT_CRY',    [':,(', ':\'(', ':"(', ':((']),
]

# Punctuations
punctuations = [
    #('',            ['.']),
    #('',            [',']),
    #('',            ['\'', '\"']),
    ('__PUNC_EXCL',  ['!', '¡']),
    ('__PUNC_QUES',  ['?', '¿']),
    ('__PUNC_ELLP',  ['...', '…']),
    # FIXME: more? http://en.wikipedia.org/wiki/Punctuation
]

# Printing functions for info
def print_config(cfg):
    for (x, arr) in cfg:
        print x, '\t',
        for a in arr:
            print a, '\t',
        print ''

def print_emoticons():
    print_config(emoticons)

def print_punctuations():
    print_config(punctuations)

# For emoticon regexes: each paren is widened to a character class so that
# variants written with '{', '[' etc. also match.
def escape_paren(arr):
    return [text.replace(')', '[)}\]]').replace('(', '[({\[]') for text in arr]

def regex_union(arr):
    return '(' + '|'.join(arr) + ')'

emoticons_regex = [(repl, re.compile(regex_union(escape_paren(regx))))
                   for (repl, regx) in emoticons]

# For punctuation replacement
def punctuations_repl(match):
    text = match.group(0)
    repl = []
    for (key, parr) in punctuations:
        for punc in parr:
            if punc in text:
                repl.append(key)
    if len(repl) > 0:
        return ' ' + ' '.join(repl) + ' '
    else:
        return ' '
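# e.g. re.sub(word_bound_regex, punctuations_repl, 'wow!!!') -> 'wow __PUNC_EXCL '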

def processHashtags( text, subject='', query=[]):
    return re.sub( hash_regex, hash_repl, text )

def processHandles( text, subject='', query=[]):
    return re.sub( hndl_regex, hndl_repl, text )

def processUrls( text, subject='', query=[]):
    return re.sub( url_regex, ' __URL ', text )

def processEmoticons( text, subject='', query=[]):
    for (repl, regx) in emoticons_regex:
        text = re.sub(regx, ' ' + repl + ' ', text)
    return text
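# e.g. processEmoticons('nice :)') -> 'nice  __EMOT_SMILEY '
#      (each hit is padded with a space on both sides)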

def processPunctuations( text, subject='', query=[]):
    return re.sub( word_bound_regex, punctuations_repl, text )

def processRepeatings( text, subject='', query=[]):
    return re.sub( rpt_regex, rpt_repl, text )

def processQueryTerm( text, subject='', query=[]):
    query_regex = "|".join([ re.escape(q) for q in query ])
    return re.sub( query_regex, '__QUER', text, flags=re.IGNORECASE )

def countHandles(text):
    return len( re.findall( hndl_regex, text) )

def countHashtags(text):
    return len( re.findall( hash_regex, text) )

def countUrls(text):
    return len( re.findall( url_regex, text) )

def countEmoticons(text):
    count = 0
    for (repl, regx) in emoticons_regex:
        count += len( re.findall( regx, text) )
    return count

# FIXME: this belongs behind a single preprocessing.preprocess() entry point;
# will need to move.
# FIXME: reuse the process* functions above instead of repeating the subs.
def processAll( text, subject='', query=[]):

    if len(query) > 0:
        query_regex = "|".join([ re.escape(q) for q in query ])
        text = re.sub( query_regex, '__QUER', text, flags=re.IGNORECASE )

    text = re.sub( hash_regex, hash_repl, text )
    text = re.sub( hndl_regex, hndl_repl, text )
    text = re.sub( url_regex, ' __URL ', text )

    for (repl, regx) in emoticons_regex:
        text = re.sub(regx, ' ' + repl + ' ', text)

    # FIXME: quick hack: drop apostrophes so contractions survive the
    # word-boundary split below.
    text = text.replace('\'', '')

    text = re.sub( word_bound_regex, punctuations_repl, text )
    text = re.sub( rpt_regex, rpt_repl, text )

    return text
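# e.g. processAll('@john: hurrrryyy!! see http://t.co/abc #fail :(')
#   -> '__HNDL hurryy __PUNC_EXCL see __URL __HASH_FAIL __EMOT_FROWN '
#      (spacing follows the substitutions above; note the trailing space)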

#from time import time
#import preprocessing, sanderstwitter02
#tweets = sanderstwitter02.getTweetsRawData('sentiment.csv')
#start = time()
#procTweets = [ (preprocessing.processAll(t), s) for (t, s, subj, q) in tweets ]
#end = time()
#end - start

#uni = [ a if(a[0:2]=='__') else a.lower() for a in re.findall(r"\w+", text) ]
#bi = nltk.bigrams(uni)
#tri = nltk.trigrams(uni)
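A quick end-to-end check of this module, as a sketch (assumes the file above is importable as preprocessing and runs under the repository's Python 2; the sample tweet is made up):

import preprocessing

raw = 'I loooove my new phone!! :) #Apple @apple http://apple.com'
clean = preprocessing.processAll(raw)
# clean now carries tokens such as __EMOT_SMILEY, __HASH_APPLE,
# __HNDL and __URL, with 'loooove' collapsed to 'loove'
print(clean)

# the count* helpers work on the raw text
print(preprocessing.countHandles(raw))    # 1
print(preprocessing.countHashtags(raw))   # 1
print(preprocessing.countUrls(raw))       # 1
print(preprocessing.countEmoticons(raw))  # 1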
1 change: 1 addition & 0 deletions sandersfeatures/__init__.py
@@ -0,0 +1 @@
import tweet_features, tweet_pca
146 changes: 146 additions & 0 deletions sandersfeatures/tweet_features.py
@@ -0,0 +1,146 @@
"""
@package tweet_features
Convert tweet to feature vector.
These routines help convert arbitrary tweets in to feature vectors.
"""
import numpy


# search patterns for features
testFeatures = \
[('hasAddict', (' addict',)), \
('hasAwesome', ('awesome',)), \
('hasBroken', ('broke',)), \
('hasBad', (' bad',)), \
('hasBug', (' bug',)), \
('hasCant', ('cant','can\'t')), \
('hasCrash', ('crash',)), \
('hasCool', ('cool',)), \
('hasDifficult', ('difficult',)), \
('hasDisaster', ('disaster',)), \
('hasDown', (' down',)), \
('hasDont', ('dont','don\'t','do not','does not','doesn\'t')), \
('hasEasy', (' easy',)), \
('hasExclaim', ('!',)), \
('hasExcite', (' excite',)), \
('hasExpense', ('expense','expensive')), \
('hasFail', (' fail',)), \
('hasFast', (' fast',)), \
('hasFix', (' fix',)), \
('hasFree', (' free',)), \
('hasFrowny', (':(', '):')), \
('hasFuck', ('fuck',)), \
('hasGood', ('good','great')), \
('hasHappy', (' happy',' happi')), \
('hasHate', ('hate',)), \
('hasHeart', ('heart', '<3')), \
('hasIssue', (' issue',)), \
('hasIncredible', ('incredible',)), \
('hasInterest', ('interest',)), \
('hasLike', (' like',)), \
('hasLol', (' lol',)), \
('hasLove', ('love','loving')), \
('hasLose', (' lose',)), \
('hasNeat', ('neat',)), \
('hasNever', (' never',)), \
('hasNice', (' nice',)), \
('hasPoor', ('poor',)), \
('hasPerfect', ('perfect',)), \
('hasPlease', ('please',)), \
('hasSerious', ('serious',)), \
('hasShit', ('shit',)), \
('hasSlow', (' slow',)), \
('hasSmiley', (':)', ':D', '(:')), \
('hasSuck', ('suck',)), \
('hasTerrible', ('terrible',)), \
('hasThanks', ('thank',)), \
('hasTrouble', ('trouble',)), \
('hasUnhappy', ('unhapp',)), \
('hasWin', (' win ','winner','winning')), \
('hasWinky', (';)',)), \
('hasWow', ('wow','omg')) ]


def make_tweet_nparr( txt ):
    """
    Extract tweet feature vector as NumPy array.
    """
    # result storage
    fvec = numpy.empty( len(testFeatures) )

    # search for each feature
    txtLow = ' ' + txt.lower() + ' '
    for i in range( 0, len(testFeatures) ):
        fvec[i] = False
        for tstr in testFeatures[i][1]:
            fvec[i] = fvec[i] or (txtLow.find(tstr) != -1)

    return fvec


def make_tweet_dict( txt ):
    """
    Extract tweet feature vector as dictionary.
    """
    txtLow = ' ' + txt.lower() + ' '

    # result storage
    fvec = {}

    # search for each feature
    for test in testFeatures:
        key = test[0]
        fvec[key] = False
        for tstr in test[1]:
            fvec[key] = fvec[key] or (txtLow.find(tstr) != -1)

    return fvec
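# e.g. make_tweet_dict('This is awesome!')['hasAwesome'] -> True
#      (and 'hasExclaim' -> True; every other feature -> False)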


def tweet_dict_to_nparr( fdict ):
    """
    Convert dictionary feature vector to NumPy array.
    """
    fvec = numpy.empty( len(testFeatures) )

    for i in range( 0, len(testFeatures) ):
        fvec[i] = fdict[ testFeatures[i][0] ]

    return fvec


def tweet_nparr_to_dict( nparr, use_standard_features=False ):
    """
    Convert NumPy array to dictionary.
    """
    fvec = {}

    if use_standard_features:
        assert len(nparr) == len(testFeatures)
        for i in range( 0, len(nparr) ):
            fvec[ testFeatures[i][0] ] = nparr[i]
    else:
        for i in range( 0, len(nparr) ):
            fvec[ str(i) ] = nparr[i]

    return fvec


def is_zero_dict( fdict ):
    """
    Identifies empty feature vectors.
    """
    has_any_features = False
    for key in fdict:
        has_any_features = has_any_features or fdict[key]

    return not has_any_features
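A small round trip through these helpers, as a sketch (assumes numpy is installed and the file is importable as tweet_features; the sample tweets are made up):

import tweet_features

d = tweet_features.make_tweet_dict('My phone keeps crashing, terrible!')
print(d['hasCrash'])     # True  ('crashing' contains 'crash')
print(d['hasTerrible'])  # True
print(d['hasExclaim'])   # True

arr = tweet_features.tweet_dict_to_nparr(d)   # 1.0 wherever a pattern hit
back = tweet_features.tweet_nparr_to_dict(arr, use_standard_features=True)

print(tweet_features.is_zero_dict(back))  # False
print(tweet_features.is_zero_dict(tweet_features.make_tweet_dict('hello')))  # True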
47 changes: 47 additions & 0 deletions sandersfeatures/tweet_pca.py
@@ -0,0 +1,47 @@
"""
@package tweet_pca
PCT for dimensionality reduction.
"""
import mdp, numpy
import tweet_features

import pdb


def tweet_pca_reduce( tweets_train, tweets_test, output_dim ):

# convert dictionary feature vecs to numpy array
print '--> Converting dictionaries to NumPy arrays'
train_arr = numpy.array( [tweet_features.tweet_dict_to_nparr(t) for \
(t,s) in tweets_train])

test_arr = numpy.array( [tweet_features.tweet_dict_to_nparr(t) for \
(t,s) in tweets_test])


# compute principle components over training set
print '--> Computing PCT'
pca_array = mdp.pca( train_arr.transpose(), \
svd=True, output_dim=output_dim )


# both train and test sets to PC space
print '--> Projecting feature vectors to PC space'

train_arr = numpy.dot( train_arr, pca_array )
test_arr = numpy.dot( test_arr, pca_array )


# convert projected vecs back to reduced dictionaries
print '--> Converting NumPy arrays to dictionaries'

reduced_train = \
zip( [tweet_features.tweet_nparr_to_dict(v) for v in train_arr], \
[s for (t,s) in tweets_train] )

reduced_test = \
zip( [tweet_features.tweet_nparr_to_dict(v) for v in test_arr], \
[s for (t,s) in tweets_test])

return (reduced_train, reduced_test)
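Hypothetical usage, as a sketch (assumes the MDP toolkit that provides mdp.pca is installed; the tweets and output_dim are made up, and because the matrix is transposed before PCA, output_dim here cannot exceed the number of training tweets):

import tweet_features, tweet_pca

labeled = [('great phone, love it :)', 'pos'),
           ('this app keeps crashing', 'neg'),
           ('thank you for the fix!',  'pos')]
train = [(tweet_features.make_tweet_dict(t), s) for (t, s) in labeled]
test  = [(tweet_features.make_tweet_dict('what a terrible bug'), 'neg')]

(red_train, red_test) = tweet_pca.tweet_pca_reduce(train, test, output_dim=2)
# red_train / red_test hold (reduced_dict, label) pairs, two keys per dict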
33 changes: 33 additions & 0 deletions sanderstwitter02/__init__.py
@@ -0,0 +1,33 @@
import csv

queryTerms = {
    'apple'     : ['@apple'],
    'microsoft' : ['#microsoft'],
    'google'    : ['#google'],
    'twitter'   : ['#twitter'],
}

def getTweetsRawData( fileName ):
    # read all tweets and labels
    fp = open( fileName, 'rb' )
    reader = csv.reader( fp, delimiter=',', quotechar='"', escapechar='\\' )
    tweets = []
    for row in reader:
        tweets.append( [row[4], row[1], row[0], queryTerms[(row[0]).lower()]] )
    fp.close()

    # treat neutral and irrelevant the same
    for t in tweets:
        if t[1] == 'positive':
            t[1] = 'pos'
        elif t[1] == 'negative':
            t[1] = 'neg'
        elif t[1] == 'irrelevant' or t[1] == 'neutral':
            t[1] = 'neu'

    return tweets  # 0: text, 1: class, 2: subject, 3: query

SampleTweetsStats = '''
Class Count Example
neg 529 #Skype often crashing: #microsoft, what are you doing?
neu 3770 How #Google Ventures Chooses Which Startups Get Its $200 Million http://t.co/FCWXoUd8 via @mashbusiness @mashable
pos 483 Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is
'''
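A sketch of the expected CSV layout and the loader's output (the file name and rows are illustrative; per the indexing above, column 0 is the topic, column 1 the label, and column 4 the tweet text):

import sanderstwitter02

# sentiment.csv rows look roughly like (columns 2-3 unused here):
#   "apple","positive","...","...","Now all @Apple has to do is ..."
tweets = sanderstwitter02.getTweetsRawData('sentiment.csv')

text, label, subject, query = tweets[0]
print(label)    # 'pos'   (mapped from 'positive')
print(subject)  # 'apple'
print(query)    # ['@apple']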