-
Notifications
You must be signed in to change notification settings - Fork 278
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9aa99f0
commit daa684b
Showing
9 changed files
with
1,541 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
# This Python file uses the following encoding: utf-8
from __future__ import print_function

import re
|
||
# Hashtags: "#tag" is rewritten to the token "__HASH_TAG"
hash_regex = re.compile(r"#(\w+)")
def hash_repl(match):
    """Build the replacement token for one hashtag match."""
    tag = match.group(1)
    return '__HASH_' + tag.upper()
|
||
# Handles: every "@user" mention collapses to the single token "__HNDL"
hndl_regex = re.compile(r"@(\w+)")
def hndl_repl(match):
    # the mention's name is deliberately discarded — presumably to avoid
    # feature explosion across distinct users; TODO confirm
    return '__HNDL'
|
||
# URLs
# NOTE(review): the character class allows only letters, digits, '.' and '/',
# so URLs containing '-', '_', '?', '=', '#' etc. are cut short — confirm intended.
url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")

# Splitting by word boundaries (runs of non-word characters)
word_bound_regex = re.compile(r"\W+")
|
||
# Character runs like "hurrrryyyyyy": any run of 2+ identical chars
# is squeezed down to exactly two.
rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)
def rpt_repl(match):
    """Replace a run of one repeated character with just two of it."""
    ch = match.group(1)
    return ch * 2
|
||
# Emoticons: (replacement token, list of surface variants).
# Order matters: emoticons_regex substitutes classes in this order.
emoticons = [
    ('__EMOT_SMILEY', [':-)', ':)', '(:', '(-:', ]),
    ('__EMOT_LAUGH',  [':-D', ':D', 'X-D', 'XD', 'xD', ]),
    ('__EMOT_LOVE',   ['<3', ':\*', ]),
    ('__EMOT_WINK',   [';-)', ';)', ';-D', ';D', '(;', '(-;', ]),
    # BUG FIX: the frown variants were '(:' and '(-:' — those are smileys
    # (exact duplicates of __EMOT_SMILEY, and dead entries since SMILEY is
    # substituted first). Replaced with the reversed-frown forms.
    ('__EMOT_FROWN',  [':-(', ':(', '):', ')-:', ]),
    ('__EMOT_CRY',    [':,(', ':\'(', ':"(', ':((']),
]
|
||
# Punctuation classes: (replacement token, list of marks).
# '.', ',' and quote characters are intentionally unmapped — they are simply
# dropped by the \W+ word-boundary split.
punctuations = [
    ('__PUNC_EXCL', ['!', '¡', ]),
    ('__PUNC_QUES', ['?', '¿', ]),
    ('__PUNC_ELLP', ['...', '…', ]),
    # FIXME : MORE? http://en.wikipedia.org/wiki/Punctuation
]
|
||
# Printing functions for info
def print_config(cfg):
    """Print a (label, variants) config table, one tab-separated row per label.

    FIX: the original used Python-2-only `print x, '\\t',` statements, a
    SyntaxError on Python 3; rewritten with the print function (enabled for
    Python 2 by the module's `from __future__ import print_function`).
    Output whitespace is normalized to exactly one tab between fields.
    """
    for label, variants in cfg:
        print(label, end='\t')
        for v in variants:
            print(v, end='\t')
        print('')
|
||
def print_emoticons():
    # Debug helper: dump the emoticon replacement table to stdout.
    print_config(emoticons)
|
||
def print_punctuations():
    # Debug helper: dump the punctuation replacement table to stdout.
    print_config(punctuations)
|
||
# For emoticon regexes
def escape_paren(arr):
    """Widen '(' / ')' in each emoticon into regex character classes so
    variants written with '{', '}', '[' or ']' also match."""
    widened = []
    for item in arr:
        item = item.replace(')', '[)}\]]')
        item = item.replace('(', '[({\[]')
        widened.append(item)
    return widened
|
||
def regex_union(arr):
    """OR-join the given alternatives into one capturing regex group."""
    return '({})'.format('|'.join(arr))
|
||
# Precompiled (token, pattern) pairs: each emoticon list is widened by
# escape_paren and OR-joined into one capturing group via regex_union.
emoticons_regex = [ (repl, re.compile(regex_union(escape_paren(regx))) ) \
	for (repl, regx) in emoticons ]
|
||
# For punctuation replacement
def punctuations_repl(match):
    """Replace one non-word span with its punctuation tokens, or a space.

    Called via re.sub on word_bound_regex matches; emits one token per
    matching mark, so a class token can repeat (e.g. both '!' and '¡').
    """
    span = match.group(0)
    tokens = [key for (key, marks) in punctuations
                  for mark in marks if mark in span]
    if tokens:
        return ' ' + ' '.join(tokens) + ' '
    return ' '
|
||
def processHashtags( text, subject='', query=[]):
    """Replace every #tag in text with its __HASH_TAG token."""
    return hash_regex.sub(hash_repl, text)
|
||
def processHandles( text, subject='', query=[]):
    """Replace every @mention in text with the __HNDL token."""
    return hndl_regex.sub(hndl_repl, text)
|
||
def processUrls( text, subject='', query=[]):
    """Replace every URL in text with the (space-padded) __URL token."""
    return url_regex.sub(' __URL ', text)
|
||
def processEmoticons( text, subject='', query=[]):
    """Replace emoticons with their space-padded __EMOT_* tokens, class by class."""
    for token, pattern in emoticons_regex:
        text = pattern.sub(' ' + token + ' ', text)
    return text
|
||
def processPunctuations( text, subject='', query=[]):
    """Replace non-word spans with __PUNC_* tokens (or a single space)."""
    return word_bound_regex.sub(punctuations_repl, text)
|
||
def processRepeatings( text, subject='', query=[]):
    """Squeeze runs of repeated characters down to two (hurrrryyy -> hurryy)."""
    return rpt_regex.sub(rpt_repl, text)
|
||
def processQueryTerm( text, subject='', query=[]):
    """Replace each query term in text (case-insensitive) with __QUER.

    FIX: guard against an empty query list — previously ''.join produced an
    empty pattern, which matches at every position and floods the text with
    __QUER. processAll already had this guard; this function did not.
    """
    if not query:
        return text
    query_regex = "|".join([re.escape(q) for q in query])
    return re.sub(query_regex, '__QUER', text, flags=re.IGNORECASE)
|
||
def countHandles(text):
    """Number of @handle mentions in text."""
    return len(hndl_regex.findall(text))
def countHashtags(text):
    """Number of #hashtags in text."""
    return len(hash_regex.findall(text))
def countUrls(text):
    """Number of URLs in text."""
    return len(url_regex.findall(text))
def countEmoticons(text):
    """Total emoticon occurrences in text, summed across all classes."""
    return sum(len(pattern.findall(text)) for _, pattern in emoticons_regex)
|
||
#FIXME: preprocessing.preprocess()! wtf! will need to move.
def processAll( text, subject='', query=[]):
    """Run the full preprocessing pipeline over one tweet.

    Pipeline (order preserved from the original inline version): query terms
    -> hashtags -> handles -> URLs -> emoticons -> strip apostrophes ->
    punctuation tokens -> repeated-character squeeze.

    FIX: resolves the in-file FIXME "use process functions inside" — each
    stage now delegates to its process* helper instead of duplicating the
    regex calls; behavior is unchanged.
    """
    if query:
        text = processQueryTerm(text, subject, query)

    text = processHashtags(text, subject, query)
    text = processHandles(text, subject, query)
    text = processUrls(text, subject, query)
    text = processEmoticons(text, subject, query)

    # drop apostrophes so contractions ("don't" -> "dont") survive the
    # \W+ split as one token
    # FIXME: Jugad
    text = text.replace('\'', '')

    text = processPunctuations(text, subject, query)
    text = processRepeatings(text, subject, query)

    return text
|
||
#from time import time | ||
#import preprocessing, sanderstwitter02 | ||
#tweets = sanderstwitter02.getTweetsRawData('sentiment.csv') | ||
#start = time() | ||
#procTweets = [ (preprocessing.preprocess(t),s) for (t,s) in tweets] | ||
#end = time() | ||
#end - start | ||
|
||
#uni = [ a if(a[0:2]=='__') else a.lower() for a in re.findall(r"\w+", text) ] | ||
#bi = nltk.bigrams(uni) | ||
#tri = nltk.trigrams(uni) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
import tweet_features, tweet_pca |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
""" | ||
@package tweet_features | ||
Convert tweet to feature vector. | ||
These routines help convert arbitrary tweets in to feature vectors. | ||
""" | ||
import numpy | ||
|
||
|
||
# search patterns for features | ||
testFeatures = \ | ||
[('hasAddict', (' addict',)), \ | ||
('hasAwesome', ('awesome',)), \ | ||
('hasBroken', ('broke',)), \ | ||
('hasBad', (' bad',)), \ | ||
('hasBug', (' bug',)), \ | ||
('hasCant', ('cant','can\'t')), \ | ||
('hasCrash', ('crash',)), \ | ||
('hasCool', ('cool',)), \ | ||
('hasDifficult', ('difficult',)), \ | ||
('hasDisaster', ('disaster',)), \ | ||
('hasDown', (' down',)), \ | ||
('hasDont', ('dont','don\'t','do not','does not','doesn\'t')), \ | ||
('hasEasy', (' easy',)), \ | ||
('hasExclaim', ('!',)), \ | ||
('hasExcite', (' excite',)), \ | ||
('hasExpense', ('expense','expensive')), \ | ||
('hasFail', (' fail',)), \ | ||
('hasFast', (' fast',)), \ | ||
('hasFix', (' fix',)), \ | ||
('hasFree', (' free',)), \ | ||
('hasFrowny', (':(', '):')), \ | ||
('hasFuck', ('fuck',)), \ | ||
('hasGood', ('good','great')), \ | ||
('hasHappy', (' happy',' happi')), \ | ||
('hasHate', ('hate',)), \ | ||
('hasHeart', ('heart', '<3')), \ | ||
('hasIssue', (' issue',)), \ | ||
('hasIncredible', ('incredible',)), \ | ||
('hasInterest', ('interest',)), \ | ||
('hasLike', (' like',)), \ | ||
('hasLol', (' lol',)), \ | ||
('hasLove', ('love','loving')), \ | ||
('hasLose', (' lose',)), \ | ||
('hasNeat', ('neat',)), \ | ||
('hasNever', (' never',)), \ | ||
('hasNice', (' nice',)), \ | ||
('hasPoor', ('poor',)), \ | ||
('hasPerfect', ('perfect',)), \ | ||
('hasPlease', ('please',)), \ | ||
('hasSerious', ('serious',)), \ | ||
('hasShit', ('shit',)), \ | ||
('hasSlow', (' slow',)), \ | ||
('hasSmiley', (':)', ':D', '(:')), \ | ||
('hasSuck', ('suck',)), \ | ||
('hasTerrible', ('terrible',)), \ | ||
('hasThanks', ('thank',)), \ | ||
('hasTrouble', ('trouble',)), \ | ||
('hasUnhappy', ('unhapp',)), \ | ||
('hasWin', (' win ','winner','winning')), \ | ||
('hasWinky', (';)',)), \ | ||
('hasWow', ('wow','omg')) ] | ||
|
||
|
||
def make_tweet_nparr( txt ): | ||
""" | ||
Extract tweet feature vector as NumPy array. | ||
""" | ||
# result storage | ||
fvec = numpy.empty( len(testFeatures) ) | ||
|
||
# search for each feature | ||
txtLow = ' ' + txt.lower() + ' ' | ||
for i in range( 0, len(testFeatures) ): | ||
|
||
key = testFeatures[i][0] | ||
|
||
fvec[i] = False | ||
for tstr in testFeatures[i][1]: | ||
fvec[i] = fvec[i] or (txtLow.find(tstr) != -1) | ||
|
||
return fvec | ||
|
||
|
||
def make_tweet_dict( txt ):
    """
    Extract tweet feature vector as dictionary keyed by feature name.

    Each value is True when any of that feature's search strings occurs in
    the lower-cased, space-padded tweet text.
    """
    padded = ' ' + txt.lower() + ' '
    return {name: any(p in padded for p in patterns)
            for (name, patterns) in testFeatures}
|
||
|
||
def tweet_dict_to_nparr( dict ):
    """
    Convert dictionary feature vector to numpy array, in testFeatures order.
    """
    # NOTE(review): the parameter shadows the builtin `dict`; the name is
    # part of the public signature, so it is kept for compatibility.
    fvec = numpy.empty(len(testFeatures))
    for i, (name, _) in enumerate(testFeatures):
        fvec[i] = dict[name]
    return fvec
|
||
|
||
def tweet_nparr_to_dict( nparr, use_standard_features=False ):
    """
    Convert NumPy array to dictionary.

    With use_standard_features, keys are the testFeatures names (the array
    length must match); otherwise keys are stringified indices.
    """
    if use_standard_features:
        assert len(nparr) == len(testFeatures)
        return {testFeatures[i][0]: val for i, val in enumerate(nparr)}
    return {str(i): val for i, val in enumerate(nparr)}
|
||
|
||
def is_zero_dict( dict ):
    """
    Identifies empty feature vectors: True when no value in the dict is truthy.
    """
    return not any(dict.values())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
""" | ||
@package tweet_pca | ||
PCT for dimensionality reduction. | ||
""" | ||
import mdp, numpy | ||
import tweet_features | ||
|
||
import pdb | ||
|
||
|
||
def tweet_pca_reduce( tweets_train, tweets_test, output_dim ):
    """
    PCA-reduce dictionary feature vectors.

    tweets_train / tweets_test are lists of (feature_dict, sentiment) pairs.
    Principal components are computed over the training set only; both sets
    are projected onto `output_dim` components and converted back to
    (reduced_dict, sentiment) pairs.

    FIX: the Python-2-only `print '...'` statements were a SyntaxError on
    Python 3; single-argument `print('...')` calls are valid on both 2 and 3.
    `list(zip(...))` preserves the Python-2 behavior where zip returned a list.
    """
    # convert dictionary feature vecs to numpy arrays
    print('--> Converting dictionaries to NumPy arrays')
    train_arr = numpy.array([tweet_features.tweet_dict_to_nparr(t)
                             for (t, s) in tweets_train])
    test_arr = numpy.array([tweet_features.tweet_dict_to_nparr(t)
                            for (t, s) in tweets_test])

    # compute principal components over the training set
    print('--> Computing PCT')
    pca_array = mdp.pca(train_arr.transpose(),
                        svd=True, output_dim=output_dim)

    # project both train and test sets into PC space
    print('--> Projecting feature vectors to PC space')
    train_arr = numpy.dot(train_arr, pca_array)
    test_arr = numpy.dot(test_arr, pca_array)

    # convert projected vecs back to reduced dictionaries
    print('--> Converting NumPy arrays to dictionaries')
    reduced_train = list(zip(
        [tweet_features.tweet_nparr_to_dict(v) for v in train_arr],
        [s for (t, s) in tweets_train]))
    reduced_test = list(zip(
        [tweet_features.tweet_nparr_to_dict(v) for v in test_arr],
        [s for (t, s) in tweets_test]))

    return (reduced_train, reduced_test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import csv | ||
|
||
# Query terms associated with each topic label in the CSV.
queryTerms = {
    'apple': ['@apple', ],
    'microsoft': ['#microsoft', ],
    'google': ['#google', ],
    'twitter': ['#twitter', ],
}

def getTweetsRawData( fileName ):
    """
    Read labelled tweets from a Sanders-style CSV.

    Expects rows of (topic, sentiment, id, date, text); returns a list of
    [text, class, subject, query] with class normalized to 'pos'/'neg'/'neu'
    (neutral and irrelevant are treated the same).

    FIXES: the file handle was never closed (leak) — now a `with` block;
    'rb' mode breaks the csv module on Python 3 — opened in text mode;
    bitwise `|` between comparisons replaced with logical `or`.
    """
    with open(fileName, 'r') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"', escapechar='\\')
        tweets = [[row[4], row[1], row[0], queryTerms[row[0].lower()]]
                  for row in reader]
    for t in tweets:
        if t[1] == 'positive':
            t[1] = 'pos'
        elif t[1] == 'negative':
            t[1] = 'neg'
        elif t[1] == 'irrelevant' or t[1] == 'neutral':
            t[1] = 'neu'
    return tweets  # 0: Text  1: class  2: subject  3: query
|
||
# Human-readable class distribution of the sample data set, with one example
# tweet per class (reference only; not used by code).
SampleTweetsStats = '''
Class Count Example
neg 529 #Skype often crashing: #microsoft, what are you doing?
neu 3770 How #Google Ventures Chooses Which Startups Get Its $200 Million http://t.co/FCWXoUd8 via @mashbusiness @mashable
pos 483 Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is
'''
Oops, something went wrong.