-
Notifications
You must be signed in to change notification settings - Fork 278
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9aa99f0
commit daa684b
Showing
9 changed files
with
1,541 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
# This Python file uses the following encoding: utf-8
from __future__ import print_function

import re
|
||
# Hashtags: "#tag" is rewritten to the token "__HASH_TAG"
hash_regex = re.compile(r"#(\w+)")
def hash_repl(match):
    """Build the replacement token for one hashtag match."""
    tag = match.group(1)
    return '__HASH_' + tag.upper()
|
||
# Handles: every "@user" mention collapses to the single token "__HNDL"
hndl_regex = re.compile(r"@(\w+)")
def hndl_repl(match):
    # the mention's name is deliberately discarded — presumably to avoid
    # feature explosion across distinct users; TODO confirm
    return '__HNDL'
|
||
# URLs
# NOTE(review): the character class allows only letters, digits, '.' and '/',
# so URLs containing '-', '_', '?', '=', '#' etc. are cut short — confirm intended.
url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")

# Splitting by word boundaries (runs of non-word characters)
word_bound_regex = re.compile(r"\W+")
|
||
# Character runs like "hurrrryyyyyy": any run of 2+ identical chars
# is squeezed down to exactly two.
rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)
def rpt_repl(match):
    """Replace a run of one repeated character with just two of it."""
    ch = match.group(1)
    return ch * 2
|
||
# Emoticons: (replacement token, list of surface variants).
# Order matters: emoticons_regex substitutes classes in this order.
emoticons = [
    ('__EMOT_SMILEY', [':-)', ':)', '(:', '(-:', ]),
    ('__EMOT_LAUGH',  [':-D', ':D', 'X-D', 'XD', 'xD', ]),
    ('__EMOT_LOVE',   ['<3', ':\*', ]),
    ('__EMOT_WINK',   [';-)', ';)', ';-D', ';D', '(;', '(-;', ]),
    # BUG FIX: the frown variants were '(:' and '(-:' — those are smileys
    # (exact duplicates of __EMOT_SMILEY, and dead entries since SMILEY is
    # substituted first). Replaced with the reversed-frown forms.
    ('__EMOT_FROWN',  [':-(', ':(', '):', ')-:', ]),
    ('__EMOT_CRY',    [':,(', ':\'(', ':"(', ':((']),
]
|
||
# Punctuation classes: (replacement token, list of marks).
# '.', ',' and quote characters are intentionally unmapped — they are simply
# dropped by the \W+ word-boundary split.
punctuations = [
    ('__PUNC_EXCL', ['!', '¡', ]),
    ('__PUNC_QUES', ['?', '¿', ]),
    ('__PUNC_ELLP', ['...', '…', ]),
    # FIXME : MORE? http://en.wikipedia.org/wiki/Punctuation
]
|
||
# Printing functions for info
def print_config(cfg):
    """Print a (label, variants) config table, one tab-separated row per label.

    FIX: the original used Python-2-only `print x, '\\t',` statements, a
    SyntaxError on Python 3; rewritten with the print function (enabled for
    Python 2 by the module's `from __future__ import print_function`).
    Output whitespace is normalized to exactly one tab between fields.
    """
    for label, variants in cfg:
        print(label, end='\t')
        for v in variants:
            print(v, end='\t')
        print('')
|
||
def print_emoticons():
    # Debug helper: dump the emoticon replacement table to stdout.
    print_config(emoticons)
|
||
def print_punctuations():
    # Debug helper: dump the punctuation replacement table to stdout.
    print_config(punctuations)
|
||
# For emoticon regexes
def escape_paren(arr):
    """Widen '(' / ')' in each emoticon into regex character classes so
    variants written with '{', '}', '[' or ']' also match."""
    widened = []
    for item in arr:
        item = item.replace(')', '[)}\]]')
        item = item.replace('(', '[({\[]')
        widened.append(item)
    return widened
|
||
def regex_union(arr):
    """OR-join the given alternatives into one capturing regex group."""
    return '({})'.format('|'.join(arr))
|
||
# Precompiled (token, pattern) pairs: each emoticon list is widened by
# escape_paren and OR-joined into one capturing group via regex_union.
emoticons_regex = [ (repl, re.compile(regex_union(escape_paren(regx))) ) \
	for (repl, regx) in emoticons ]
|
||
# For punctuation replacement
def punctuations_repl(match):
    """Replace one non-word span with its punctuation tokens, or a space.

    Called via re.sub on word_bound_regex matches; emits one token per
    matching mark, so a class token can repeat (e.g. both '!' and '¡').
    """
    span = match.group(0)
    tokens = [key for (key, marks) in punctuations
                  for mark in marks if mark in span]
    if tokens:
        return ' ' + ' '.join(tokens) + ' '
    return ' '
|
||
def processHashtags( text, subject='', query=[]):
    """Replace every #tag in text with its __HASH_TAG token."""
    return hash_regex.sub(hash_repl, text)
|
||
def processHandles( text, subject='', query=[]):
    """Replace every @mention in text with the __HNDL token."""
    return hndl_regex.sub(hndl_repl, text)
|
||
def processUrls( text, subject='', query=[]):
    """Replace every URL in text with the (space-padded) __URL token."""
    return url_regex.sub(' __URL ', text)
|
||
def processEmoticons( text, subject='', query=[]):
    """Replace emoticons with their space-padded __EMOT_* tokens, class by class."""
    for token, pattern in emoticons_regex:
        text = pattern.sub(' ' + token + ' ', text)
    return text
|
||
def processPunctuations( text, subject='', query=[]):
    """Replace non-word spans with __PUNC_* tokens (or a single space)."""
    return word_bound_regex.sub(punctuations_repl, text)
|
||
def processRepeatings( text, subject='', query=[]):
    """Squeeze runs of repeated characters down to two (hurrrryyy -> hurryy)."""
    return rpt_regex.sub(rpt_repl, text)
|
||
def processQueryTerm( text, subject='', query=[]):
    """Replace each query term in text (case-insensitive) with __QUER.

    FIX: guard against an empty query list — previously ''.join produced an
    empty pattern, which matches at every position and floods the text with
    __QUER. processAll already had this guard; this function did not.
    """
    if not query:
        return text
    query_regex = "|".join([re.escape(q) for q in query])
    return re.sub(query_regex, '__QUER', text, flags=re.IGNORECASE)
|
||
def countHandles(text):
    """Number of @handle mentions in text."""
    return len(hndl_regex.findall(text))
def countHashtags(text):
    """Number of #hashtags in text."""
    return len(hash_regex.findall(text))
def countUrls(text):
    """Number of URLs in text."""
    return len(url_regex.findall(text))
def countEmoticons(text):
    """Total emoticon occurrences in text, summed across all classes."""
    return sum(len(pattern.findall(text)) for _, pattern in emoticons_regex)
|
||
#FIXME: preprocessing.preprocess()! wtf! will need to move.
def processAll( text, subject='', query=[]):
    """Run the full preprocessing pipeline over one tweet.

    Pipeline (order preserved from the original inline version): query terms
    -> hashtags -> handles -> URLs -> emoticons -> strip apostrophes ->
    punctuation tokens -> repeated-character squeeze.

    FIX: resolves the in-file FIXME "use process functions inside" — each
    stage now delegates to its process* helper instead of duplicating the
    regex calls; behavior is unchanged.
    """
    if query:
        text = processQueryTerm(text, subject, query)

    text = processHashtags(text, subject, query)
    text = processHandles(text, subject, query)
    text = processUrls(text, subject, query)
    text = processEmoticons(text, subject, query)

    # drop apostrophes so contractions ("don't" -> "dont") survive the
    # \W+ split as one token
    # FIXME: Jugad
    text = text.replace('\'', '')

    text = processPunctuations(text, subject, query)
    text = processRepeatings(text, subject, query)

    return text
|
||
#from time import time | ||
#import preprocessing, sanderstwitter02 | ||
#tweets = sanderstwitter02.getTweetsRawData('sentiment.csv') | ||
#start = time() | ||
#procTweets = [ (preprocessing.preprocess(t),s) for (t,s) in tweets] | ||
#end = time() | ||
#end - start | ||
|
||
#uni = [ a if(a[0:2]=='__') else a.lower() for a in re.findall(r"\w+", text) ] | ||
#bi = nltk.bigrams(uni) | ||
#tri = nltk.trigrams(uni) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
import tweet_features, tweet_pca |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
""" | ||
@package tweet_features | ||
Convert tweet to feature vector. | ||
These routines help convert arbitrary tweets in to feature vectors. | ||
""" | ||
import numpy | ||
|
||
|
||
# search patterns for features | ||
testFeatures = \ | ||
[('hasAddict', (' addict',)), \ | ||
('hasAwesome', ('awesome',)), \ | ||
('hasBroken', ('broke',)), \ | ||
('hasBad', (' bad',)), \ | ||
('hasBug', (' bug',)), \ | ||
('hasCant', ('cant','can\'t')), \ | ||
('hasCrash', ('crash',)), \ | ||
('hasCool', ('cool',)), \ | ||
('hasDifficult', ('difficult',)), \ | ||
('hasDisaster', ('disaster',)), \ | ||
('hasDown', (' down',)), \ | ||
('hasDont', ('dont','don\'t','do not','does not','doesn\'t')), \ | ||
('hasEasy', (' easy',)), \ | ||
('hasExclaim', ('!',)), \ | ||
('hasExcite', (' excite',)), \ | ||
('hasExpense', ('expense','expensive')), \ | ||
('hasFail', (' fail',)), \ | ||
('hasFast', (' fast',)), \ | ||
('hasFix', (' fix',)), \ | ||
('hasFree', (' free',)), \ | ||
('hasFrowny', (':(', '):')), \ | ||
('hasFuck', ('fuck',)), \ | ||
('hasGood', ('good','great')), \ | ||
('hasHappy', (' happy',' happi')), \ | ||
('hasHate', ('hate',)), \ | ||
('hasHeart', ('heart', '<3')), \ | ||
('hasIssue', (' issue',)), \ | ||
('hasIncredible', ('incredible',)), \ | ||
('hasInterest', ('interest',)), \ | ||
('hasLike', (' like',)), \ | ||
('hasLol', (' lol',)), \ | ||
('hasLove', ('love','loving')), \ | ||
('hasLose', (' lose',)), \ | ||
('hasNeat', ('neat',)), \ | ||
('hasNever', (' never',)), \ | ||
('hasNice', (' nice',)), \ | ||
('hasPoor', ('poor',)), \ | ||
('hasPerfect', ('perfect',)), \ | ||
('hasPlease', ('please',)), \ | ||
('hasSerious', ('serious',)), \ | ||
('hasShit', ('shit',)), \ | ||
('hasSlow', (' slow',)), \ | ||
('hasSmiley', (':)', ':D', '(:')), \ | ||
('hasSuck', ('suck',)), \ | ||
('hasTerrible', ('terrible',)), \ | ||
('hasThanks', ('thank',)), \ | ||
('hasTrouble', ('trouble',)), \ | ||
('hasUnhappy', ('unhapp',)), \ | ||
('hasWin', (' win ','winner','winning')), \ | ||
('hasWinky', (';)',)), \ | ||
('hasWow', ('wow','omg')) ] | ||
|
||
|
||
def make_tweet_nparr( txt ): | ||
""" | ||
Extract tweet feature vector as NumPy array. | ||
""" | ||
# result storage | ||
fvec = numpy.empty( len(testFeatures) ) | ||
|
||
# search for each feature | ||
txtLow = ' ' + txt.lower() + ' ' | ||
for i in range( 0, len(testFeatures) ): | ||
|
||
key = testFeatures[i][0] | ||
|
||
fvec[i] = False | ||
for tstr in testFeatures[i][1]: | ||
fvec[i] = fvec[i] or (txtLow.find(tstr) != -1) | ||
|
||
return fvec | ||
|
||
|
||
def make_tweet_dict( txt ):
    """
    Extract tweet feature vector as dictionary keyed by feature name.

    Each value is True when any of that feature's search strings occurs in
    the lower-cased, space-padded tweet text.
    """
    padded = ' ' + txt.lower() + ' '
    return {name: any(p in padded for p in patterns)
            for (name, patterns) in testFeatures}
|
||
|
||
def tweet_dict_to_nparr( dict ):
    """
    Convert dictionary feature vector to numpy array, in testFeatures order.
    """
    # NOTE(review): the parameter shadows the builtin `dict`; the name is
    # part of the public signature, so it is kept for compatibility.
    fvec = numpy.empty(len(testFeatures))
    for i, (name, _) in enumerate(testFeatures):
        fvec[i] = dict[name]
    return fvec
|
||
|
||
def tweet_nparr_to_dict( nparr, use_standard_features=False ):
    """
    Convert NumPy array to dictionary.

    With use_standard_features, keys are the testFeatures names (the array
    length must match); otherwise keys are stringified indices.
    """
    if use_standard_features:
        assert len(nparr) == len(testFeatures)
        return {testFeatures[i][0]: val for i, val in enumerate(nparr)}
    return {str(i): val for i, val in enumerate(nparr)}
|
||
|
||
def is_zero_dict( dict ):
    """
    Identifies empty feature vectors: True when no value in the dict is truthy.
    """
    return not any(dict.values())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
""" | ||
@package tweet_pca | ||
PCT for dimensionality reduction. | ||
""" | ||
import mdp, numpy | ||
import tweet_features | ||
|
||
import pdb | ||
|
||
|
||
def tweet_pca_reduce( tweets_train, tweets_test, output_dim ):
    """
    PCA-reduce dictionary feature vectors.

    tweets_train / tweets_test are lists of (feature_dict, sentiment) pairs.
    Principal components are computed over the training set only; both sets
    are projected onto `output_dim` components and converted back to
    (reduced_dict, sentiment) pairs.

    FIX: the Python-2-only `print '...'` statements were a SyntaxError on
    Python 3; single-argument `print('...')` calls are valid on both 2 and 3.
    `list(zip(...))` preserves the Python-2 behavior where zip returned a list.
    """
    # convert dictionary feature vecs to numpy arrays
    print('--> Converting dictionaries to NumPy arrays')
    train_arr = numpy.array([tweet_features.tweet_dict_to_nparr(t)
                             for (t, s) in tweets_train])
    test_arr = numpy.array([tweet_features.tweet_dict_to_nparr(t)
                            for (t, s) in tweets_test])

    # compute principal components over the training set
    print('--> Computing PCT')
    pca_array = mdp.pca(train_arr.transpose(),
                        svd=True, output_dim=output_dim)

    # project both train and test sets into PC space
    print('--> Projecting feature vectors to PC space')
    train_arr = numpy.dot(train_arr, pca_array)
    test_arr = numpy.dot(test_arr, pca_array)

    # convert projected vecs back to reduced dictionaries
    print('--> Converting NumPy arrays to dictionaries')
    reduced_train = list(zip(
        [tweet_features.tweet_nparr_to_dict(v) for v in train_arr],
        [s for (t, s) in tweets_train]))
    reduced_test = list(zip(
        [tweet_features.tweet_nparr_to_dict(v) for v in test_arr],
        [s for (t, s) in tweets_test]))

    return (reduced_train, reduced_test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import csv | ||
|
||
# Query terms associated with each topic label in the CSV.
queryTerms = {
    'apple': ['@apple', ],
    'microsoft': ['#microsoft', ],
    'google': ['#google', ],
    'twitter': ['#twitter', ],
}

def getTweetsRawData( fileName ):
    """
    Read labelled tweets from a Sanders-style CSV.

    Expects rows of (topic, sentiment, id, date, text); returns a list of
    [text, class, subject, query] with class normalized to 'pos'/'neg'/'neu'
    (neutral and irrelevant are treated the same).

    FIXES: the file handle was never closed (leak) — now a `with` block;
    'rb' mode breaks the csv module on Python 3 — opened in text mode;
    bitwise `|` between comparisons replaced with logical `or`.
    """
    with open(fileName, 'r') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"', escapechar='\\')
        tweets = [[row[4], row[1], row[0], queryTerms[row[0].lower()]]
                  for row in reader]
    for t in tweets:
        if t[1] == 'positive':
            t[1] = 'pos'
        elif t[1] == 'negative':
            t[1] = 'neg'
        elif t[1] == 'irrelevant' or t[1] == 'neutral':
            t[1] = 'neu'
    return tweets  # 0: Text  1: class  2: subject  3: query
|
||
# Human-readable class distribution of the sample data set, with one example
# tweet per class (reference only; not used by code).
SampleTweetsStats = '''
Class Count Example
neg 529 #Skype often crashing: #microsoft, what are you doing?
neu 3770 How #Google Ventures Chooses Which Startups Get Its $200 Million http://t.co/FCWXoUd8 via @mashbusiness @mashable
pos 483 Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is
'''
Oops, something went wrong.