diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py
new file mode 100644
index 0000000..2658f07
--- /dev/null
+++ b/preprocessing/__init__.py
@@ -0,0 +1,153 @@
+# This Python file uses the following encoding: utf-8
+import re
+
+# Hashtags
+hash_regex = re.compile(r"#(\w+)")
+def hash_repl(match):
+    return '__HASH_'+match.group(1).upper()
+
+# Handles
+hndl_regex = re.compile(r"@(\w+)")
+def hndl_repl(match):
+    return '__HNDL'  # was: '__HNDL_'+match.group(1).upper()
+
+# URLs
+url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")
+
+# Splitting on word boundaries
+word_bound_regex = re.compile(r"\W+")
+
+# Repeated characters, e.g. hurrrryyyyyy
+rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)
+def rpt_repl(match):
+    return match.group(1)+match.group(1)
+
+# Emoticons
+emoticons = \
+    [   ('__EMOT_SMILEY',  [':-)', ':)', '(:', '(-:', ] ) ,\
+        ('__EMOT_LAUGH',   [':-D', ':D', 'X-D', 'XD', 'xD', ] ) ,\
+        ('__EMOT_LOVE',    ['<3', ':\*', ] ) ,\
+        ('__EMOT_WINK',    [';-)', ';)', ';-D', ';D', '(;', '(-;', ] ) ,\
+        ('__EMOT_FROWN',   [':-(', ':(', '):', ')-:', ] ) ,\
+        ('__EMOT_CRY',     [':,(', ':\'(', ':"(', ':(('] ) ,\
+    ]
+
+# Punctuation
+punctuations = \
+    [   #('',              ['.', ] ) ,\
+        #('',              [',', ] ) ,\
+        #('',              ['\'', '\"', ] ) ,\
+        ('__PUNC_EXCL',    ['!', '¡', ] ) ,\
+        ('__PUNC_QUES',    ['?', '¿', ] ) ,\
+        ('__PUNC_ELLP',    ['...', '…', ] ) ,\
+        #FIXME : MORE? http://en.wikipedia.org/wiki/Punctuation
+    ]
+
+# Printing functions for info
+def print_config(cfg):
+    for (x, arr) in cfg:
+        print x, '\t',
+        for a in arr:
+            print a, '\t',
+        print ''
+
+def print_emoticons():
+    print_config(emoticons)
+
+def print_punctuations():
+    print_config(punctuations)
+
+# For emoticon regexes
+def escape_paren(arr):
+    return [text.replace(')', '[)}\]]').replace('(', '[({\[]') for text in arr]
+
+def regex_union(arr):
+    return '(' + '|'.join( arr ) + ')'
+
+emoticons_regex = [ (repl, re.compile(regex_union(escape_paren(regx))) ) \
+                    for (repl, regx) in emoticons ]
+
+# For punctuation replacement
+def punctuations_repl(match):
+    text = match.group(0)
+    repl = []
+    for (key, parr) in punctuations :
+        for punc in parr :
+            if punc in text:
+                repl.append(key)
+    if( len(repl)>0 ) :
+        return ' '+' '.join(repl)+' '
+    else :
+        return ' '
+
+def processHashtags( text, subject='', query=[]):
+    return re.sub( hash_regex, hash_repl, text )
+
+def processHandles( text, subject='', query=[]):
+    return re.sub( hndl_regex, hndl_repl, text )
+
+def processUrls( text, subject='', query=[]):
+    return re.sub( url_regex, ' __URL ', text )
+
+def processEmoticons( text, subject='', query=[]):
+    for (repl, regx) in emoticons_regex :
+        text = re.sub(regx, ' '+repl+' ', text)
+    return text
+
+def processPunctuations( text, subject='', query=[]):
+    return re.sub( word_bound_regex , punctuations_repl, text )
+
+def processRepeatings( text, subject='', query=[]):
+    return re.sub( rpt_regex, rpt_repl, text )
+
+def processQueryTerm( text, subject='', query=[]):
+    query_regex = "|".join([ re.escape(q) for q in query])
+    return re.sub( query_regex, '__QUER', text, flags=re.IGNORECASE )
+
+def countHandles(text):
+    return len( re.findall( hndl_regex, text) )
+def countHashtags(text):
+    return len( re.findall( hash_regex, text) )
+def countUrls(text):
+    return len( re.findall( url_regex, text) )
+def countEmoticons(text):
+    count = 0
+    for (repl, regx) in emoticons_regex :
+        count += len( re.findall( regx, text) )
+    return count
+
+#FIXME: preprocessing.preprocess() will need to move.
+#FIXME: use process functions inside +def processAll( text, subject='', query=[]): + + if(len(query)>0): + query_regex = "|".join([ re.escape(q) for q in query]) + text = re.sub( query_regex, '__QUER', text, flags=re.IGNORECASE ) + + text = re.sub( hash_regex, hash_repl, text ) + text = re.sub( hndl_regex, hndl_repl, text ) + text = re.sub( url_regex, ' __URL ', text ) + + for (repl, regx) in emoticons_regex : + text = re.sub(regx, ' '+repl+' ', text) + + + text = text.replace('\'','') + # FIXME: Jugad + + text = re.sub( word_bound_regex , punctuations_repl, text ) + text = re.sub( rpt_regex, rpt_repl, text ) + + return text + +#from time import time +#import preprocessing, sanderstwitter02 +#tweets = sanderstwitter02.getTweetsRawData('sentiment.csv') +#start = time() +#procTweets = [ (preprocessing.preprocess(t),s) for (t,s) in tweets] +#end = time() +#end - start + +#uni = [ a if(a[0:2]=='__') else a.lower() for a in re.findall(r"\w+", text) ] +#bi = nltk.bigrams(uni) +#tri = nltk.trigrams(uni) diff --git a/sandersfeatures/__init__.py b/sandersfeatures/__init__.py new file mode 100644 index 0000000..2288412 --- /dev/null +++ b/sandersfeatures/__init__.py @@ -0,0 +1 @@ +import tweet_features, tweet_pca diff --git a/sandersfeatures/tweet_features.py b/sandersfeatures/tweet_features.py new file mode 100644 index 0000000..c5f2986 --- /dev/null +++ b/sandersfeatures/tweet_features.py @@ -0,0 +1,146 @@ +""" +@package tweet_features +Convert tweet to feature vector. + +These routines help convert arbitrary tweets in to feature vectors. + +""" +import numpy + + +# search patterns for features +testFeatures = \ + [('hasAddict', (' addict',)), \ + ('hasAwesome', ('awesome',)), \ + ('hasBroken', ('broke',)), \ + ('hasBad', (' bad',)), \ + ('hasBug', (' bug',)), \ + ('hasCant', ('cant','can\'t')), \ + ('hasCrash', ('crash',)), \ + ('hasCool', ('cool',)), \ + ('hasDifficult', ('difficult',)), \ + ('hasDisaster', ('disaster',)), \ + ('hasDown', (' down',)), \ + ('hasDont', ('dont','don\'t','do not','does not','doesn\'t')), \ + ('hasEasy', (' easy',)), \ + ('hasExclaim', ('!',)), \ + ('hasExcite', (' excite',)), \ + ('hasExpense', ('expense','expensive')), \ + ('hasFail', (' fail',)), \ + ('hasFast', (' fast',)), \ + ('hasFix', (' fix',)), \ + ('hasFree', (' free',)), \ + ('hasFrowny', (':(', '):')), \ + ('hasFuck', ('fuck',)), \ + ('hasGood', ('good','great')), \ + ('hasHappy', (' happy',' happi')), \ + ('hasHate', ('hate',)), \ + ('hasHeart', ('heart', '<3')), \ + ('hasIssue', (' issue',)), \ + ('hasIncredible', ('incredible',)), \ + ('hasInterest', ('interest',)), \ + ('hasLike', (' like',)), \ + ('hasLol', (' lol',)), \ + ('hasLove', ('love','loving')), \ + ('hasLose', (' lose',)), \ + ('hasNeat', ('neat',)), \ + ('hasNever', (' never',)), \ + ('hasNice', (' nice',)), \ + ('hasPoor', ('poor',)), \ + ('hasPerfect', ('perfect',)), \ + ('hasPlease', ('please',)), \ + ('hasSerious', ('serious',)), \ + ('hasShit', ('shit',)), \ + ('hasSlow', (' slow',)), \ + ('hasSmiley', (':)', ':D', '(:')), \ + ('hasSuck', ('suck',)), \ + ('hasTerrible', ('terrible',)), \ + ('hasThanks', ('thank',)), \ + ('hasTrouble', ('trouble',)), \ + ('hasUnhappy', ('unhapp',)), \ + ('hasWin', (' win ','winner','winning')), \ + ('hasWinky', (';)',)), \ + ('hasWow', ('wow','omg')) ] + + +def make_tweet_nparr( txt ): + """ + Extract tweet feature vector as NumPy array. 
+    """
+    # result storage
+    fvec = numpy.empty( len(testFeatures) )
+
+    # search for each feature
+    txtLow = ' ' + txt.lower() + ' '
+    for i in range( 0, len(testFeatures) ):
+
+        key = testFeatures[i][0]
+
+        fvec[i] = False
+        for tstr in testFeatures[i][1]:
+            fvec[i] = fvec[i] or (txtLow.find(tstr) != -1)
+
+    return fvec
+
+
+def make_tweet_dict( txt ):
+    """
+    Extract tweet feature vector as dictionary.
+    """
+    txtLow = ' ' + txt.lower() + ' '
+
+    # result storage
+    fvec = {}
+
+    # search for each feature
+    for test in testFeatures:
+
+        key = test[0]
+
+        fvec[key] = False
+        for tstr in test[1]:
+            fvec[key] = fvec[key] or (txtLow.find(tstr) != -1)
+
+    return fvec
+
+
+def tweet_dict_to_nparr( dict ):
+    """
+    Convert dictionary feature vector to NumPy array.
+    """
+    fvec = numpy.empty( len(testFeatures) )
+
+    for i in range( 0, len(testFeatures) ):
+        fvec[i] = dict[ testFeatures[i][0] ]
+
+    return fvec
+
+
+def tweet_nparr_to_dict( nparr, use_standard_features=False ):
+    """
+    Convert NumPy array to dictionary.
+    """
+    fvec = {}
+
+    if use_standard_features:
+        assert len(nparr) == len(testFeatures)
+        fvec = {}
+        for i in range( 0, len(nparr) ):
+            fvec[ testFeatures[i][0] ] = nparr[i]
+
+    else:
+        for i in range( 0, len(nparr) ):
+            fvec[ str(i) ] = nparr[i]
+
+    return fvec
+
+
+def is_zero_dict( dict ):
+    """
+    Identifies empty feature vectors.
+    """
+    has_any_features = False
+    for key in dict:
+        has_any_features = has_any_features or dict[key]
+
+    return not has_any_features
diff --git a/sandersfeatures/tweet_pca.py b/sandersfeatures/tweet_pca.py
new file mode 100644
index 0000000..0659fe0
--- /dev/null
+++ b/sandersfeatures/tweet_pca.py
@@ -0,0 +1,47 @@
+"""
+@package tweet_pca
+PCA for dimensionality reduction.
+
+"""
+import mdp, numpy
+import tweet_features
+
+import pdb
+
+
+def tweet_pca_reduce( tweets_train, tweets_test, output_dim ):
+
+    # convert dictionary feature vecs to numpy array
+    print '--> Converting dictionaries to NumPy arrays'
+    train_arr = numpy.array( [tweet_features.tweet_dict_to_nparr(t) for \
+                              (t,s) in tweets_train])
+
+    test_arr = numpy.array( [tweet_features.tweet_dict_to_nparr(t) for \
+                             (t,s) in tweets_test])
+
+
+    # compute principal components over training set
+    print '--> Computing PCA'
+    pca_array = mdp.pca( train_arr.transpose(), \
+                         svd=True, output_dim=output_dim )
+
+
+    # project both train and test sets to PC space
+    print '--> Projecting feature vectors to PC space'
+
+    train_arr = numpy.dot( train_arr, pca_array )
+    test_arr = numpy.dot( test_arr, pca_array )
+
+
+    # convert projected vecs back to reduced dictionaries
+    print '--> Converting NumPy arrays to dictionaries'
+
+    reduced_train = \
+        zip( [tweet_features.tweet_nparr_to_dict(v) for v in train_arr], \
+             [s for (t,s) in tweets_train] )
+
+    reduced_test = \
+        zip( [tweet_features.tweet_nparr_to_dict(v) for v in test_arr], \
+             [s for (t,s) in tweets_test])
+
+    return (reduced_train, reduced_test)
diff --git a/sanderstwitter02/__init__.py b/sanderstwitter02/__init__.py
new file mode 100644
index 0000000..49fbb99
--- /dev/null
+++ b/sanderstwitter02/__init__.py
@@ -0,0 +1,33 @@
+import csv
+
+queryTerms = {\
+    'apple'     : ['@apple', ], \
+    'microsoft' : ['#microsoft', ], \
+    'google'    : ['#google', ], \
+    'twitter'   : ['#twitter', ], \
+    }
+
+def getTweetsRawData( fileName ):
+    # read all tweets and labels
+    fp = open( fileName, 'rb' )
+    reader = csv.reader( fp, delimiter=',', quotechar='"', escapechar='\\' )
+    tweets = []
+    for row in reader:
+        tweets.append( [row[4], row[1], row[0],
queryTerms[(row[0]).lower()] ] ) + # treat neutral and irrelevant the same + for t in tweets: + if (t[1] == 'positive'): + t[1] = 'pos' + elif (t[1] == 'negative'): + t[1] = 'neg' + elif (t[1] == 'irrelevant')|(t[1] == 'neutral'): + t[1] = 'neu' + + return tweets # 0: Text # 1: class # 2: subject # 3: query + +SampleTweetsStats = ''' + Class Count Example + neg 529 #Skype often crashing: #microsoft, what are you doing? + neu 3770 How #Google Ventures Chooses Which Startups Get Its $200 Million http://t.co/FCWXoUd8 via @mashbusiness @mashable + pos 483 Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is +''' diff --git a/sanderstwitter02/install.py b/sanderstwitter02/install.py new file mode 100644 index 0000000..4f7f45f --- /dev/null +++ b/sanderstwitter02/install.py @@ -0,0 +1,268 @@ +# +# Sanders-Twitter Sentiment Corpus Install Script +# Version 0.1 +# +# Adapted from http://www.sananalytics.com/lab/twitter-sentiment/ +# +# Yogesh Garg +# yogeshg91@gmail.com +# +import csv, getpass, json, os, time, random + +# using python-twitter library +import twitter + + +def get_user_params(): + + user_params = {} + + # get user input params + user_params['inList'] = '' #raw_input( '\nInput file [./corpus.csv]: ' ) + user_params['outList'] = '' #raw_input( 'Results file [./full-corpus.csv]: ' ) + user_params['rawDir'] = '' #raw_input( 'Raw data dir [./rawdata/]: ' ) + + # apply defaults + if user_params['inList'] == '': + user_params['inList'] = './corpus.csv' + if user_params['outList'] == '': + user_params['outList'] = './full-corpus.csv' + if user_params['rawDir'] == '': + user_params['rawDir'] = './rawdata/' + + return user_params + + +def dump_user_params( user_params ): + + # dump user params for confirmation + print 'Input: ' + user_params['inList'] + print 'Output: ' + user_params['outList'] + print 'Raw data: ' + user_params['rawDir'] + return + +def filter_list( total_list ) : + # filtering only apple for test purposes + indices = [i for i in range( len( total_list ) ) if (total_list[i])[0] == "apple"] + return [total_list[i] for i in indices] + +def read_total_list( in_filename ): + + # read total fetch list csv + fp = open( in_filename, 'rb' ) + reader = csv.reader( fp, delimiter=',', quotechar='"' ) + + total_list = [] + for row in reader: + total_list.append( row ) + + return total_list + + +def purge_already_fetched( fetch_list, raw_dir ): + + # list of tweet ids that still need downloading + rem_list = [] + count = 0; + + # check each tweet to see if we have it + for item in fetch_list: + + # check if json file exists + tweet_file = raw_dir + item[2] + '.json' + if os.path.exists( tweet_file ): + + # attempt to parse json file + try: + parse_tweet_json( tweet_file ) + count = count + 1 + print '--> already downloaded #' + item[2] + except RuntimeError: + rem_list.append( item ) + else: + rem_list.append( item ) + + print 'already fetched :', count + + return rem_list + + +def get_time_left_str( cur_idx, fetch_list, download_pause ): + + tweets_left = len(fetch_list) - cur_idx + total_seconds = tweets_left * download_pause + + str_hr = int( total_seconds / 3600 ) + str_min = int((total_seconds - str_hr*3600) / 60) + str_sec = total_seconds - str_hr*3600 - str_min*60 + + return '%dh %dm %ds' % (str_hr, str_min, str_sec) + + +def download_tweets( fetch_list, raw_dir ): + + # proxy settings for downloading behind a proxy + #os.environ['http_proxy'] = 'http://10.10.78.21:3128/' + #os.environ['https_proxy'] = 'http://10.10.78.21:3128/' + + # using 
python-twitter library + api = twitter.Api(consumer_key='yDkaORxEcwX6SheX6pa1fw', + consumer_secret='VYIGd2KITohR4ygmHrcyZgV0B74CXi5wsT1eryVtw', + access_token_key='227846642-8IjK2K32CDFt3682SNOOpnzegAja3TyVpzFOGrQj', + access_token_secret='L6of20EZdBv48EA2GE8Js6roIfZFnCKBpoPwvBDxF8', + input_encoding=None, cache=None) + + # ensure raw data directory exists + if not os.path.exists( raw_dir ): + os.mkdir( raw_dir ) + + # stay within rate limits + max_tweets_per_hr = 180*4 + download_pause_sec = 3600 / max_tweets_per_hr + + # download tweets + for idx in range(0,len(fetch_list)): + + # current item + item = fetch_list[idx] + + # print status + trem = get_time_left_str( idx, fetch_list, download_pause_sec ) + print '--> downloading tweet #%s (%d of %d) (%s left)' % \ + (item[2], idx+1, len(fetch_list), trem) + + # pull data + start = time.time() + try: + tweetStatus = api.GetStatus(item[2]) + tweetFile = open(raw_dir + item[2] + '.json', 'w') + tweetFile.write( tweetStatus.AsJsonString() ) + tweetFile.close() + except Exception, e: + print 'Cannot download tweet #'+item[2] + print e + end = time.time() + + # stay in Twitter API rate limits + print ' pausing %.2f sec to obey Twitter API rate limits' % \ + (download_pause_sec-(end-start)) + time.sleep( download_pause_sec-(end-start) ) + + return + + +def parse_tweet_json( filename ): + + # read tweet + print 'opening: ' + filename + fp = open( filename, 'rb' ) + + # parse json + try: + tweet_json = json.load( fp ) + except ValueError: + raise RuntimeError('error parsing json') + + # look for twitter api error msgs + if 'error' in tweet_json: + raise RuntimeError('error in downloaded tweet') + + # extract creation date and tweet text + return [ tweet_json['created_at'], tweet_json['text'] ] + + +def build_output_corpus( out_filename, raw_dir, total_list ): + + # open csv output file + fp = open( out_filename, 'wb' ) + writer = csv.writer( fp, delimiter=',', quotechar='"', escapechar='\\', + quoting=csv.QUOTE_ALL ) + + # write header row + #writer.writerow( ['Topic','Sentiment','TweetId','TweetDate','TweetText'] ) + + # parse all downloaded tweets + missing_count = 0 + for item in total_list: + + # ensure tweet exists + if os.path.exists( raw_dir + item[2] + '.json' ): + + try: + # parse tweet + parsed_tweet = parse_tweet_json( raw_dir + item[2] + '.json' ) + full_row = item + parsed_tweet + + # character encoding for output + for i in range(0,len(full_row)): + full_row[i] = full_row[i].encode("utf-8").replace('\n',' ') + + # write csv row + writer.writerow( full_row ) + + except RuntimeError: + print '--> bad data in tweet #' + item[2] + missing_count += 1 + + else: + print '--> missing tweet #' + item[2] + missing_count += 1 + + # indicate success + if missing_count == 0: + print '\nSuccessfully downloaded corpus!' + print 'Output in: ' + out_filename + '\n' + else: + print '\nMissing %d of %d tweets!' 
% (missing_count, len(total_list)) + print 'Partial output in: ' + out_filename + '\n' + + return + + +def rebuild_output_corpus(): + user_params = {} + user_params['inList'] = './sanderstwitter02/corpus.csv' + user_params['outList'] = './sanderstwitter02/full-corpus.csv' + user_params['rawDir'] = './sanderstwitter02/rawdata/' + + total_list = read_total_list( user_params['inList'] ) + build_output_corpus( user_params['outList'], user_params['rawDir'], + total_list ) + + +def main(): + + # get user parameters + user_params = get_user_params() + dump_user_params( user_params ) + + # get fetch list + total_list = read_total_list( user_params['inList'] ) + + # filter out only apple tweets + #total_list = filter_list( total_list ) + + # pull only 100 tweets + #total_list = random.sample( total_list, 100 ) + + print 'total tweets : ', len( total_list ) + fetch_list = purge_already_fetched( total_list, user_params['rawDir'] ) + print 'fetch tweets : ', len( fetch_list ) + + # start fetching data from twitter + download_tweets( fetch_list, user_params['rawDir'] ) + + # second pass for any failed downloads + print '\nStarting second pass to retry any failed downloads'; + fetch_list = purge_already_fetched( total_list, user_params['rawDir'] ) + download_tweets( fetch_list, user_params['rawDir'] ) + + # build output corpus + build_output_corpus( user_params['outList'], user_params['rawDir'], + total_list ) + + return + + +if __name__ == '__main__': + main() diff --git a/sentiment.py b/sentiment.py new file mode 100644 index 0000000..2187bf6 --- /dev/null +++ b/sentiment.py @@ -0,0 +1,457 @@ +""" +Sentiment Analysis of Twitter Feeds +@Ayush Pareek +""" + +import sys, os, random +import nltk, re +import collections + +import time + +def get_time_stamp(): + return time.strftime("%y%m%d-%H%M%S-%Z") +def grid(alist, blist): + for a in alist: + for b in blist: + yield(a, b) + +TIME_STAMP = get_time_stamp() + +NUM_SHOW_FEATURES = 100 +SPLIT_RATIO = 0.9 +FOLDS = 10 +LIST_CLASSIFIERS = [ 'NaiveBayesClassifier', 'MaxentClassifier', 'DecisionTreeClassifier', 'SvmClassifier' ] +LIST_METHODS = ['1step', '2step'] + +def k_fold_cross_validation(X, K, randomise = False): + """ + Generates K (training, validation) pairs from the items in X. + + Each pair is a partition of X, where validation is an iterable + of length len(X)/K. So each training iterable is of length (K-1)*len(X)/K. + + If randomise is true, a copy of X is shuffled before partitioning, + otherwise its order is preserved in training and validation. + """ + if randomise: from random import shuffle; X=list(X); shuffle(X) + for k in xrange(K): + training = [x for i, x in enumerate(X) if i % K != k] + validation = [x for i, x in enumerate(X) if i % K == k] + yield training, validation + +#X = [i for i in xrange(97)] +#for training, validation in k_fold_cross_validation(X, K=7): +# for x in X: assert (x in training) ^ (x in validation), x + + +def getTrainingAndTestData(tweets, K, k, method, feature_set): + + add_ngram_feat = feature_set.get('ngram', 1) + add_negtn_feat = feature_set.get('negtn', False) + + + from functools import wraps + import preprocessing + + procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent) \ + for (text, sent, subj, quer) in tweets] + + + + stemmer = nltk.stem.PorterStemmer() + + all_tweets = [] #DATADICT: all_tweets = [ (words, sentiment), ... 
] + for (text, sentiment) in procTweets: + words = [word if(word[0:2]=='__') else word.lower() \ + for word in text.split() \ + if len(word) >= 3] + words = [stemmer.stem(w) for w in words] #DATADICT: words = [ 'word1', 'word2', ... ] + all_tweets.append((words, sentiment)) + + # train_tweets = all_tweets[:int(len(all_tweets)*ratio)] #DATADICT: train_tweets = [ (words, sentiment), ... ] + # test_tweets = all_tweets[int(len(all_tweets)*ratio):] #DATADICT: test_tweets = [ (words, sentiment), ... ] + train_tweets = [x for i,x in enumerate(all_tweets) if i % K !=k] + test_tweets = [x for i,x in enumerate(all_tweets) if i % K ==k] + + unigrams_fd = nltk.FreqDist() + if add_ngram_feat > 1 : + n_grams_fd = nltk.FreqDist() + + for( words, sentiment ) in train_tweets: + words_uni = words + unigrams_fd.update(words) + + if add_ngram_feat>=2 : + words_bi = [ ','.join(map(str,bg)) for bg in nltk.bigrams(words) ] + n_grams_fd.update( words_bi ) + + if add_ngram_feat>=3 : + words_tri = [ ','.join(map(str,tg)) for tg in nltk.trigrams(words) ] + n_grams_fd.update( words_tri ) + + sys.stderr.write( '\nlen( unigrams ) = '+str(len( unigrams_fd.keys() )) ) + + #unigrams_sorted = nltk.FreqDist(unigrams).keys() + unigrams_sorted = unigrams_fd.keys() + #bigrams_sorted = nltk.FreqDist(bigrams).keys() + #trigrams_sorted = nltk.FreqDist(trigrams).keys() + if add_ngram_feat > 1 : + sys.stderr.write( '\nlen( n_grams ) = '+str(len( n_grams_fd )) ) + ngrams_sorted = [ k for (k,v) in n_grams_fd.items() if v>1] + sys.stderr.write( '\nlen( ngrams_sorted ) = '+str(len( ngrams_sorted )) ) + + def get_word_features(words): + bag = {} + words_uni = [ 'has(%s)'% ug for ug in words ] + + if( add_ngram_feat>=2 ): + words_bi = [ 'has(%s)'% ','.join(map(str,bg)) for bg in nltk.bigrams(words) ] + else: + words_bi = [] + + if( add_ngram_feat>=3 ): + words_tri = [ 'has(%s)'% ','.join(map(str,tg)) for tg in nltk.trigrams(words) ] + else: + words_tri = [] + + for f in words_uni+words_bi+words_tri: + bag[f] = 1 + + #bag = collections.Counter(words_uni+words_bi+words_tri) + return bag + + negtn_regex = re.compile( r"""(?: + ^(?:never|no|nothing|nowhere|noone|none|not| + havent|hasnt|hadnt|cant|couldnt|shouldnt| + wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint + )$ + ) + | + n't + """, re.X) + + def get_negation_features(words): + INF = 0.0 + negtn = [ bool(negtn_regex.search(w)) for w in words ] + + left = [0.0] * len(words) + prev = 0.0 + for i in range(0,len(words)): + if( negtn[i] ): + prev = 1.0 + left[i] = prev + prev = max( 0.0, prev-0.1) + + right = [0.0] * len(words) + prev = 0.0 + for i in reversed(range(0,len(words))): + if( negtn[i] ): + prev = 1.0 + right[i] = prev + prev = max( 0.0, prev-0.1) + + return dict( zip( + ['neg_l('+w+')' for w in words] + ['neg_r('+w+')' for w in words], + left + right ) ) + + def counter(func): #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called + @wraps(func) + def tmp(*args, **kwargs): + tmp.count += 1 + return func(*args, **kwargs) + tmp.count = 0 + return tmp + + @counter #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called + def extract_features(words): + features = {} + + word_features = get_word_features(words) + features.update( word_features ) + + if add_negtn_feat : + negation_features = get_negation_features(words) + features.update( negation_features ) + + sys.stderr.write( '\rfeatures extracted for ' + str(extract_features.count) + ' tweets' ) + return features + + extract_features.count = 0; + + + if( '1step' == method 
): + # Apply NLTK's Lazy Map + v_train = nltk.classify.apply_features(extract_features,train_tweets) + v_test = nltk.classify.apply_features(extract_features,test_tweets) + return (v_train, v_test) + + elif( '2step' == method ): + isObj = lambda sent: sent in ['neg','pos'] + makeObj = lambda sent: 'obj' if isObj(sent) else sent + + train_tweets_obj = [ (words, makeObj(sent)) for (words, sent) in train_tweets ] + test_tweets_obj = [ (words, makeObj(sent)) for (words, sent) in test_tweets ] + + train_tweets_sen = [ (words, sent) for (words, sent) in train_tweets if isObj(sent) ] + test_tweets_sen = [ (words, sent) for (words, sent) in test_tweets if isObj(sent) ] + + v_train_obj = nltk.classify.apply_features(extract_features,train_tweets_obj) + v_train_sen = nltk.classify.apply_features(extract_features,train_tweets_sen) + v_test_obj = nltk.classify.apply_features(extract_features,test_tweets_obj) + v_test_sen = nltk.classify.apply_features(extract_features,test_tweets_sen) + + test_truth = [ sent for (words, sent) in test_tweets ] + + return (v_train_obj,v_train_sen,v_test_obj,v_test_sen,test_truth) + + else: + return nltk.classify.apply_features(extract_features,all_tweets) + +def trainAndClassify( tweets, classifier, method, feature_set, fileprefix ): + + INFO = '_'.join( [str(classifier), str(method)] + [ str(k)+str(v) for (k,v) in feature_set.items()] ) + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + directory = os.path.dirname(fileprefix) + if not os.path.exists(directory): + os.makedirs(directory) + realstdout = sys.stdout + sys.stdout = open( fileprefix+'_'+INFO+'.txt' , 'w') + + print INFO + sys.stderr.write( '\n'+ '#'*80 +'\n' + INFO ) + + if('NaiveBayesClassifier' == classifier): + CLASSIFIER = nltk.classify.NaiveBayesClassifier + def train_function(v_train): + return CLASSIFIER.train(v_train) + elif('MaxentClassifier' == classifier): + CLASSIFIER = nltk.classify.MaxentClassifier + def train_function(v_train): + return CLASSIFIER.train(v_train, algorithm='GIS', max_iter=10) + elif('SvmClassifier' == classifier): + CLASSIFIER = nltk.classify.SvmClassifier + def SvmClassifier_show_most_informative_features( self, n=10 ): + print 'unimplemented' + CLASSIFIER.show_most_informative_features = SvmClassifier_show_most_informative_features + def train_function(v_train): + return CLASSIFIER.train(v_train) + elif('DecisionTreeClassifier' == classifier): + CLASSIFIER = nltk.classify.DecisionTreeClassifier + def DecisionTreeClassifier_show_most_informative_features( self, n=10 ): + text = '' + for i in range( 1, 10 ): + text = nltk.classify.DecisionTreeClassifier.pp(self,depth=i) + if len( text.split('\n') ) > n: + break + print text + CLASSIFIER.show_most_informative_features = DecisionTreeClassifier_show_most_informative_features + def train_function(v_train): + return CLASSIFIER.train(v_train, entropy_cutoff=0.05, depth_cutoff=100, support_cutoff=10, binary=False) + + accuracies = [] + if '1step' == method: + for k in range(FOLDS): + (v_train, v_test) = getTrainingAndTestData(tweets, FOLDS, k, method, feature_set) + + sys.stderr.write( '\n[training start]' ) + classifier_tot = train_function(v_train) + sys.stderr.write( ' [training complete]' ) + + print '######################' + print '1 Step Classifier :', classifier + accuracy_tot = nltk.classify.accuracy(classifier_tot, v_test) + print 'Accuracy :', accuracy_tot + print '######################' + print classifier_tot.show_most_informative_features(NUM_SHOW_FEATURES) + print '######################' + + # build confusion matrix 
over test set + test_truth = [s for (t,s) in v_test] + test_predict = [classifier_tot.classify(t) for (t,s) in v_test] + + print 'Accuracy :', accuracy_tot + print 'Confusion Matrix' + print nltk.ConfusionMatrix( test_truth, test_predict ) + + accuracies.append( accuracy_tot ) + print "Accuracies:", accuracies + print "Average Accuracy:", sum(accuracies)/FOLDS + + + elif '2step' == method: + # (v_train, v_test) = getTrainingAndTestData(tweets,SPLIT_RATIO, '1step', feature_set) + + # isObj = lambda sent: sent in ['neg','pos'] + # makeObj = lambda sent: 'obj' if isObj(sent) else sent + + # def makeObj_tweets(v_tweets): + # for (words, sent) in v_tweets: + # print sent, makeObj(sent) + # yield (words, makeObj(sent)) + # def getSen_tweets(v_tweets): + # for (words, sent) in v_tweets: + # print sent, isObj(sent) + # if isObj(sent): + # yield (words, sent) + + + # v_train_obj = makeObj_tweets( v_train ) + # v_test_obj = makeObj_tweets( v_test ) + + # v_train_sen = getSen_tweets( v_train ) + # v_test_sen = getSen_tweets( v_test ) + + accuracies = [] + for k in range(FOLDS): + (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth) = getTrainingAndTestData(tweets, FOLDS, k, method, feature_set) + + sys.stderr.write( '\n[training start]' ) + classifier_obj = train_function(v_train_obj) + sys.stderr.write( ' [training complete]' ) + + sys.stderr.write( '\n[training start]' ) + classifier_sen = train_function(v_train_sen) + sys.stderr.write( ' [training complete]' ) + + print '######################' + print 'Objectivity Classifier :', classifier + accuracy_obj = nltk.classify.accuracy(classifier_obj, v_test_obj) + print 'Accuracy :', accuracy_obj + print '######################' + print classifier_obj.show_most_informative_features(NUM_SHOW_FEATURES) + print '######################' + + test_truth_obj = [s for (t,s) in v_test_obj] + test_predict_obj = [classifier_obj.classify(t) for (t,s) in v_test_obj] + + print 'Accuracy :', accuracy_obj + print 'Confusion Matrix' + print nltk.ConfusionMatrix( test_truth_obj, test_predict_obj ) + + print '######################' + print 'Sentiment Classifier :', classifier + accuracy_sen = nltk.classify.accuracy(classifier_sen, v_test_sen) + print 'Accuracy :', accuracy_sen + print '######################' + print classifier_sen.show_most_informative_features(NUM_SHOW_FEATURES) + print '######################' + + test_truth_sen = [s for (t,s) in v_test_sen] + test_predict_sen = [classifier_sen.classify(t) for (t,s) in v_test_sen] + + print 'Accuracy :', accuracy_sen + print 'Confusion Matrix' + if( len(test_truth_sen) > 0 ): + print nltk.ConfusionMatrix( test_truth_sen, test_predict_sen ) + + v_test_sen2 = [(t,classifier_obj.classify(t)) for (t,s) in v_test_obj] + test_predict = [classifier_sen.classify(t) if s=='obj' else s for (t,s) in v_test_sen2] + + correct = [ t==p for (t,p) in zip(test_truth, test_predict)] + accuracy_tot = float(sum(correct))/len(correct) if correct else 0 + + print '######################' + print '2 - Step Classifier :', classifier + print 'Accuracy :', accuracy_tot + print 'Confusion Matrix' + print nltk.ConfusionMatrix( test_truth, test_predict ) + print '######################' + + classifier_tot = (classifier_obj, classifier_sen) + accuracies.append( accuracy_tot ) + print "Accuracies:", accuracies + print "Average Accuracy:", sum(accuracies)/FOLDS + + sys.stderr.write('\nAccuracies :') + for k in range(FOLDS): + sys.stderr.write(' %0.5f'%accuracies[k]) + sys.stderr.write('\nAverage Accuracy: %0.5f\n'% 
(sum(accuracies)/FOLDS)) + sys.stderr.flush() + + sys.stdout.flush() + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + sys.stdout.close() + sys.stdout = realstdout + + return classifier_tot + +def main(argv) : + __usage__=''' + usage: python sentiment.py logs/fileprefix ClassifierName,s methodName,s ngramVal,s negtnVal,s + ClassifierName,s: %s + methodName,s: %s + ngramVal,s: %s + negtnVal,s: %s + ''' % ( str( LIST_CLASSIFIERS ), str( LIST_METHODS ), str([1,3]), str([0,1]) ) + import sanderstwitter02 + import stanfordcorpus + import stats + + fileprefix = '' + + if (len(argv) >= 1) : + fileprefix = str(argv[0]) + else : + fileprefix = 'logs/run' + + classifierNames = [] + if (len(argv) >= 2) : + classifierNames = [name for name in argv[1].split(',') if name in LIST_CLASSIFIERS] + else : + classifierNames = ['NaiveBayesClassifier'] + + methodNames = [] + if (len(argv) >= 3) : + methodNames = [name for name in argv[2].split(',') if name in LIST_METHODS] + else : + methodNames = ['1step'] + + ngramVals = [] + if (len(argv) >= 4) : + ngramVals = [int(val) for val in argv[3].split(',') if val.isdigit()] + else : + ngramVals = [ 1 ] + + negtnVals = [] + if (len(argv) >= 5) : + negtnVals = [bool(int(val)) for val in argv[4].split(',') if val.isdigit()] + else : + negtnVals = [ False ] + + if (len( fileprefix )==0 or len( classifierNames )==0 or len( methodNames )==0 or len( ngramVals )==0 or len( negtnVals )==0 ): + print __usage__ + return + + tweets1 = sanderstwitter02.getTweetsRawData('sentiment.csv') + tweets2 = stanfordcorpus.getNormalisedTweets('stanfordcorpus/'+stanfordcorpus.FULLDATA+'.5000.norm.csv') + #random.shuffle(tweets1) + #random.shuffle(tweets2) + tweets = tweets1 + tweets2 + random.shuffle( tweets ) + #tweets = tweets[:100] + sys.stderr.write( '\nlen( tweets ) = '+str(len( tweets )) ) + + #sys.stderr.write( '\n' ) + #stats.preprocessingStats( tweets1, fileprefix='logs/stats_'+TIME_STAMP+'/TSC' ) + #sys.stderr.write( '\n' ) + #stats.preprocessingStats( tweets2, fileprefix='')#logs/stats_'+TIME_STAMP+'/STAN' ) + #sys.stderr.write( '\n' ) + #stats.stepStats( tweets , fileprefix='logs/stats_'+TIME_STAMP+'/Both' ) + + #generateARFF(tweets, fileprefix) + + #print classifierNames, methodNames, ngramVals, negtnVals + TIME_STAMP = get_time_stamp() + for (((cname, mname), ngramVal), negtnVal) in grid( grid( grid( classifierNames, methodNames), ngramVals ), negtnVals ): + try: + trainAndClassify( + tweets, classifier=cname, method=mname, + feature_set={'ngram':ngramVal, 'negtn':negtnVal}, + fileprefix=fileprefix+'_'+TIME_STAMP ) + except Exception, e: + print e + sys.stdout.flush() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/stanfordcorpus/__init__.py b/stanfordcorpus/__init__.py new file mode 100644 index 0000000..02d5762 --- /dev/null +++ b/stanfordcorpus/__init__.py @@ -0,0 +1,147 @@ +""" +http://help.sentiment140.com/for-students + +Format +Data file format has 6 fields: +0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive) +1 - the id of the tweet (2087) +2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009) +3 - the query (lyx). If there is no query, then this value is NO_QUERY. 
+4 - the user that tweeted (robotickilldozr)
+5 - the text of the tweet (Lyx is cool)
+
+"""
+
+FULLDATA = 'training.1600000.processed.noemoticon.csv'
+TESTDATA = 'testdata.manual.2009.06.14.csv'
+
+POLARITY = 0 # 0 = negative, 2 = neutral, 4 = positive
+TWID = 1
+DATE = 2
+SUBJ = 3 # NO_QUERY
+USER = 4
+TEXT = 5
+
+import csv, re, random
+
+regex = re.compile( r'\w+|\".*?\"' )
+
+def get_class( polarity ):
+    if polarity in ['0', '1']:
+        return 'neg'
+    elif polarity in ['3', '4']:
+        return 'pos'
+    elif polarity == '2':
+        return 'neu'
+    else:
+        return 'err'
+
+def get_query( subject ):
+    if subject == 'NO_QUERY':
+        return []
+    else:
+        return regex.findall(subject)
+
+def getAllQueries(in_file):
+
+    fp = open(in_file , 'r')
+    rd = csv.reader(fp, delimiter=',', quotechar='"' )
+
+    queries = set([])
+
+    for row in rd:
+        queries.add(row[3])
+
+    print queries
+
+    for q in queries:
+        print q, "\t",
+
+    return queries
+
+def sampleCSV( in_file, out_file, K=100 ):
+
+    fp = open(in_file , 'r')
+    fp2 = open(out_file , 'w')
+
+    for i in range(0,K):
+        line = fp.readline()
+        fp2.write(line)
+
+    fp.close()
+    fp2.close()
+
+    return 0
+
+def randomSampleCSV( in_file, out_file, K=100 ):
+
+    fp = open(in_file , 'r')
+    fq = open(out_file, 'w')
+
+    rows = [None] * K
+
+    i = 0
+    for row in fp:
+        i+=1
+        j = random.randint(1,i)
+        if i <= K:
+            rows[i-1] = row
+        elif j <= K:
+            rows[j-1] = row
+
+    for row in rows:
+        fq.write(row)
+    fp.close()
+    fq.close()
+
+def getNormalisedCSV( in_file, out_file ):
+    fp = open(in_file , 'r')
+    rd = csv.reader(fp, delimiter=',', quotechar='"' )
+
+    fq = open(out_file, 'w')
+    wr = csv.writer(fq, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL )
+
+    for row in rd:
+        queries = get_query(row[SUBJ])
+        wr.writerow( [row[TEXT], get_class(row[POLARITY]), row[SUBJ]] + [len(queries)] + queries )
+
+def getNormalisedTweets(in_file):
+    fp = open(in_file , 'r')
+    rd = csv.reader(fp, delimiter=',', quotechar='"' )
+    #print in_file, countlines( in_file )
+
+    tweets = []
+    count = 0
+    for row in rd:
+        numQueries = int(row[3])
+        tweets.append( row[:3] + [row[4:4+numQueries]] )
+        count+=1
+
+    #print count
+    #print 'len(tweets) =', len(tweets)
+    return tweets
+
+def countlines( filename ):
+    count = 0
+    with open( filename, 'r' ) as fp:
+        for line in fp:
+            count+=1
+    return count
+
+#getAllQueries( 'testdata.manual.2009.06.14.csv' )
+#getAllQueries( 'training.1600000.processed.noemoticon.csv' )
+
+#randomSampleCSV(FULLDATA, FULLDATA+'.sample.csv')
+#sampleCSV(TESTDATA, TESTDATA+'.sample.csv')
+
+#getNormalisedCSV(FULLDATA+'.sample.csv', FULLDATA+'.norm.csv')
+
+#randomSampleCSV(FULLDATA, FULLDATA+'.100000.sample.csv', K=100000)
+#getNormalisedCSV(FULLDATA+'.100000.sample.csv', FULLDATA+'.100000.norm.csv')
+
+
+SampleTweetsStats = '''
+   Class    Count   Example
+     neg     2449   @jbrotherlove I thought it was a great love story
+     pos     2551   I hope that these kitchen renos don't last any longer...
they are so annoying +''' diff --git a/stats.py b/stats.py new file mode 100644 index 0000000..a18566a --- /dev/null +++ b/stats.py @@ -0,0 +1,289 @@ +#@uthor: Ayush Pareek +import sys, time, os +import random, re, csv, collections +import nltk, pylab, numpy + +import preprocessing + +def printClassStats( tweets ): + tweets_counter = collections.Counter( [t[1] for t in tweets] ) + print '%8s %8s %s' % ('Class', 'Count', 'Example') + for (sent, count) in tweets_counter.items(): + print '%8s %8d %s' % (sent, count, random.choice([t for (t,s,_,_) in tweets if s==sent ]) ) + +def printFeaturesStats( tweets ): + arr_Handles = numpy.array( [0]*len(tweets) ) + arr_Hashtags = numpy.array( [0]*len(tweets) ) + arr_Urls = numpy.array( [0]*len(tweets) ) + arr_Emoticons = numpy.array( [0]*len(tweets) ) + arr_Words = numpy.array( [0]*len(tweets) ) + arr_Chars = numpy.array( [0]*len(tweets) ) + + + i=0 + for (text, sent, subj, quer) in tweets: + arr_Handles[i] = preprocessing.countHandles(text) + arr_Hashtags[i] = preprocessing.countHashtags(text) + arr_Urls[i] = preprocessing.countUrls(text) + arr_Emoticons[i] = preprocessing.countEmoticons(text) + arr_Words[i] = len(text.split()) + arr_Chars[i] = len(text) + i+=1 + + print '%-10s %-010s %-4s '%('Features', 'Average', 'Maximum') + print '%10s %10.6f %10d'%('Handles', arr_Handles.mean(), arr_Handles.max() ) + print '%10s %10.6f %10d'%('Hashtags', arr_Hashtags.mean(), arr_Hashtags.max() ) + print '%10s %10.6f %10d'%('Urls', arr_Urls.mean(), arr_Urls.max() ) + print '%10s %10.6f %10d'%('Emoticons', arr_Emoticons.mean(), arr_Emoticons.max() ) + print '%10s %10.6f %10d'%('Words', arr_Words.mean(), arr_Words.max() ) + print '%10s %10.6f %10d'%('Chars', arr_Chars.mean(), arr_Chars.max() ) + +def printReductionStats( tweets, function, filtering=True): + if( function ): + procTweets = [ (function(text, subject=subj, query=quer), sent) \ + for (text, sent, subj, quer) in tweets] + else: + procTweets = [ (text, sent) \ + for (text, sent, subj, quer) in tweets] + tweetsArr = [] + for (text, sentiment) in procTweets: + words = [word if(word[0:2]=='__') else word.lower() \ + for word in text.split() \ + if ( (not filtering) | (len(word) >= 3) ) ] + tweetsArr.append([words, sentiment]) + # tweetsArr + bag = collections.Counter() + for (words, sentiment) in tweetsArr: + bag.update(words) + # unigram + + print '%20s %-10s %12d'% ( + ('None' if function is None else function.__name__), + ( 'gte3' if filtering else 'all' ), + sum(bag.values()) + ) + return True + +def printAllRecuctionStats(tweets): + print '%-20s %-10s %-12s'% ( 'Preprocessing', 'Filter', 'Words' ) + printReductionStats( tweets, None, False ) + #printReductionStats( tweets, None, True ) + printReductionStats( tweets, preprocessing.processHashtags, True ) + printReductionStats( tweets, preprocessing.processHandles, True ) + printReductionStats( tweets, preprocessing.processUrls, True ) + printReductionStats( tweets, preprocessing.processEmoticons, True ) + printReductionStats( tweets, preprocessing.processPunctuations, True ) + printReductionStats( tweets, preprocessing.processRepeatings, True ) + #printReductionStats( tweets, preprocessing.processAll, False ) + printReductionStats( tweets, preprocessing.processAll, True ) + +def printFreqDistCSV( dist, filename='' ): + n_samples = len(dist.keys()) + n_repeating_samples = sum([ 1 for (k,v) in dist.items + () if v>1 ]) + n_outcomes = dist._N + print '%-12s %-12s %-12s'%( 'Samples', 'RepSamples', 'Outcomes' ) + print '%12d %12d %12d'%( n_samples, 
n_repeating_samples, n_outcomes ) + + if( len(filename)>0 and '_'!=filename[0] ): + with open( filename, 'w' ) as fcsv: + distwriter = csv.writer( fcsv, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC ) + + for (key,value) in dist.items(): + distwriter.writerow( [key, value] ) #print key, '\t,\t', dist[key] + +def preprocessingStats( tweets, fileprefix='' ): + + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + directory = os.path.dirname(fileprefix) + if not os.path.exists(directory): + os.makedirs(directory) + print 'writing to', fileprefix+'_stats.txt' + realstdout = sys.stdout + sys.stdout = open( fileprefix+'_stats.txt' , 'w') + + ########################################################################### + + print 'for', len(tweets), 'tweets:' + + print '###########################################################################' + + printFeaturesStats( tweets ) + + print '###########################################################################' + + printAllRecuctionStats( tweets ) + + print '###########################################################################' + + procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent) \ + for (text, sent, subj, quer) in tweets] + tweetsArr = [] + for (text, sentiment) in procTweets: + words = [word if(word[0:2]=='__') else word.lower() \ + for word in text.split() \ + if ( (len(word) >= 3) ) ] + tweetsArr.append([words, sentiment]) + unigrams_fd = nltk.FreqDist() + bigrams_fd = nltk.FreqDist() + trigrams_fd = nltk.FreqDist() + for (words, sentiment) in tweetsArr: + words_bi = [ ','.join(map(str,bg)) for bg in nltk.bigrams(words) ] + words_tri = [ ','.join(map(str,tg)) for tg in nltk.trigrams(words) ] + unigrams_fd.update( words ) + bigrams_fd.update( words_bi ) + trigrams_fd.update( words_tri ) + + print 'Unigrams Distribution' + printFreqDistCSV(unigrams_fd, filename=fileprefix+'_1grams.csv') + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + pylab.show = lambda : pylab.savefig(fileprefix+'_1grams.pdf') + unigrams_fd.plot(50, cumulative=True) + pylab.close() + + print 'Bigrams Distribution' + printFreqDistCSV(bigrams_fd, filename=fileprefix+'_2grams.csv') + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + pylab.show = lambda : pylab.savefig(fileprefix+'_2grams.pdf') + bigrams_fd.plot(50, cumulative=True) + pylab.close() + + print 'Trigrams Distribution' + printFreqDistCSV(trigrams_fd, filename=fileprefix+'_3grams.csv') + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + pylab.show = lambda : pylab.savefig(fileprefix+'_3grams.pdf') + trigrams_fd.plot(50, cumulative=True) + pylab.close() + + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + pylab.show = lambda : pylab.savefig(fileprefix+'_ngrams.pdf') + unigrams_fd.plot(50, cumulative=True) + bigrams_fd.plot(50, cumulative=True) + trigrams_fd.plot(50, cumulative=True) + pylab.close() + + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + sys.stdout.close() + sys.stdout = realstdout + +def stepStats( tweets, num_bins=10, split='easy', fileprefix='' ): + tot_size = len(tweets) + num_digits = len(str(tot_size)) + + if split=='equal': + sizes = [ int((r+1.0)/num_bins*tot_size) for r in range( num_bins ) ] + elif split=='log': + sizes = [ int(2**(math.log(tot_size,2)*(r+1.0)/num_bins) ) for r in range( num_bins ) ] + else: # split=='easy' + sizes = range( 0, tot_size, tot_size/num_bins)[1:]+[tot_size] + + for s in sizes: + print 'processing stats for %d tweets'%s + preprocessingStats( tweets[0:s], fileprefix+'_%0{0}d'.format(num_digits) % s ) + +def 
oldStats2CSV( in_file, fileprefix=''):
+    if fileprefix == '':
+        fileprefix = in_file[:-len('_stats.txt')]  # strip the '_stats.txt' suffix
+    fp = open( in_file, 'r' )
+    fq = open( fileprefix+'_statsnew.txt', 'w' )
+
+    line = ''
+    line_start = 0
+    line_count = 20
+    line_end = line_start+line_count
+    for line_num in range(line_start, line_end): # write Statistics
+        line = fp.readline()
+        fq.write( line )
+
+    for section in [1,2,3]:
+        line_start = line_end
+        line_count = 2
+        line_end = line_start+line_count
+        for line_num in range( line_start, line_end ):
+            line = fp.readline()
+            fq.write( line )
+
+        line_start = line_end
+        line_count = [int(l) for l in line.split() if l.isdigit()][0]
+        line_end = line_start+line_count
+        fr = open( fileprefix+'_%dgrams.csv'%section, 'w')
+        fwrt = csv.writer( fr, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC )
+        for line_num in range( line_start, line_end ): # write n-gram section
+            line = fp.readline()
+            row = line.split('\t,\t')
+            row[0] = row[0].strip()
+            row[1] = int(row[1])
+            fwrt.writerow( row )
+        fr.close()
+
+    fp.close()
+    fq.close()
+
+stats_titles = [
+'"# tweets"',
+'"avg(Handles)"',
+'"max(Handles)"',
+'"avg(Hashtags)"',
+'"max(Hashtags)"',
+'"avg(Urls)"',
+'"max(Urls)"',
+'"avg(Emoticons)"',
+'"max(Emoticons)"',
+'"avg(Words)"',
+'"max(Words)"',
+'"avg(Chars)"',
+'"max(Chars)"',
+'"preprocessing(None)"',
+'"preprocessing(Hashtags)"',
+'"preprocessing(Handles)"',
+'"preprocessing(Urls)"',
+'"preprocessing(Emoticons)"',
+'"preprocessing(Punctuations)"',
+'"preprocessing(Repeatings)"',
+'"preprocessing(All)"',
+'"Unigrams samples"',
+'"Unigrams repeating samples"',
+'"Unigrams outcomes"',
+'"Bigrams samples"',
+'"Bigrams repeating samples"',
+'"Bigrams outcomes"',
+'"Trigrams samples"',
+'"Trigrams repeating samples"',
+'"Trigrams outcomes"',
+]
+
+def newStats2CSV(files, out_file):
+
+    arr = [ [] ] * len(files)
+
+    for j in range( len(files)):
+        values = []
+        with open(files[j], 'r') as fp:
+            for line in fp:
+                values += [ float(w) for w in line.split()\
+                            if w[0] in ['0','1','2','3','4','5','6','7','8','9'] ]
+        arr[j] = values
+
+    with open(out_file, 'w') as fq:
+        stats_writer = csv.writer( fq, delimiter=',', quotechar='\'')#, quoting=csv.QUOTE_NONE )
+        for i in range(0,len(stats_titles)):
+            row = [stats_titles[i]] + [arr[j][i] for j in range(len(files))]
+            stats_writer.writerow( row )
+
+
+filelist = [
+'logs/stats_140617-214922-IST/Both_0978_stats.txt',
+'logs/stats_140617-214922-IST/Both_1956_stats.txt',
+'logs/stats_140617-214922-IST/Both_2934_stats.txt',
+'logs/stats_140617-214922-IST/Both_3912_stats.txt',
+'logs/stats_140617-214922-IST/Both_4890_stats.txt',
+'logs/stats_140617-214922-IST/Both_5868_stats.txt',
+'logs/stats_140617-214922-IST/Both_6846_stats.txt',
+'logs/stats_140617-214922-IST/Both_7824_stats.txt',
+'logs/stats_140617-214922-IST/Both_8802_stats.txt',
+'logs/stats_140617-214922-IST/Both_9780_stats.txt',
+'logs/stats_140617-214922-IST/Both_9782_stats.txt',
+]
+
+
+
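
For reviewers, a minimal usage sketch of the new modules follows. It is illustrative only and rests on a few assumptions: Python 2 (matching the codebase), the packages being importable from the repository root, and the Sanders corpus already exported to sentiment.csv as sentiment.py expects; the sample tweet text and URL are invented.

    # Usage sketch: assumes Python 2 and a sentiment.csv in the working directory.
    import preprocessing
    import sanderstwitter02

    # Rewrite hashtags, handles, URLs, emoticons and punctuation into the
    # __HASH_* / __HNDL / __URL / __EMOT_* / __PUNC_* placeholder tokens and
    # collapse repeated characters (see preprocessing/__init__.py).
    print preprocessing.processAll(
        "@apple the new #iPhone is soooo cool :) http://t.co/abc", query=['@apple'])

    # Load [text, class, subject, query] rows from the Sanders corpus.
    tweets = sanderstwitter02.getTweetsRawData('sentiment.csv')
    print len(tweets), 'tweets loaded'

The classifier driver itself is run from the command line in the form given by the usage string in sentiment.py's main(), for example: python sentiment.py logs/run NaiveBayesClassifier 1step 1 0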