diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py
new file mode 100644
index 0000000..2658f07
--- /dev/null
+++ b/preprocessing/__init__.py
@@ -0,0 +1,153 @@
+# This Python file uses the following encoding: utf-8
+import re
+
+# Hashtags
+hash_regex = re.compile(r"#(\w+)")
+def hash_repl(match):
+    return '__HASH_'+match.group(1).upper()
+
+# Handles
+hndl_regex = re.compile(r"@(\w+)")
+def hndl_repl(match):
+    return '__HNDL'  # was: '__HNDL_'+match.group(1).upper()
+
+# URLs
+url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")
+
+# Splitting on word boundaries
+word_bound_regex = re.compile(r"\W+")
+
+# Repeated characters, e.g. hurrrryyyyyy
+rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)
+def rpt_repl(match):
+    return match.group(1)+match.group(1)
+
+# Emoticons
+emoticons = \
+    [   ('__EMOT_SMILEY',  [':-)', ':)', '(:', '(-:', ] ) ,\
+        ('__EMOT_LAUGH',   [':-D', ':D', 'X-D', 'XD', 'xD', ] ) ,\
+        ('__EMOT_LOVE',    ['<3', ':\*', ] ) ,\
+        ('__EMOT_WINK',    [';-)', ';)', ';-D', ';D', '(;', '(-;', ] ) ,\
+        ('__EMOT_FROWN',   [':-(', ':(', '):', ')-:', ] ) ,\
+        ('__EMOT_CRY',     [':,(', ':\'(', ':"(', ':(('] ) ,\
+    ]
+
+# Punctuation
+punctuations = \
+    [   #('',              ['.', ] ) ,\
+        #('',              [',', ] ) ,\
+        #('',              ['\'', '\"', ] ) ,\
+        ('__PUNC_EXCL',    ['!', '¡', ] ) ,\
+        ('__PUNC_QUES',    ['?', '¿', ] ) ,\
+        ('__PUNC_ELLP',    ['...', '…', ] ) ,\
+        #FIXME : MORE? http://en.wikipedia.org/wiki/Punctuation
+    ]
+
+# Printing functions for info
+def print_config(cfg):
+    for (x, arr) in cfg:
+        print x, '\t',
+        for a in arr:
+            print a, '\t',
+        print ''
+
+def print_emoticons():
+    print_config(emoticons)
+
+def print_punctuations():
+    print_config(punctuations)
+
+# For emoticon regexes
+def escape_paren(arr):
+    return [text.replace(')', '[)}\]]').replace('(', '[({\[]') for text in arr]
+
+def regex_union(arr):
+    return '(' + '|'.join( arr ) + ')'
+
+emoticons_regex = [ (repl, re.compile(regex_union(escape_paren(regx))) ) \
+                    for (repl, regx) in emoticons ]
+
+# For punctuation replacement
+def punctuations_repl(match):
+    text = match.group(0)
+    repl = []
+    for (key, parr) in punctuations :
+        for punc in parr :
+            if punc in text:
+                repl.append(key)
+    if( len(repl)>0 ) :
+        return ' '+' '.join(repl)+' '
+    else :
+        return ' '
+
+def processHashtags( text, subject='', query=[]):
+    return re.sub( hash_regex, hash_repl, text )
+
+def processHandles( text, subject='', query=[]):
+    return re.sub( hndl_regex, hndl_repl, text )
+
+def processUrls( text, subject='', query=[]):
+    return re.sub( url_regex, ' __URL ', text )
+
+def processEmoticons( text, subject='', query=[]):
+    for (repl, regx) in emoticons_regex :
+        text = re.sub(regx, ' '+repl+' ', text)
+    return text
+
+def processPunctuations( text, subject='', query=[]):
+    return re.sub( word_bound_regex , punctuations_repl, text )
+
+def processRepeatings( text, subject='', query=[]):
+    return re.sub( rpt_regex, rpt_repl, text )
+
+def processQueryTerm( text, subject='', query=[]):
+    query_regex = "|".join([ re.escape(q) for q in query])
+    return re.sub( query_regex, '__QUER', text, flags=re.IGNORECASE )
+
+def countHandles(text):
+    return len( re.findall( hndl_regex, text) )
+def countHashtags(text):
+    return len( re.findall( hash_regex, text) )
+def countUrls(text):
+    return len( re.findall( url_regex, text) )
+def countEmoticons(text):
+    count = 0
+    for (repl, regx) in emoticons_regex :
+        count += len( re.findall( regx, text) )
+    return count
+
+#FIXME: preprocessing.preprocess() will need to move.
+#FIXME: use process functions inside +def processAll( text, subject='', query=[]): + + if(len(query)>0): + query_regex = "|".join([ re.escape(q) for q in query]) + text = re.sub( query_regex, '__QUER', text, flags=re.IGNORECASE ) + + text = re.sub( hash_regex, hash_repl, text ) + text = re.sub( hndl_regex, hndl_repl, text ) + text = re.sub( url_regex, ' __URL ', text ) + + for (repl, regx) in emoticons_regex : + text = re.sub(regx, ' '+repl+' ', text) + + + text = text.replace('\'','') + # FIXME: Jugad + + text = re.sub( word_bound_regex , punctuations_repl, text ) + text = re.sub( rpt_regex, rpt_repl, text ) + + return text + +#from time import time +#import preprocessing, sanderstwitter02 +#tweets = sanderstwitter02.getTweetsRawData('sentiment.csv') +#start = time() +#procTweets = [ (preprocessing.preprocess(t),s) for (t,s) in tweets] +#end = time() +#end - start + +#uni = [ a if(a[0:2]=='__') else a.lower() for a in re.findall(r"\w+", text) ] +#bi = nltk.bigrams(uni) +#tri = nltk.trigrams(uni) diff --git a/sandersfeatures/__init__.py b/sandersfeatures/__init__.py new file mode 100644 index 0000000..2288412 --- /dev/null +++ b/sandersfeatures/__init__.py @@ -0,0 +1 @@ +import tweet_features, tweet_pca diff --git a/sandersfeatures/tweet_features.py b/sandersfeatures/tweet_features.py new file mode 100644 index 0000000..c5f2986 --- /dev/null +++ b/sandersfeatures/tweet_features.py @@ -0,0 +1,146 @@ +""" +@package tweet_features +Convert tweet to feature vector. + +These routines help convert arbitrary tweets in to feature vectors. + +""" +import numpy + + +# search patterns for features +testFeatures = \ + [('hasAddict', (' addict',)), \ + ('hasAwesome', ('awesome',)), \ + ('hasBroken', ('broke',)), \ + ('hasBad', (' bad',)), \ + ('hasBug', (' bug',)), \ + ('hasCant', ('cant','can\'t')), \ + ('hasCrash', ('crash',)), \ + ('hasCool', ('cool',)), \ + ('hasDifficult', ('difficult',)), \ + ('hasDisaster', ('disaster',)), \ + ('hasDown', (' down',)), \ + ('hasDont', ('dont','don\'t','do not','does not','doesn\'t')), \ + ('hasEasy', (' easy',)), \ + ('hasExclaim', ('!',)), \ + ('hasExcite', (' excite',)), \ + ('hasExpense', ('expense','expensive')), \ + ('hasFail', (' fail',)), \ + ('hasFast', (' fast',)), \ + ('hasFix', (' fix',)), \ + ('hasFree', (' free',)), \ + ('hasFrowny', (':(', '):')), \ + ('hasFuck', ('fuck',)), \ + ('hasGood', ('good','great')), \ + ('hasHappy', (' happy',' happi')), \ + ('hasHate', ('hate',)), \ + ('hasHeart', ('heart', '<3')), \ + ('hasIssue', (' issue',)), \ + ('hasIncredible', ('incredible',)), \ + ('hasInterest', ('interest',)), \ + ('hasLike', (' like',)), \ + ('hasLol', (' lol',)), \ + ('hasLove', ('love','loving')), \ + ('hasLose', (' lose',)), \ + ('hasNeat', ('neat',)), \ + ('hasNever', (' never',)), \ + ('hasNice', (' nice',)), \ + ('hasPoor', ('poor',)), \ + ('hasPerfect', ('perfect',)), \ + ('hasPlease', ('please',)), \ + ('hasSerious', ('serious',)), \ + ('hasShit', ('shit',)), \ + ('hasSlow', (' slow',)), \ + ('hasSmiley', (':)', ':D', '(:')), \ + ('hasSuck', ('suck',)), \ + ('hasTerrible', ('terrible',)), \ + ('hasThanks', ('thank',)), \ + ('hasTrouble', ('trouble',)), \ + ('hasUnhappy', ('unhapp',)), \ + ('hasWin', (' win ','winner','winning')), \ + ('hasWinky', (';)',)), \ + ('hasWow', ('wow','omg')) ] + + +def make_tweet_nparr( txt ): + """ + Extract tweet feature vector as NumPy array. 
+    """
+    # result storage
+    fvec = numpy.empty( len(testFeatures) )
+
+    # search for each feature
+    txtLow = ' ' + txt.lower() + ' '
+    for i in range( 0, len(testFeatures) ):
+
+        key = testFeatures[i][0]
+
+        fvec[i] = False
+        for tstr in testFeatures[i][1]:
+            fvec[i] = fvec[i] or (txtLow.find(tstr) != -1)
+
+    return fvec
+
+
+def make_tweet_dict( txt ):
+    """
+    Extract tweet feature vector as dictionary.
+    """
+    txtLow = ' ' + txt.lower() + ' '
+
+    # result storage
+    fvec = {}
+
+    # search for each feature
+    for test in testFeatures:
+
+        key = test[0]
+
+        fvec[key] = False
+        for tstr in test[1]:
+            fvec[key] = fvec[key] or (txtLow.find(tstr) != -1)
+
+    return fvec
+
+
+def tweet_dict_to_nparr( dict ):
+    """
+    Convert dictionary feature vector to NumPy array.
+    """
+    fvec = numpy.empty( len(testFeatures) )
+
+    for i in range( 0, len(testFeatures) ):
+        fvec[i] = dict[ testFeatures[i][0] ]
+
+    return fvec
+
+
+def tweet_nparr_to_dict( nparr, use_standard_features=False ):
+    """
+    Convert NumPy array to dictionary.
+    """
+    fvec = {}
+
+    if use_standard_features:
+        assert len(nparr) == len(testFeatures)
+        fvec = {}
+        for i in range( 0, len(nparr) ):
+            fvec[ testFeatures[i][0] ] = nparr[i]
+
+    else:
+        for i in range( 0, len(nparr) ):
+            fvec[ str(i) ] = nparr[i]
+
+    return fvec
+
+
+def is_zero_dict( dict ):
+    """
+    Identifies empty feature vectors.
+    """
+    has_any_features = False
+    for key in dict:
+        has_any_features = has_any_features or dict[key]
+
+    return not has_any_features
diff --git a/sandersfeatures/tweet_pca.py b/sandersfeatures/tweet_pca.py
new file mode 100644
index 0000000..0659fe0
--- /dev/null
+++ b/sandersfeatures/tweet_pca.py
@@ -0,0 +1,47 @@
+"""
+@package tweet_pca
+PCA for dimensionality reduction.
+
+"""
+import mdp, numpy
+import tweet_features
+
+import pdb
+
+
+def tweet_pca_reduce( tweets_train, tweets_test, output_dim ):
+
+    # convert dictionary feature vecs to numpy array
+    print '--> Converting dictionaries to NumPy arrays'
+    train_arr = numpy.array( [tweet_features.tweet_dict_to_nparr(t) for \
+                              (t,s) in tweets_train])
+
+    test_arr = numpy.array( [tweet_features.tweet_dict_to_nparr(t) for \
+                             (t,s) in tweets_test])
+
+
+    # compute principal components over training set
+    print '--> Computing PCA'
+    pca_array = mdp.pca( train_arr.transpose(), \
+                         svd=True, output_dim=output_dim )
+
+
+    # project both train and test sets to PC space
+    print '--> Projecting feature vectors to PC space'
+
+    train_arr = numpy.dot( train_arr, pca_array )
+    test_arr = numpy.dot( test_arr, pca_array )
+
+
+    # convert projected vecs back to reduced dictionaries
+    print '--> Converting NumPy arrays to dictionaries'
+
+    reduced_train = \
+        zip( [tweet_features.tweet_nparr_to_dict(v) for v in train_arr], \
+             [s for (t,s) in tweets_train] )
+
+    reduced_test = \
+        zip( [tweet_features.tweet_nparr_to_dict(v) for v in test_arr], \
+             [s for (t,s) in tweets_test])
+
+    return (reduced_train, reduced_test)
diff --git a/sanderstwitter02/__init__.py b/sanderstwitter02/__init__.py
new file mode 100644
index 0000000..49fbb99
--- /dev/null
+++ b/sanderstwitter02/__init__.py
@@ -0,0 +1,33 @@
+import csv
+
+queryTerms = {\
+    'apple'     : ['@apple', ], \
+    'microsoft' : ['#microsoft', ], \
+    'google'    : ['#google', ], \
+    'twitter'   : ['#twitter', ], \
+    }
+
+def getTweetsRawData( fileName ):
+    # read all tweets and labels
+    fp = open( fileName, 'rb' )
+    reader = csv.reader( fp, delimiter=',', quotechar='"', escapechar='\\' )
+    tweets = []
+    for row in reader:
+        tweets.append( [row[4], row[1], row[0],
queryTerms[(row[0]).lower()] ] ) + # treat neutral and irrelevant the same + for t in tweets: + if (t[1] == 'positive'): + t[1] = 'pos' + elif (t[1] == 'negative'): + t[1] = 'neg' + elif (t[1] == 'irrelevant')|(t[1] == 'neutral'): + t[1] = 'neu' + + return tweets # 0: Text # 1: class # 2: subject # 3: query + +SampleTweetsStats = ''' + Class Count Example + neg 529 #Skype often crashing: #microsoft, what are you doing? + neu 3770 How #Google Ventures Chooses Which Startups Get Its $200 Million http://t.co/FCWXoUd8 via @mashbusiness @mashable + pos 483 Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is +''' diff --git a/sanderstwitter02/install.py b/sanderstwitter02/install.py new file mode 100644 index 0000000..4f7f45f --- /dev/null +++ b/sanderstwitter02/install.py @@ -0,0 +1,268 @@ +# +# Sanders-Twitter Sentiment Corpus Install Script +# Version 0.1 +# +# Adapted from http://www.sananalytics.com/lab/twitter-sentiment/ +# +# Yogesh Garg +# yogeshg91@gmail.com +# +import csv, getpass, json, os, time, random + +# using python-twitter library +import twitter + + +def get_user_params(): + + user_params = {} + + # get user input params + user_params['inList'] = '' #raw_input( '\nInput file [./corpus.csv]: ' ) + user_params['outList'] = '' #raw_input( 'Results file [./full-corpus.csv]: ' ) + user_params['rawDir'] = '' #raw_input( 'Raw data dir [./rawdata/]: ' ) + + # apply defaults + if user_params['inList'] == '': + user_params['inList'] = './corpus.csv' + if user_params['outList'] == '': + user_params['outList'] = './full-corpus.csv' + if user_params['rawDir'] == '': + user_params['rawDir'] = './rawdata/' + + return user_params + + +def dump_user_params( user_params ): + + # dump user params for confirmation + print 'Input: ' + user_params['inList'] + print 'Output: ' + user_params['outList'] + print 'Raw data: ' + user_params['rawDir'] + return + +def filter_list( total_list ) : + # filtering only apple for test purposes + indices = [i for i in range( len( total_list ) ) if (total_list[i])[0] == "apple"] + return [total_list[i] for i in indices] + +def read_total_list( in_filename ): + + # read total fetch list csv + fp = open( in_filename, 'rb' ) + reader = csv.reader( fp, delimiter=',', quotechar='"' ) + + total_list = [] + for row in reader: + total_list.append( row ) + + return total_list + + +def purge_already_fetched( fetch_list, raw_dir ): + + # list of tweet ids that still need downloading + rem_list = [] + count = 0; + + # check each tweet to see if we have it + for item in fetch_list: + + # check if json file exists + tweet_file = raw_dir + item[2] + '.json' + if os.path.exists( tweet_file ): + + # attempt to parse json file + try: + parse_tweet_json( tweet_file ) + count = count + 1 + print '--> already downloaded #' + item[2] + except RuntimeError: + rem_list.append( item ) + else: + rem_list.append( item ) + + print 'already fetched :', count + + return rem_list + + +def get_time_left_str( cur_idx, fetch_list, download_pause ): + + tweets_left = len(fetch_list) - cur_idx + total_seconds = tweets_left * download_pause + + str_hr = int( total_seconds / 3600 ) + str_min = int((total_seconds - str_hr*3600) / 60) + str_sec = total_seconds - str_hr*3600 - str_min*60 + + return '%dh %dm %ds' % (str_hr, str_min, str_sec) + + +def download_tweets( fetch_list, raw_dir ): + + # proxy settings for downloading behind a proxy + #os.environ['http_proxy'] = 'http://10.10.78.21:3128/' + #os.environ['https_proxy'] = 'http://10.10.78.21:3128/' + + # using 
python-twitter library + api = twitter.Api(consumer_key='yDkaORxEcwX6SheX6pa1fw', + consumer_secret='VYIGd2KITohR4ygmHrcyZgV0B74CXi5wsT1eryVtw', + access_token_key='227846642-8IjK2K32CDFt3682SNOOpnzegAja3TyVpzFOGrQj', + access_token_secret='L6of20EZdBv48EA2GE8Js6roIfZFnCKBpoPwvBDxF8', + input_encoding=None, cache=None) + + # ensure raw data directory exists + if not os.path.exists( raw_dir ): + os.mkdir( raw_dir ) + + # stay within rate limits + max_tweets_per_hr = 180*4 + download_pause_sec = 3600 / max_tweets_per_hr + + # download tweets + for idx in range(0,len(fetch_list)): + + # current item + item = fetch_list[idx] + + # print status + trem = get_time_left_str( idx, fetch_list, download_pause_sec ) + print '--> downloading tweet #%s (%d of %d) (%s left)' % \ + (item[2], idx+1, len(fetch_list), trem) + + # pull data + start = time.time() + try: + tweetStatus = api.GetStatus(item[2]) + tweetFile = open(raw_dir + item[2] + '.json', 'w') + tweetFile.write( tweetStatus.AsJsonString() ) + tweetFile.close() + except Exception, e: + print 'Cannot download tweet #'+item[2] + print e + end = time.time() + + # stay in Twitter API rate limits + print ' pausing %.2f sec to obey Twitter API rate limits' % \ + (download_pause_sec-(end-start)) + time.sleep( download_pause_sec-(end-start) ) + + return + + +def parse_tweet_json( filename ): + + # read tweet + print 'opening: ' + filename + fp = open( filename, 'rb' ) + + # parse json + try: + tweet_json = json.load( fp ) + except ValueError: + raise RuntimeError('error parsing json') + + # look for twitter api error msgs + if 'error' in tweet_json: + raise RuntimeError('error in downloaded tweet') + + # extract creation date and tweet text + return [ tweet_json['created_at'], tweet_json['text'] ] + + +def build_output_corpus( out_filename, raw_dir, total_list ): + + # open csv output file + fp = open( out_filename, 'wb' ) + writer = csv.writer( fp, delimiter=',', quotechar='"', escapechar='\\', + quoting=csv.QUOTE_ALL ) + + # write header row + #writer.writerow( ['Topic','Sentiment','TweetId','TweetDate','TweetText'] ) + + # parse all downloaded tweets + missing_count = 0 + for item in total_list: + + # ensure tweet exists + if os.path.exists( raw_dir + item[2] + '.json' ): + + try: + # parse tweet + parsed_tweet = parse_tweet_json( raw_dir + item[2] + '.json' ) + full_row = item + parsed_tweet + + # character encoding for output + for i in range(0,len(full_row)): + full_row[i] = full_row[i].encode("utf-8").replace('\n',' ') + + # write csv row + writer.writerow( full_row ) + + except RuntimeError: + print '--> bad data in tweet #' + item[2] + missing_count += 1 + + else: + print '--> missing tweet #' + item[2] + missing_count += 1 + + # indicate success + if missing_count == 0: + print '\nSuccessfully downloaded corpus!' + print 'Output in: ' + out_filename + '\n' + else: + print '\nMissing %d of %d tweets!' 
% (missing_count, len(total_list)) + print 'Partial output in: ' + out_filename + '\n' + + return + + +def rebuild_output_corpus(): + user_params = {} + user_params['inList'] = './sanderstwitter02/corpus.csv' + user_params['outList'] = './sanderstwitter02/full-corpus.csv' + user_params['rawDir'] = './sanderstwitter02/rawdata/' + + total_list = read_total_list( user_params['inList'] ) + build_output_corpus( user_params['outList'], user_params['rawDir'], + total_list ) + + +def main(): + + # get user parameters + user_params = get_user_params() + dump_user_params( user_params ) + + # get fetch list + total_list = read_total_list( user_params['inList'] ) + + # filter out only apple tweets + #total_list = filter_list( total_list ) + + # pull only 100 tweets + #total_list = random.sample( total_list, 100 ) + + print 'total tweets : ', len( total_list ) + fetch_list = purge_already_fetched( total_list, user_params['rawDir'] ) + print 'fetch tweets : ', len( fetch_list ) + + # start fetching data from twitter + download_tweets( fetch_list, user_params['rawDir'] ) + + # second pass for any failed downloads + print '\nStarting second pass to retry any failed downloads'; + fetch_list = purge_already_fetched( total_list, user_params['rawDir'] ) + download_tweets( fetch_list, user_params['rawDir'] ) + + # build output corpus + build_output_corpus( user_params['outList'], user_params['rawDir'], + total_list ) + + return + + +if __name__ == '__main__': + main() diff --git a/sentiment.py b/sentiment.py new file mode 100644 index 0000000..2187bf6 --- /dev/null +++ b/sentiment.py @@ -0,0 +1,457 @@ +""" +Sentiment Analysis of Twitter Feeds +@Ayush Pareek +""" + +import sys, os, random +import nltk, re +import collections + +import time + +def get_time_stamp(): + return time.strftime("%y%m%d-%H%M%S-%Z") +def grid(alist, blist): + for a in alist: + for b in blist: + yield(a, b) + +TIME_STAMP = get_time_stamp() + +NUM_SHOW_FEATURES = 100 +SPLIT_RATIO = 0.9 +FOLDS = 10 +LIST_CLASSIFIERS = [ 'NaiveBayesClassifier', 'MaxentClassifier', 'DecisionTreeClassifier', 'SvmClassifier' ] +LIST_METHODS = ['1step', '2step'] + +def k_fold_cross_validation(X, K, randomise = False): + """ + Generates K (training, validation) pairs from the items in X. + + Each pair is a partition of X, where validation is an iterable + of length len(X)/K. So each training iterable is of length (K-1)*len(X)/K. + + If randomise is true, a copy of X is shuffled before partitioning, + otherwise its order is preserved in training and validation. + """ + if randomise: from random import shuffle; X=list(X); shuffle(X) + for k in xrange(K): + training = [x for i, x in enumerate(X) if i % K != k] + validation = [x for i, x in enumerate(X) if i % K == k] + yield training, validation + +#X = [i for i in xrange(97)] +#for training, validation in k_fold_cross_validation(X, K=7): +# for x in X: assert (x in training) ^ (x in validation), x + + +def getTrainingAndTestData(tweets, K, k, method, feature_set): + + add_ngram_feat = feature_set.get('ngram', 1) + add_negtn_feat = feature_set.get('negtn', False) + + + from functools import wraps + import preprocessing + + procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent) \ + for (text, sent, subj, quer) in tweets] + + + + stemmer = nltk.stem.PorterStemmer() + + all_tweets = [] #DATADICT: all_tweets = [ (words, sentiment), ... 
] + for (text, sentiment) in procTweets: + words = [word if(word[0:2]=='__') else word.lower() \ + for word in text.split() \ + if len(word) >= 3] + words = [stemmer.stem(w) for w in words] #DATADICT: words = [ 'word1', 'word2', ... ] + all_tweets.append((words, sentiment)) + + # train_tweets = all_tweets[:int(len(all_tweets)*ratio)] #DATADICT: train_tweets = [ (words, sentiment), ... ] + # test_tweets = all_tweets[int(len(all_tweets)*ratio):] #DATADICT: test_tweets = [ (words, sentiment), ... ] + train_tweets = [x for i,x in enumerate(all_tweets) if i % K !=k] + test_tweets = [x for i,x in enumerate(all_tweets) if i % K ==k] + + unigrams_fd = nltk.FreqDist() + if add_ngram_feat > 1 : + n_grams_fd = nltk.FreqDist() + + for( words, sentiment ) in train_tweets: + words_uni = words + unigrams_fd.update(words) + + if add_ngram_feat>=2 : + words_bi = [ ','.join(map(str,bg)) for bg in nltk.bigrams(words) ] + n_grams_fd.update( words_bi ) + + if add_ngram_feat>=3 : + words_tri = [ ','.join(map(str,tg)) for tg in nltk.trigrams(words) ] + n_grams_fd.update( words_tri ) + + sys.stderr.write( '\nlen( unigrams ) = '+str(len( unigrams_fd.keys() )) ) + + #unigrams_sorted = nltk.FreqDist(unigrams).keys() + unigrams_sorted = unigrams_fd.keys() + #bigrams_sorted = nltk.FreqDist(bigrams).keys() + #trigrams_sorted = nltk.FreqDist(trigrams).keys() + if add_ngram_feat > 1 : + sys.stderr.write( '\nlen( n_grams ) = '+str(len( n_grams_fd )) ) + ngrams_sorted = [ k for (k,v) in n_grams_fd.items() if v>1] + sys.stderr.write( '\nlen( ngrams_sorted ) = '+str(len( ngrams_sorted )) ) + + def get_word_features(words): + bag = {} + words_uni = [ 'has(%s)'% ug for ug in words ] + + if( add_ngram_feat>=2 ): + words_bi = [ 'has(%s)'% ','.join(map(str,bg)) for bg in nltk.bigrams(words) ] + else: + words_bi = [] + + if( add_ngram_feat>=3 ): + words_tri = [ 'has(%s)'% ','.join(map(str,tg)) for tg in nltk.trigrams(words) ] + else: + words_tri = [] + + for f in words_uni+words_bi+words_tri: + bag[f] = 1 + + #bag = collections.Counter(words_uni+words_bi+words_tri) + return bag + + negtn_regex = re.compile( r"""(?: + ^(?:never|no|nothing|nowhere|noone|none|not| + havent|hasnt|hadnt|cant|couldnt|shouldnt| + wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint + )$ + ) + | + n't + """, re.X) + + def get_negation_features(words): + INF = 0.0 + negtn = [ bool(negtn_regex.search(w)) for w in words ] + + left = [0.0] * len(words) + prev = 0.0 + for i in range(0,len(words)): + if( negtn[i] ): + prev = 1.0 + left[i] = prev + prev = max( 0.0, prev-0.1) + + right = [0.0] * len(words) + prev = 0.0 + for i in reversed(range(0,len(words))): + if( negtn[i] ): + prev = 1.0 + right[i] = prev + prev = max( 0.0, prev-0.1) + + return dict( zip( + ['neg_l('+w+')' for w in words] + ['neg_r('+w+')' for w in words], + left + right ) ) + + def counter(func): #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called + @wraps(func) + def tmp(*args, **kwargs): + tmp.count += 1 + return func(*args, **kwargs) + tmp.count = 0 + return tmp + + @counter #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called + def extract_features(words): + features = {} + + word_features = get_word_features(words) + features.update( word_features ) + + if add_negtn_feat : + negation_features = get_negation_features(words) + features.update( negation_features ) + + sys.stderr.write( '\rfeatures extracted for ' + str(extract_features.count) + ' tweets' ) + return features + + extract_features.count = 0; + + + if( '1step' == method 
): + # Apply NLTK's Lazy Map + v_train = nltk.classify.apply_features(extract_features,train_tweets) + v_test = nltk.classify.apply_features(extract_features,test_tweets) + return (v_train, v_test) + + elif( '2step' == method ): + isObj = lambda sent: sent in ['neg','pos'] + makeObj = lambda sent: 'obj' if isObj(sent) else sent + + train_tweets_obj = [ (words, makeObj(sent)) for (words, sent) in train_tweets ] + test_tweets_obj = [ (words, makeObj(sent)) for (words, sent) in test_tweets ] + + train_tweets_sen = [ (words, sent) for (words, sent) in train_tweets if isObj(sent) ] + test_tweets_sen = [ (words, sent) for (words, sent) in test_tweets if isObj(sent) ] + + v_train_obj = nltk.classify.apply_features(extract_features,train_tweets_obj) + v_train_sen = nltk.classify.apply_features(extract_features,train_tweets_sen) + v_test_obj = nltk.classify.apply_features(extract_features,test_tweets_obj) + v_test_sen = nltk.classify.apply_features(extract_features,test_tweets_sen) + + test_truth = [ sent for (words, sent) in test_tweets ] + + return (v_train_obj,v_train_sen,v_test_obj,v_test_sen,test_truth) + + else: + return nltk.classify.apply_features(extract_features,all_tweets) + +def trainAndClassify( tweets, classifier, method, feature_set, fileprefix ): + + INFO = '_'.join( [str(classifier), str(method)] + [ str(k)+str(v) for (k,v) in feature_set.items()] ) + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + directory = os.path.dirname(fileprefix) + if not os.path.exists(directory): + os.makedirs(directory) + realstdout = sys.stdout + sys.stdout = open( fileprefix+'_'+INFO+'.txt' , 'w') + + print INFO + sys.stderr.write( '\n'+ '#'*80 +'\n' + INFO ) + + if('NaiveBayesClassifier' == classifier): + CLASSIFIER = nltk.classify.NaiveBayesClassifier + def train_function(v_train): + return CLASSIFIER.train(v_train) + elif('MaxentClassifier' == classifier): + CLASSIFIER = nltk.classify.MaxentClassifier + def train_function(v_train): + return CLASSIFIER.train(v_train, algorithm='GIS', max_iter=10) + elif('SvmClassifier' == classifier): + CLASSIFIER = nltk.classify.SvmClassifier + def SvmClassifier_show_most_informative_features( self, n=10 ): + print 'unimplemented' + CLASSIFIER.show_most_informative_features = SvmClassifier_show_most_informative_features + def train_function(v_train): + return CLASSIFIER.train(v_train) + elif('DecisionTreeClassifier' == classifier): + CLASSIFIER = nltk.classify.DecisionTreeClassifier + def DecisionTreeClassifier_show_most_informative_features( self, n=10 ): + text = '' + for i in range( 1, 10 ): + text = nltk.classify.DecisionTreeClassifier.pp(self,depth=i) + if len( text.split('\n') ) > n: + break + print text + CLASSIFIER.show_most_informative_features = DecisionTreeClassifier_show_most_informative_features + def train_function(v_train): + return CLASSIFIER.train(v_train, entropy_cutoff=0.05, depth_cutoff=100, support_cutoff=10, binary=False) + + accuracies = [] + if '1step' == method: + for k in range(FOLDS): + (v_train, v_test) = getTrainingAndTestData(tweets, FOLDS, k, method, feature_set) + + sys.stderr.write( '\n[training start]' ) + classifier_tot = train_function(v_train) + sys.stderr.write( ' [training complete]' ) + + print '######################' + print '1 Step Classifier :', classifier + accuracy_tot = nltk.classify.accuracy(classifier_tot, v_test) + print 'Accuracy :', accuracy_tot + print '######################' + print classifier_tot.show_most_informative_features(NUM_SHOW_FEATURES) + print '######################' + + # build confusion matrix 
over test set + test_truth = [s for (t,s) in v_test] + test_predict = [classifier_tot.classify(t) for (t,s) in v_test] + + print 'Accuracy :', accuracy_tot + print 'Confusion Matrix' + print nltk.ConfusionMatrix( test_truth, test_predict ) + + accuracies.append( accuracy_tot ) + print "Accuracies:", accuracies + print "Average Accuracy:", sum(accuracies)/FOLDS + + + elif '2step' == method: + # (v_train, v_test) = getTrainingAndTestData(tweets,SPLIT_RATIO, '1step', feature_set) + + # isObj = lambda sent: sent in ['neg','pos'] + # makeObj = lambda sent: 'obj' if isObj(sent) else sent + + # def makeObj_tweets(v_tweets): + # for (words, sent) in v_tweets: + # print sent, makeObj(sent) + # yield (words, makeObj(sent)) + # def getSen_tweets(v_tweets): + # for (words, sent) in v_tweets: + # print sent, isObj(sent) + # if isObj(sent): + # yield (words, sent) + + + # v_train_obj = makeObj_tweets( v_train ) + # v_test_obj = makeObj_tweets( v_test ) + + # v_train_sen = getSen_tweets( v_train ) + # v_test_sen = getSen_tweets( v_test ) + + accuracies = [] + for k in range(FOLDS): + (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth) = getTrainingAndTestData(tweets, FOLDS, k, method, feature_set) + + sys.stderr.write( '\n[training start]' ) + classifier_obj = train_function(v_train_obj) + sys.stderr.write( ' [training complete]' ) + + sys.stderr.write( '\n[training start]' ) + classifier_sen = train_function(v_train_sen) + sys.stderr.write( ' [training complete]' ) + + print '######################' + print 'Objectivity Classifier :', classifier + accuracy_obj = nltk.classify.accuracy(classifier_obj, v_test_obj) + print 'Accuracy :', accuracy_obj + print '######################' + print classifier_obj.show_most_informative_features(NUM_SHOW_FEATURES) + print '######################' + + test_truth_obj = [s for (t,s) in v_test_obj] + test_predict_obj = [classifier_obj.classify(t) for (t,s) in v_test_obj] + + print 'Accuracy :', accuracy_obj + print 'Confusion Matrix' + print nltk.ConfusionMatrix( test_truth_obj, test_predict_obj ) + + print '######################' + print 'Sentiment Classifier :', classifier + accuracy_sen = nltk.classify.accuracy(classifier_sen, v_test_sen) + print 'Accuracy :', accuracy_sen + print '######################' + print classifier_sen.show_most_informative_features(NUM_SHOW_FEATURES) + print '######################' + + test_truth_sen = [s for (t,s) in v_test_sen] + test_predict_sen = [classifier_sen.classify(t) for (t,s) in v_test_sen] + + print 'Accuracy :', accuracy_sen + print 'Confusion Matrix' + if( len(test_truth_sen) > 0 ): + print nltk.ConfusionMatrix( test_truth_sen, test_predict_sen ) + + v_test_sen2 = [(t,classifier_obj.classify(t)) for (t,s) in v_test_obj] + test_predict = [classifier_sen.classify(t) if s=='obj' else s for (t,s) in v_test_sen2] + + correct = [ t==p for (t,p) in zip(test_truth, test_predict)] + accuracy_tot = float(sum(correct))/len(correct) if correct else 0 + + print '######################' + print '2 - Step Classifier :', classifier + print 'Accuracy :', accuracy_tot + print 'Confusion Matrix' + print nltk.ConfusionMatrix( test_truth, test_predict ) + print '######################' + + classifier_tot = (classifier_obj, classifier_sen) + accuracies.append( accuracy_tot ) + print "Accuracies:", accuracies + print "Average Accuracy:", sum(accuracies)/FOLDS + + sys.stderr.write('\nAccuracies :') + for k in range(FOLDS): + sys.stderr.write(' %0.5f'%accuracies[k]) + sys.stderr.write('\nAverage Accuracy: %0.5f\n'% 
(sum(accuracies)/FOLDS)) + sys.stderr.flush() + + sys.stdout.flush() + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + sys.stdout.close() + sys.stdout = realstdout + + return classifier_tot + +def main(argv) : + __usage__=''' + usage: python sentiment.py logs/fileprefix ClassifierName,s methodName,s ngramVal,s negtnVal,s + ClassifierName,s: %s + methodName,s: %s + ngramVal,s: %s + negtnVal,s: %s + ''' % ( str( LIST_CLASSIFIERS ), str( LIST_METHODS ), str([1,3]), str([0,1]) ) + import sanderstwitter02 + import stanfordcorpus + import stats + + fileprefix = '' + + if (len(argv) >= 1) : + fileprefix = str(argv[0]) + else : + fileprefix = 'logs/run' + + classifierNames = [] + if (len(argv) >= 2) : + classifierNames = [name for name in argv[1].split(',') if name in LIST_CLASSIFIERS] + else : + classifierNames = ['NaiveBayesClassifier'] + + methodNames = [] + if (len(argv) >= 3) : + methodNames = [name for name in argv[2].split(',') if name in LIST_METHODS] + else : + methodNames = ['1step'] + + ngramVals = [] + if (len(argv) >= 4) : + ngramVals = [int(val) for val in argv[3].split(',') if val.isdigit()] + else : + ngramVals = [ 1 ] + + negtnVals = [] + if (len(argv) >= 5) : + negtnVals = [bool(int(val)) for val in argv[4].split(',') if val.isdigit()] + else : + negtnVals = [ False ] + + if (len( fileprefix )==0 or len( classifierNames )==0 or len( methodNames )==0 or len( ngramVals )==0 or len( negtnVals )==0 ): + print __usage__ + return + + tweets1 = sanderstwitter02.getTweetsRawData('sentiment.csv') + tweets2 = stanfordcorpus.getNormalisedTweets('stanfordcorpus/'+stanfordcorpus.FULLDATA+'.5000.norm.csv') + #random.shuffle(tweets1) + #random.shuffle(tweets2) + tweets = tweets1 + tweets2 + random.shuffle( tweets ) + #tweets = tweets[:100] + sys.stderr.write( '\nlen( tweets ) = '+str(len( tweets )) ) + + #sys.stderr.write( '\n' ) + #stats.preprocessingStats( tweets1, fileprefix='logs/stats_'+TIME_STAMP+'/TSC' ) + #sys.stderr.write( '\n' ) + #stats.preprocessingStats( tweets2, fileprefix='')#logs/stats_'+TIME_STAMP+'/STAN' ) + #sys.stderr.write( '\n' ) + #stats.stepStats( tweets , fileprefix='logs/stats_'+TIME_STAMP+'/Both' ) + + #generateARFF(tweets, fileprefix) + + #print classifierNames, methodNames, ngramVals, negtnVals + TIME_STAMP = get_time_stamp() + for (((cname, mname), ngramVal), negtnVal) in grid( grid( grid( classifierNames, methodNames), ngramVals ), negtnVals ): + try: + trainAndClassify( + tweets, classifier=cname, method=mname, + feature_set={'ngram':ngramVal, 'negtn':negtnVal}, + fileprefix=fileprefix+'_'+TIME_STAMP ) + except Exception, e: + print e + sys.stdout.flush() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/stanfordcorpus/__init__.py b/stanfordcorpus/__init__.py new file mode 100644 index 0000000..02d5762 --- /dev/null +++ b/stanfordcorpus/__init__.py @@ -0,0 +1,147 @@ +""" +http://help.sentiment140.com/for-students + +Format +Data file format has 6 fields: +0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive) +1 - the id of the tweet (2087) +2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009) +3 - the query (lyx). If there is no query, then this value is NO_QUERY. 
+4 - the user that tweeted (robotickilldozr)
+5 - the text of the tweet (Lyx is cool)
+
+"""
+
+FULLDATA = 'training.1600000.processed.noemoticon.csv'
+TESTDATA = 'testdata.manual.2009.06.14.csv'
+
+POLARITY = 0 # 0 = negative, 2 = neutral, 4 = positive
+TWID = 1
+DATE = 2
+SUBJ = 3 # NO_QUERY
+USER = 4
+TEXT = 5
+
+import csv, re, random
+
+regex = re.compile( r'\w+|\".*?\"' )
+
+def get_class( polarity ):
+    if polarity in ['0', '1']:
+        return 'neg'
+    elif polarity in ['3', '4']:
+        return 'pos'
+    elif polarity == '2':
+        return 'neu'
+    else:
+        return 'err'
+
+def get_query( subject ):
+    if subject == 'NO_QUERY':
+        return []
+    else:
+        return regex.findall(subject)
+
+def getAllQueries(in_file):
+
+    fp = open(in_file , 'r')
+    rd = csv.reader(fp, delimiter=',', quotechar='"' )
+
+    queries = set([])
+
+    for row in rd:
+        queries.add(row[3])
+
+    print queries
+
+    for q in queries:
+        print q, "\t",
+
+    return queries
+
+def sampleCSV( in_file, out_file, K=100 ):
+
+    fp = open(in_file , 'r')
+    fp2 = open(out_file , 'w')
+
+    for i in range(0,K):
+        line = fp.readline()
+        fp2.write(line)
+
+    fp.close()
+    fp2.close()
+
+    return 0
+
+def randomSampleCSV( in_file, out_file, K=100 ):
+
+    fp = open(in_file , 'r')
+    fq = open(out_file, 'w')
+
+    rows = [None] * K
+
+    i = 0
+    for row in fp:
+        i+=1
+        j = random.randint(1,i)
+        if i <= K:
+            rows[i-1] = row
+        elif j <= K:
+            rows[j-1] = row
+
+    for row in rows:
+        fq.write(row)
+    fp.close()
+    fq.close()
+
+def getNormalisedCSV( in_file, out_file ):
+    fp = open(in_file , 'r')
+    rd = csv.reader(fp, delimiter=',', quotechar='"' )
+
+    fq = open(out_file, 'w')
+    wr = csv.writer(fq, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL )
+
+    for row in rd:
+        queries = get_query(row[SUBJ])
+        wr.writerow( [row[TEXT], get_class(row[POLARITY]), row[SUBJ]] + [len(queries)] + queries )
+
+def getNormalisedTweets(in_file):
+    fp = open(in_file , 'r')
+    rd = csv.reader(fp, delimiter=',', quotechar='"' )
+    #print in_file, countlines( in_file )
+
+    tweets = []
+    count = 0
+    for row in rd:
+        numQueries = int(row[3])
+        tweets.append( row[:3] + [row[4:4+numQueries]] )
+        count+=1
+
+    #print count
+    #print 'len(tweets) =', len(tweets)
+    return tweets
+
+def countlines( filename ):
+    count = 0
+    with open( filename, 'r' ) as fp:
+        for line in fp:
+            count+=1
+    return count
+
+#getAllQueries( 'testdata.manual.2009.06.14.csv' )
+#getAllQueries( 'training.1600000.processed.noemoticon.csv' )
+
+#randomSampleCSV(FULLDATA, FULLDATA+'.sample.csv')
+#sampleCSV(TESTDATA, TESTDATA+'.sample.csv')
+
+#getNormalisedCSV(FULLDATA+'.sample.csv', FULLDATA+'.norm.csv')
+
+#randomSampleCSV(FULLDATA, FULLDATA+'.100000.sample.csv', K=100000)
+#getNormalisedCSV(FULLDATA+'.100000.sample.csv', FULLDATA+'.100000.norm.csv')
+
+
+SampleTweetsStats = '''
+   Class    Count   Example
+     neg     2449   @jbrotherlove I thought it was a great love story
+     pos     2551   I hope that these kitchen renos don't last any longer...
they are so annoying +''' diff --git a/stats.py b/stats.py new file mode 100644 index 0000000..a18566a --- /dev/null +++ b/stats.py @@ -0,0 +1,289 @@ +#@uthor: Ayush Pareek +import sys, time, os +import random, re, csv, collections +import nltk, pylab, numpy + +import preprocessing + +def printClassStats( tweets ): + tweets_counter = collections.Counter( [t[1] for t in tweets] ) + print '%8s %8s %s' % ('Class', 'Count', 'Example') + for (sent, count) in tweets_counter.items(): + print '%8s %8d %s' % (sent, count, random.choice([t for (t,s,_,_) in tweets if s==sent ]) ) + +def printFeaturesStats( tweets ): + arr_Handles = numpy.array( [0]*len(tweets) ) + arr_Hashtags = numpy.array( [0]*len(tweets) ) + arr_Urls = numpy.array( [0]*len(tweets) ) + arr_Emoticons = numpy.array( [0]*len(tweets) ) + arr_Words = numpy.array( [0]*len(tweets) ) + arr_Chars = numpy.array( [0]*len(tweets) ) + + + i=0 + for (text, sent, subj, quer) in tweets: + arr_Handles[i] = preprocessing.countHandles(text) + arr_Hashtags[i] = preprocessing.countHashtags(text) + arr_Urls[i] = preprocessing.countUrls(text) + arr_Emoticons[i] = preprocessing.countEmoticons(text) + arr_Words[i] = len(text.split()) + arr_Chars[i] = len(text) + i+=1 + + print '%-10s %-010s %-4s '%('Features', 'Average', 'Maximum') + print '%10s %10.6f %10d'%('Handles', arr_Handles.mean(), arr_Handles.max() ) + print '%10s %10.6f %10d'%('Hashtags', arr_Hashtags.mean(), arr_Hashtags.max() ) + print '%10s %10.6f %10d'%('Urls', arr_Urls.mean(), arr_Urls.max() ) + print '%10s %10.6f %10d'%('Emoticons', arr_Emoticons.mean(), arr_Emoticons.max() ) + print '%10s %10.6f %10d'%('Words', arr_Words.mean(), arr_Words.max() ) + print '%10s %10.6f %10d'%('Chars', arr_Chars.mean(), arr_Chars.max() ) + +def printReductionStats( tweets, function, filtering=True): + if( function ): + procTweets = [ (function(text, subject=subj, query=quer), sent) \ + for (text, sent, subj, quer) in tweets] + else: + procTweets = [ (text, sent) \ + for (text, sent, subj, quer) in tweets] + tweetsArr = [] + for (text, sentiment) in procTweets: + words = [word if(word[0:2]=='__') else word.lower() \ + for word in text.split() \ + if ( (not filtering) | (len(word) >= 3) ) ] + tweetsArr.append([words, sentiment]) + # tweetsArr + bag = collections.Counter() + for (words, sentiment) in tweetsArr: + bag.update(words) + # unigram + + print '%20s %-10s %12d'% ( + ('None' if function is None else function.__name__), + ( 'gte3' if filtering else 'all' ), + sum(bag.values()) + ) + return True + +def printAllRecuctionStats(tweets): + print '%-20s %-10s %-12s'% ( 'Preprocessing', 'Filter', 'Words' ) + printReductionStats( tweets, None, False ) + #printReductionStats( tweets, None, True ) + printReductionStats( tweets, preprocessing.processHashtags, True ) + printReductionStats( tweets, preprocessing.processHandles, True ) + printReductionStats( tweets, preprocessing.processUrls, True ) + printReductionStats( tweets, preprocessing.processEmoticons, True ) + printReductionStats( tweets, preprocessing.processPunctuations, True ) + printReductionStats( tweets, preprocessing.processRepeatings, True ) + #printReductionStats( tweets, preprocessing.processAll, False ) + printReductionStats( tweets, preprocessing.processAll, True ) + +def printFreqDistCSV( dist, filename='' ): + n_samples = len(dist.keys()) + n_repeating_samples = sum([ 1 for (k,v) in dist.items + () if v>1 ]) + n_outcomes = dist._N + print '%-12s %-12s %-12s'%( 'Samples', 'RepSamples', 'Outcomes' ) + print '%12d %12d %12d'%( n_samples, 
n_repeating_samples, n_outcomes ) + + if( len(filename)>0 and '_'!=filename[0] ): + with open( filename, 'w' ) as fcsv: + distwriter = csv.writer( fcsv, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC ) + + for (key,value) in dist.items(): + distwriter.writerow( [key, value] ) #print key, '\t,\t', dist[key] + +def preprocessingStats( tweets, fileprefix='' ): + + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + directory = os.path.dirname(fileprefix) + if not os.path.exists(directory): + os.makedirs(directory) + print 'writing to', fileprefix+'_stats.txt' + realstdout = sys.stdout + sys.stdout = open( fileprefix+'_stats.txt' , 'w') + + ########################################################################### + + print 'for', len(tweets), 'tweets:' + + print '###########################################################################' + + printFeaturesStats( tweets ) + + print '###########################################################################' + + printAllRecuctionStats( tweets ) + + print '###########################################################################' + + procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent) \ + for (text, sent, subj, quer) in tweets] + tweetsArr = [] + for (text, sentiment) in procTweets: + words = [word if(word[0:2]=='__') else word.lower() \ + for word in text.split() \ + if ( (len(word) >= 3) ) ] + tweetsArr.append([words, sentiment]) + unigrams_fd = nltk.FreqDist() + bigrams_fd = nltk.FreqDist() + trigrams_fd = nltk.FreqDist() + for (words, sentiment) in tweetsArr: + words_bi = [ ','.join(map(str,bg)) for bg in nltk.bigrams(words) ] + words_tri = [ ','.join(map(str,tg)) for tg in nltk.trigrams(words) ] + unigrams_fd.update( words ) + bigrams_fd.update( words_bi ) + trigrams_fd.update( words_tri ) + + print 'Unigrams Distribution' + printFreqDistCSV(unigrams_fd, filename=fileprefix+'_1grams.csv') + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + pylab.show = lambda : pylab.savefig(fileprefix+'_1grams.pdf') + unigrams_fd.plot(50, cumulative=True) + pylab.close() + + print 'Bigrams Distribution' + printFreqDistCSV(bigrams_fd, filename=fileprefix+'_2grams.csv') + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + pylab.show = lambda : pylab.savefig(fileprefix+'_2grams.pdf') + bigrams_fd.plot(50, cumulative=True) + pylab.close() + + print 'Trigrams Distribution' + printFreqDistCSV(trigrams_fd, filename=fileprefix+'_3grams.csv') + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + pylab.show = lambda : pylab.savefig(fileprefix+'_3grams.pdf') + trigrams_fd.plot(50, cumulative=True) + pylab.close() + + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + pylab.show = lambda : pylab.savefig(fileprefix+'_ngrams.pdf') + unigrams_fd.plot(50, cumulative=True) + bigrams_fd.plot(50, cumulative=True) + trigrams_fd.plot(50, cumulative=True) + pylab.close() + + if( len(fileprefix)>0 and '_'!=fileprefix[0] ): + sys.stdout.close() + sys.stdout = realstdout + +def stepStats( tweets, num_bins=10, split='easy', fileprefix='' ): + tot_size = len(tweets) + num_digits = len(str(tot_size)) + + if split=='equal': + sizes = [ int((r+1.0)/num_bins*tot_size) for r in range( num_bins ) ] + elif split=='log': + sizes = [ int(2**(math.log(tot_size,2)*(r+1.0)/num_bins) ) for r in range( num_bins ) ] + else: # split=='easy' + sizes = range( 0, tot_size, tot_size/num_bins)[1:]+[tot_size] + + for s in sizes: + print 'processing stats for %d tweets'%s + preprocessingStats( tweets[0:s], fileprefix+'_%0{0}d'.format(num_digits) % s ) + +def 
oldStats2CSV( in_file, fileprefix=''):
+    if fileprefix == '':
+        fileprefix = in_file[:-len('_stats.txt')]  # strip the '_stats.txt' suffix
+    fp = open( in_file, 'r' )
+    fq = open( fileprefix+'_statsnew.txt', 'w' )
+
+    line = ''
+    line_start = 0
+    line_count = 20
+    line_end = line_start+line_count
+    for line_num in range(line_start, line_end): # write Statistics
+        line = fp.readline()
+        fq.write( line )
+
+    for section in [1,2,3]:
+        line_start = line_end
+        line_count = 2
+        line_end = line_start+line_count
+        for line_num in range( line_start, line_end ):
+            line = fp.readline()
+            fq.write( line )
+
+        line_start = line_end
+        line_count = [int(l) for l in line.split() if l.isdigit()][0]
+        line_end = line_start+line_count
+        fr = open( fileprefix+'_%dgrams.csv'%section, 'w')
+        fwrt = csv.writer( fr, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC )
+        for line_num in range( line_start, line_end ): # write n-gram section
+            line = fp.readline()
+            row = line.split('\t,\t')
+            row[0] = row[0].strip()
+            row[1] = int(row[1])
+            fwrt.writerow( row )
+        fr.close()
+
+    fp.close()
+    fq.close()
+
+stats_titles = [
+'"# tweets"',
+'"avg(Handles)"',
+'"max(Handles)"',
+'"avg(Hashtags)"',
+'"max(Hashtags)"',
+'"avg(Urls)"',
+'"max(Urls)"',
+'"avg(Emoticons)"',
+'"max(Emoticons)"',
+'"avg(Words)"',
+'"max(Words)"',
+'"avg(Chars)"',
+'"max(Chars)"',
+'"preprocessing(None)"',
+'"preprocessing(Hashtags)"',
+'"preprocessing(Handles)"',
+'"preprocessing(Urls)"',
+'"preprocessing(Emoticons)"',
+'"preprocessing(Punctuations)"',
+'"preprocessing(Repeatings)"',
+'"preprocessing(All)"',
+'"Unigrams samples"',
+'"Unigrams repeating samples"',
+'"Unigrams outcomes"',
+'"Bigrams samples"',
+'"Bigrams repeating samples"',
+'"Bigrams outcomes"',
+'"Trigrams samples"',
+'"Trigrams repeating samples"',
+'"Trigrams outcomes"',
+]
+
+def newStats2CSV(files, out_file):
+
+    arr = [ [] ] * len(files)
+
+    for j in range( len(files)):
+        values = []
+        with open(files[j], 'r') as fp:
+            for line in fp:
+                values += [ float(w) for w in line.split()\
+                            if w[0] in ['0','1','2','3','4','5','6','7','8','9'] ]
+        arr[j] = values
+
+    with open(out_file, 'w') as fq:
+        stats_writer = csv.writer( fq, delimiter=',', quotechar='\'')#, quoting=csv.QUOTE_NONE )
+        for i in range(0,len(stats_titles)):
+            row = [stats_titles[i]] + [arr[j][i] for j in range(len(files))]
+            stats_writer.writerow( row )
+
+
+filelist = [
+'logs/stats_140617-214922-IST/Both_0978_stats.txt',
+'logs/stats_140617-214922-IST/Both_1956_stats.txt',
+'logs/stats_140617-214922-IST/Both_2934_stats.txt',
+'logs/stats_140617-214922-IST/Both_3912_stats.txt',
+'logs/stats_140617-214922-IST/Both_4890_stats.txt',
+'logs/stats_140617-214922-IST/Both_5868_stats.txt',
+'logs/stats_140617-214922-IST/Both_6846_stats.txt',
+'logs/stats_140617-214922-IST/Both_7824_stats.txt',
+'logs/stats_140617-214922-IST/Both_8802_stats.txt',
+'logs/stats_140617-214922-IST/Both_9780_stats.txt',
+'logs/stats_140617-214922-IST/Both_9782_stats.txt',
+]
+
+
+
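
For reviewers, a minimal usage sketch of the new modules follows. It is illustrative only and rests on a few assumptions: Python 2 (matching the codebase), the packages being importable from the repository root, and the Sanders corpus already exported to sentiment.csv as sentiment.py expects; the sample tweet text and URL are invented.

    # Usage sketch: assumes Python 2 and a sentiment.csv in the working directory.
    import preprocessing
    import sanderstwitter02

    # Rewrite hashtags, handles, URLs, emoticons and punctuation into the
    # __HASH_* / __HNDL / __URL / __EMOT_* / __PUNC_* placeholder tokens and
    # collapse repeated characters (see preprocessing/__init__.py).
    print preprocessing.processAll(
        "@apple the new #iPhone is soooo cool :) http://t.co/abc", query=['@apple'])

    # Load [text, class, subject, query] rows from the Sanders corpus.
    tweets = sanderstwitter02.getTweetsRawData('sentiment.csv')
    print len(tweets), 'tweets loaded'

The classifier driver itself is run from the command line in the form given by the usage string in sentiment.py's main(), for example: python sentiment.py logs/run NaiveBayesClassifier 1step 1 0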