# NOTE: removed GitHub page chrome and line-number gutter captured by the scrape;
# the actual source file (create_histograms.py) begins below.
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 00:42:54 2015
@author: nicholashamlin
"""
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
# Stopword exclusion list used by process_tweets():
# - default_stops: NLTK's standard English stopwords (requires the
#   'stopwords' corpus to be downloaded via nltk.download()).
# - custom_stops: Twitter-specific noise — retweet/modified-tweet markers
#   ('rt', 'mt', 'ht'), punctuation tokens produced by word_tokenize
#   (e.g. "'s", "n't"), and URL scheme fragments ('http', 'https').
# NOTE(review): "\\n" / "\\" appear to target literal backslash sequences
# left over from the raw tweet dump — confirm against the chunked files.
#Define both default and Twitter-specific stopwords for exclusion
default_stops = (stopwords.words("english"))
custom_stops=['rt','ht','mt','@','#','!',':',';',',','.',"'s","?","\\n",'http','https',"n't","&","\\",'...','-','"']
# Deduplicate the combined list once at import time.
stops=list(set(default_stops+custom_stops))
# Processing function also accepts another list of stopwords for call-specific exclusions
def process_tweets(hashtag, addl_stops=None):
    """Tokenize all locally-chunked tweet files for *hashtag* and plot a
    frequency histogram of the 50 most common non-stopword tokens.

    Scans the current working directory for files whose names start with
    *hashtag*, manually parses each raw tweet record for its "text" field,
    skips tweets containing escaped Unicode, and tallies lower-cased tokens.

    Parameters:
        hashtag (str): filename prefix identifying the chunked tweet files
            (e.g. "#NBAFinals2015").
        addl_stops (list of str, optional): call-specific extra stopwords,
            merged with the module-level ``stops`` list. Defaults to none.
            (Was a mutable default ``[]``; ``None`` avoids the shared-default
            pitfall while remaining backward compatible.)

    Returns:
        list of str: every token that survived stopword filtering, in order.

    Side effects: renders a matplotlib plot via FreqDist.plot() and prints a
    collection summary to stdout.
    """
    count = 0        # total tweets seen
    good_count = 0   # tweets actually analyzed (no Unicode escapes)
    words_to_plot = []
    # Build the exclusion set ONCE; the original rebuilt set(stops+addl_stops)
    # for every tweet inside the inner loop.
    exclude = set(stops) | set(addl_stops or [])
    # Iterate through all chunked files with the relevant hashtag prefix
    for fname in os.listdir(os.getcwd()):
        if fname.startswith(hashtag):
            with open(fname, 'r') as data_file:
                data = data_file.read()
            # Parse the raw string since a json.load() approach wasn't working;
            # records in the dump are separated by "\n\x00," (NUL-delimited).
            for tweet in data.split("\n\x00,"):
                count += 1
                # Tweets have a well-defined structure, so slice the text field
                # out manually (JSON parsing would be cleaner if the dump were
                # valid JSON).
                text = tweet[tweet.find("text\":") + 7:tweet.find(",\"source\"") - 1]
                # Skip tweets containing escaped Unicode sequences.
                # ('\\u' is byte-identical to the original Py2 '\u' literal,
                # and unlike '\u' it is also valid Python 3 syntax.)
                if text.find('\\u') >= 0:
                    continue
                good_count += 1
                # Tokenize and filter, ignoring case.
                words = word_tokenize(text)
                # extend() instead of list concatenation: the original
                # words_to_plot = words_to_plot + clean_words was quadratic.
                words_to_plot.extend(
                    w.lower() for w in words if w.lower() not in exclude)
    # Create a frequency histogram of the 50 most common words and print a
    # summary of collection activity. print(...) with a single argument
    # behaves identically under Python 2 and 3.
    fdist = FreqDist(words_to_plot)
    fdist.plot(50)
    print("for " + hashtag + ' we collected ' + str(count) +
          ' tweets out of which ' + str(good_count) + " will be analyzed")
    return words_to_plot
if __name__ == '__main__':
    # (removed a stray dead `pass` statement that preceded these calls)
    process_tweets("#both")
    # Example of a call-specific stopword inclusion: drop the hashtag's own
    # text from its histogram.
    process_tweets("#NBAFinals2015", ['nbafinals2015'])
    process_tweets("#Warriors")
    # 66609 tweets gathered overall, 46892 included in analysis