-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataObjects.py
144 lines (122 loc) · 4.25 KB
/
DataObjects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import utils
import datetime as dt
class Tweet:
    """Parsed view of one raw tweet dictionary.

    Check ``valid`` before using an instance: it is False when the payload
    lacks an expected key or carries no location. In the missing-key case
    the attributes assigned after the failing lookup are left unset.
    """

    def __init__(self, dTweet):
        """Extract text, entities, timestamp, location and user fields.

        dTweet -- decoded JSON dictionary describing a single tweet.

        A KeyError from a missing key is swallowed and recorded as
        ``valid = False``; other exceptions propagate to the caller.
        """
        try:
            # Information about the tweet
            self.id = dTweet['id']
            self.retweet_count = dTweet['retweet_count']
            self.contents = dTweet['text'].lower()

            keywords = utils.wordFilter(self.contents.split())
            self.keywords = list(set(keywords))
            # Occurrences of each distinct keyword in the filtered word list
            self.dTermFreqs = {
                word: keywords.count(word) for word in self.keywords
            }

            # Assigned before parsing so the attributes exist even when an
            # entity key turns out to be missing.
            self.urls = []
            self.user_mentions = []
            self.hashtags = []
            entities = dTweet['entities']
            self.urls = [url['expanded_url'] for url in entities['urls']]
            self.user_mentions = [
                mention['id'] for mention in entities['user_mentions']
            ]
            self.hashtags = [tag['text'] for tag in entities['hashtags']]

            # 'created_at' is split on spaces: token 1 is the month name,
            # token 2 the day, token 3 the HH:MM:SS clock and token 5 the
            # year. The result is a naive datetime object.
            tokens = dTweet['created_at'].split(' ')
            clock = tokens[3].split(':')
            self.date = dt.datetime(
                int(tokens[5]), utils.months[tokens[1]], int(tokens[2]),
                int(clock[0]), int(clock[1]), int(clock[2]))
            self.time_period = utils.determineTimePeriod(self.date)

            self.valid = True
            # Stays None when the tweet carries no usable location.
            self.location = None
            if dTweet['coordinates']:
                # Exact point: the payload lists longitude first.
                point = dTweet['coordinates']
                self.location = {
                    'type': point['type'],
                    'shape': None,
                    'lat': point['coordinates'][1],
                    'lng': point['coordinates'][0],
                }
            elif dTweet['place']:
                # Only a bounding box is available; keep its first ring.
                box = dTweet['place']['bounding_box']
                shape = [
                    {'lat': coord[1], 'lng': coord[0]}
                    for coord in box['coordinates'][0]
                ]
                self.location = {
                    'type': box['type'],
                    'shape': shape,
                    'lat': None,
                    'lng': None,
                }
            else:
                self.valid = False

            if self.location is not None:
                self.bound = utils.assignBounds(self.location)
            else:
                # Without a location there is nothing to hand to
                # assignBounds; fall back to the same default bound the
                # stat classes in this file use.
                self.bound = utils.UNK

            # Information about the user
            self.user = dTweet['user']['id']
            self.follower_count = dTweet['user']['followers_count']
        except KeyError:
            # This occurs when twitter returns a json packet with limit as
            # the only attribute
            self.valid = False

    def __str__(self):
        """Return a short description of the tweet's location.

        Tweets without a location are rendered with None placeholders
        instead of raising.
        """
        location = getattr(self, 'location', None)
        if not location:
            location = {'type': None, 'lat': None, 'lng': None}
        return "Location type: {0} at ({1}, {2})".format(
            location['type'], location['lat'], location['lng'])

    def toDBObject(self):
        """Return a ``(query, document)`` tuple for this tweet.

        The query selects the record by ``_id``; the document holds every
        parsed field.
        """
        temp_dict = {
            '_id': self.id,
            'contents': self.contents,
            'hashtags': self.hashtags,
            'date': self.date,
            'time_period': self.time_period,
            'location': self.location,
            'valid': self.valid,
            'keywords': self.keywords,
            'dTermFreqs': self.dTermFreqs,
            'urls': self.urls,
            'retweet_count': self.retweet_count,
            'user_mentions': self.user_mentions,
            'user': self.user,
            'follower_count': self.follower_count,
            'bound': self.bound
        }
        return ({'_id': self.id}, temp_dict)
class KeywordStat:
    """Frequency counters and entropy values kept for one keyword,
    identified by its time period and bound."""

    def __init__(self, keyword, time_period, bound=utils.UNK, poh=0):
        """Record the identifying fields and start all counters at zero."""
        self.keyword, self.time_period = keyword, time_period
        self.bound, self.poh = bound, poh
        self.doc_freq = self.term_freq = 0
        # Seven zero-initialised entropy slots
        self.entropy = [0] * 7

    def incFreqs(self, n=1):
        """Add one to the document frequency and n to the term frequency."""
        self.doc_freq += 1
        self.term_freq += n

    def setEntropy(self, entro):
        """Replace the stored entropy value with entro."""
        self.entropy = entro

    def toDBObject(self):
        """Return a ``(query, document)`` tuple for this statistic.

        The query matches on time period, keyword and bound together.
        """
        match_fields = ('time_period', 'keyword', 'bound')
        selector = {
            "$and": [{field: getattr(self, field)} for field in match_fields]
        }
        return (selector, {
            "keyword": self.keyword,
            "df": self.doc_freq,
            "bound": self.bound,
            "tf": self.term_freq,
            "entropy": self.entropy,
            "poh": self.poh,
            "time_period": self.time_period,
        })
class TimePeriodStat:
    """Running totals of tweets, hashtags and keywords for one time period
    and bound."""

    def __init__(self, time_period, bound = utils.UNK):
        """Record the identifying fields and zero every total."""
        self.time_period, self.bound = time_period, bound
        self.total_hashtags = self.total_tweets = self.total_keywords = 0

    def incHashtags(self, n=1):
        """Add n to the hashtag total."""
        self.total_hashtags = self.total_hashtags + n

    def incTweetStats(self, tweetObj):
        """Count one tweet plus the sizes of its hashtag and keyword lists."""
        self.total_tweets += 1
        self.incHashtags(len(tweetObj.hashtags))
        self.incKeywords(len(tweetObj.keywords))

    def incKeywords(self, n=1):
        """Add n to the keyword total."""
        self.total_keywords = self.total_keywords + n

    def toDBObject(self):
        """Return a ``(query, document)`` tuple for this statistic.

        The query matches on time period and bound together.
        """
        selector = {
            "$and": [
                {'time_period': self.time_period},
                {'bound': self.bound},
            ]
        }
        return (selector, {
            "time_period": self.time_period,
            "total_hashtags": self.total_hashtags,
            "total_tweets": self.total_tweets,
            "bound": self.bound,
            "total_keywords": self.total_keywords,
        })