-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
81 lines (65 loc) · 2.02 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import re,string,sys,types
import datetime as dt
# Region codes returned by assignBounds(): UNK is 0 and USA is 1.
UNK, USA = range(2)
# Maps three-letter English month abbreviations to their month numbers (1-12).
months = {'Jan': 1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
# Parses the time period from the filename
def parseTimePeriod(filename):
    """Return the datetime encoded in *filename*, shifted by a fixed offset.

    The second dot-separated field of *filename* must look like
    ``<year>-<month>-<day>_<hour>-<minute>`` (for example
    ``tweets.2012-03-04_10-15.txt``).  Only the first three date numbers
    and the first two time numbers are used.

    Returns:
        A naive ``datetime`` equal to the encoded timestamp plus 5 hours
        and 10 minutes.

    Raises:
        IndexError: if the filename has no second dot-separated field or
            too few date/time components.
        ValueError: if the stamp has no single ``_`` separator or a
            component is not an integer.
    """
    stamp = filename.split('.')[1]
    date_part, time_part = stamp.split('_')
    ymd = [int(piece) for piece in date_part.split('-')]
    hm = [int(piece) for piece in time_part.split('-')]
    encoded = dt.datetime(ymd[0], ymd[1], ymd[2], hm[0], hm[1])
    # NOTE(review): the meaning of the 5h10m shift is not visible here --
    # presumably a timezone / collection-delay correction; confirm.
    shift = dt.timedelta(hours=5, minutes=10)
    return encoded + shift
def inTimePeriod(time_period, date):
    """Return True when *date* is earlier than the end of the period.

    The period ends 15 minutes after *time_period*.  Only this upper
    bound is checked: a *date* that precedes *time_period* also yields
    True.
    """
    period_end = time_period + dt.timedelta(minutes=15)
    return date < period_end
# Determines the appropriate time period to begin loading tweets
def determineTimePeriod(tweet_date):
    """Return the start of the 15-minute period that contains *tweet_date*.

    The minute is rounded down to the nearest multiple of 15 and the
    second and microsecond fields are cleared, so the result always lies
    exactly on a period boundary.  Year, month, day, hour and any tzinfo
    are kept from *tweet_date*.
    """
    # Floor division keeps the arithmetic in integers on Python 2 and 3.
    period_minute = 15 * (tweet_date.minute // 15)
    # Clear microseconds too; otherwise sub-second input would produce a
    # value slightly past the boundary that never equals the period start.
    return tweet_date.replace(minute=period_minute, second=0, microsecond=0)
def assignBounds(location):
    """Classify a location record by a fixed latitude/longitude box.

    *location* is a mapping with the keys ``'shape'``, ``'lat'`` and
    ``'lng'``.

    Returns:
        ``False`` when ``location['shape']`` is truthy (the coordinates
        are not inspected in that case); otherwise ``USA`` when the point
        lies inside the inclusive box 25..48 latitude, -125..-66
        longitude, and ``UNK`` when it does not.
    """
    if location['shape']:
        return False
    latitude = location['lat']
    longitude = location['lng']
    inside_box = 25 <= latitude <= 48 and -125 <= longitude <= -66
    return USA if inside_box else UNK
# Need to make tests for this function
# Default stop-word list, resolved relative to the working directory.
_STOPWORDS_PATH = "stopwords//lextek.txt"
# Matches every ASCII punctuation character and digit so they can be stripped.
_STRIP_RE = re.compile('[%s]' % re.escape(string.punctuation + string.digits))
# Matches any character that is not a lowercase ASCII letter.
_NON_ALPHA_RE = re.compile('[^a-z]')


def _loadStopwords(path):
    """Return the whitespace-separated entries of the file at *path* as a set."""
    with open(path, 'r') as handle:
        return set(handle.read().split())


def wordFilter(words, stopwords=None):
    """Return the keywords found in *words*, in their original order.

    A word is dropped when it is longer than 20 characters, contains
    ``http``, or starts with ``@``.  Punctuation and digits are then
    stripped, and the remainder is dropped when it is empty, contains any
    character other than ``a``-``z`` (this also rejects uppercase and
    non-ASCII text), or is a stop word.

    Args:
        words: iterable of strings to filter.
        stopwords: optional iterable of stop words.  When omitted, the
            list is read from ``stopwords//lextek.txt``; callers filtering
            many batches can pass a preloaded collection to avoid
            re-reading the file on every call.

    Returns:
        A list of the cleaned words that passed every check.

    Raises:
        IOError / OSError: if *stopwords* is omitted and the default file
            cannot be read.
    """
    if stopwords is None:
        stopwords = _loadStopwords(_STOPWORDS_PATH)
    else:
        # A set gives constant-time membership tests inside the loop.
        stopwords = set(stopwords)

    keywords = []
    for word in words:
        # startswith() is safe on empty strings, unlike word[0].
        if len(word) > 20 or 'http' in word or word.startswith('@'):
            continue
        cleaned = _STRIP_RE.sub('', word)
        if not cleaned:
            continue
        # Anything left that is not purely a-z is rejected, so no control
        # characters (e.g. NUL) can reach the output.
        if _NON_ALPHA_RE.search(cleaned) is not None:
            continue
        if cleaned in stopwords:
            continue
        keywords.append(cleaned)
    return keywords