-
Notifications
You must be signed in to change notification settings - Fork 6
/
data.py
64 lines (56 loc) · 2.46 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""Functions for reading data from the sentiment dictionary and tweet files."""
import os
import re
import string
from datetime import datetime
from ucb import main, interact
# Directory prefix (including the trailing path separator) that is prepended
# to the tweet file names read and written by generate_filtered_file.
DATA_PATH = 'data' + os.sep
def load_sentiments(file_name="data"+os.sep+"sentiments.csv"):
    """Read the sentiment file and return a dictionary containing the sentiment
    score of each word, a value from -1 to +1.

    file_name -- path to a UTF-8 text file holding one "word,score" entry
                 per line

    Blank lines are skipped. A ValueError is raised for a non-blank line that
    has no comma or whose score is not a valid float.
    """
    sentiments = {}
    # Use a context manager so the file is closed even if parsing fails.
    with open(file_name, encoding='utf8') as sentiment_file:
        for line in sentiment_file:
            if not line.strip():
                continue
            # Split on the LAST comma so a word that itself contains a comma
            # does not break the two-way unpack.
            word, score = line.rsplit(',', 1)
            sentiments[word] = float(score.strip())
    return sentiments
# Word -> score dictionary built once, at import time, from the default
# sentiments file; importing this module therefore reads that file.
word_sentiments = load_sentiments()
def file_name_for_term(term):
    """Build a '.txt' file name from an arbitrary term string.

    Spaces are turned into underscores; any other character that is not an
    ASCII letter, a digit, '-' or '_' is dropped from the name.
    """
    allowed = frozenset(string.ascii_letters + string.digits + '-_')
    underscored = term.replace(' ', '_')
    stem = ''.join(filter(allowed.__contains__, underscored))
    return stem + '.txt'
def generate_filtered_file(unfiltered_name, term):
    """Return the path to a file containing tweets that match term, generating
    that file if necessary.

    unfiltered_name -- name of the source tweet file, relative to DATA_PATH
    term -- literal text to look for; it is matched case-insensitively and
            only where a non-word character sits on both sides of it

    The filtered file is stored under DATA_PATH with the name produced by
    file_name_for_term(term). If that file already exists it is reused as-is
    and the source file is not read.
    """
    filtered_path = DATA_PATH + file_name_for_term(term)
    if not os.path.exists(filtered_path):
        print('Generating filtered tweets file for "{0}".'.format(term))
        # Raw strings avoid the invalid '\W' escape, and re.escape makes the
        # term match literally (consistent with the substring test below)
        # even when it contains regex metacharacters such as '+' or '('.
        pattern = re.compile(r'\W' + re.escape(term) + r'\W',
                             flags=re.IGNORECASE)
        needle = term.lower()
        # Open the source FIRST: if it is missing, the error is raised before
        # an empty filtered file is created and later mistaken for a result.
        with open(DATA_PATH + unfiltered_name, encoding='utf8') as unfiltered, \
                open(filtered_path, mode='w', encoding='utf8') as out:
            for line in unfiltered:
                # Cheap substring check before running the regex; lines are
                # streamed instead of being collected in a list.
                if needle in line.lower() and pattern.search(line):
                    out.write(line)
    return filtered_path
def load_tweets(make_tweet, term='my job', file_name='all_tweets.txt'):
    """Return the list of tweets in file_name that contain term.

    make_tweet -- a constructor that takes four arguments, passed in this
                  order:
      - a string containing the lower-cased text of the tweet
      - a datetime.datetime object representing the time of the tweet
      - the first value of the line's location field (named lat here)
      - the second value of the line's location field (named lon here)
      NOTE(review): the previous docstring listed longitude before latitude,
      which does not match the lat, lon order used in the call; confirm
      against the data format.
    term -- text to search for; it is lower-cased before filtering
    file_name -- name of the unfiltered tweet file handed to
                 generate_filtered_file

    Each line is expected to hold four tab-separated fields: location, an
    ignored field, a '%Y-%m-%d %H:%M:%S' timestamp, and the text. Lines with
    fewer than four fields are skipped.
    """
    term = term.lower()
    filtered_path = generate_filtered_file(file_name, term)
    tweets = []
    # Context manager closes the file even if a line fails to parse.
    with open(filtered_path, encoding='utf8') as tweet_file:
        for line in tweet_file:
            # maxsplit=3 keeps any extra tabs inside the final text field, so
            # a line with more than four fields no longer breaks the unpack.
            fields = line.strip().split("\t", 3)
            if len(fields) < 4:
                continue
            loc, _, time_text, text = fields
            timestamp = datetime.strptime(time_text, '%Y-%m-%d %H:%M:%S')
            # NOTE(review): eval executes whatever expression the file holds.
            # That is only safe for trusted data files; if the location field
            # is always a plain literal, ast.literal_eval would be safer.
            lat, lon = eval(loc)
            tweets.append(make_tweet(text.lower(), timestamp, lat, lon))
    return tweets