-
Notifications
You must be signed in to change notification settings - Fork 3
/
harvester.py
136 lines (105 loc) · 3.78 KB
/
harvester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
Main script to run either the stream or the search harvester.

Usage: python3 harvester.py <config> <mode> <auth_index>
Where: <config>     -- A JSON file with configuration information.
       <mode>       -- Mode of usage (stream or search).
       <auth_index> -- Index of the authentication entry in the config.
"""
import sys
import logging
import json
import couchdb
import tweepy
from harvesterStream import TwitterStreamListener
from harvesterSearch import TwitterSearcher
# Expected length of sys.argv: the script name plus <config>, <mode> and
# <auth_index>.
NUM_ARGS = 4
# Exit status passed to sys.exit() after a fatal error has been logged.
ERROR = 2
def get_database(config):
    """Return a handle to the CouchDB database named in the config file.

    The config file is parsed as JSON. The first entry of its ``Servers``
    list is handed to ``couchdb.Server`` and ``DatabaseName`` selects the
    database, which is created when the server does not have it yet.

    Args:
        config: Path to the JSON configuration file.

    Returns:
        The database handle obtained from the server.

    Any failure (unreadable or malformed config file, missing key, server
    error) is logged and the process exits with status ``ERROR``.
    """
    try:
        # Loading the config lives inside the try so that a missing or
        # malformed file is reported like every other failure here.
        with open(config) as fp:
            jconfig = json.load(fp)

        # Pull server information from config.
        server = jconfig['Servers'][0]
        couch = couchdb.Server(server)

        # Check if the database exists, create it if not.
        db_name = jconfig['DatabaseName']
        if db_name in couch:
            logging.info("Database {} already exists.".format(db_name))
            db = couch[db_name]
        else:
            db = couch.create(db_name)
            # Log only after creation succeeded, so the log never reports
            # a database that could not be created.
            logging.info("Created databse {}".format(db_name))
    except Exception as e:
        # Top-level boundary: the script cannot continue without a database.
        logging.error(str(e))
        sys.exit(ERROR)
    return db
def get_credentials(config, auth_index):
    """Load one set of credentials from the config file.

    Args:
        config: Path to the JSON configuration file.
        auth_index: Index of the wanted entry under ``Authentication``.

    Returns:
        A 4-tuple ``(ConsumerKey, ConsumerSecret, AccessToken,
        AccessTokenSecret)`` taken from the selected entry.

    If the entry or one of its fields cannot be read, the error is logged
    and the process exits with status ``ERROR``.
    """
    with open(config) as config_file:
        settings = json.load(config_file)

    # Fields are read in the same order in which they are returned.
    wanted = (
        'ConsumerKey',
        'ConsumerSecret',
        'AccessToken',
        'AccessTokenSecret',
    )

    # Attempt to read authentication details from config file.
    try:
        entry = settings['Authentication'][auth_index]
        credentials = tuple(entry[field] for field in wanted)
    except Exception as err:
        logging.error(str(err))
        sys.exit(ERROR)
    return credentials
def get_box(config):
    """Return the box built from the ``Coordinates`` list in the config file.

    Args:
        config: Path to the JSON configuration file.

    Returns:
        A list with the first four ``Coordinates`` entries, each converted
        to ``float``. Entries beyond the fourth are ignored.

    If ``Coordinates`` is missing, has fewer than four entries, or holds a
    value that cannot be converted, the error is logged and the process
    exits with status ``ERROR``.
    """
    with open(config) as config_file:
        settings = json.load(config_file)

    try:
        coordinates = settings['Coordinates']
        # Index explicitly (rather than slice) so that a list that is too
        # short raises instead of yielding a shorter box.
        box = [float(coordinates[position]) for position in range(4)]
    except Exception as err:
        logging.error(str(err))
        sys.exit(ERROR)
    return box
def get_geocode(config):
    """Return the ``Geocode`` value stored in the config file.

    Args:
        config: Path to the JSON configuration file.

    Returns:
        Whatever value the config holds under ``Geocode``, unmodified.

    If the value cannot be read, the error is logged and the process exits
    with status ``ERROR``.
    """
    with open(config) as config_file:
        settings = json.load(config_file)

    try:
        return settings['Geocode']
    except Exception as err:
        logging.error(str(err))
        sys.exit(ERROR)
def main():
    """Parse the command line and run the requested harvester.

    Expects ``sys.argv`` to be ``<harvester.py> <config.json> <mode>
    <auth_index>``. In ``stream`` mode a ``tweepy.Stream`` is filtered by
    the box from the config, with a ``TwitterStreamListener`` that is given
    the database handle. In ``search`` mode a ``TwitterSearcher`` is built
    from a rate-limit-aware ``tweepy.API``, the database handle and the
    geocode from the config, and its ``search()`` method is called.

    A wrong argument count, an unknown mode or a non-integer auth index is
    logged and the process exits with status ``ERROR``.
    """
    logging.basicConfig(level=logging.INFO)

    if len(sys.argv) != NUM_ARGS:
        logging.error(
            'invalid number of arguments: <harvester.py> <config.json> <mode> '
            '<auth_index>'
        )
        sys.exit(ERROR)

    config = sys.argv[1]
    mode = sys.argv[2]

    # Validate every argument before touching the database, so that a typo
    # on the command line cannot leave a freshly created database behind.
    if mode not in ('stream', 'search'):
        logging.error(
            "invalid mode '%s': expected 'stream' or 'search'", mode
        )
        sys.exit(ERROR)
    try:
        auth_index = int(sys.argv[3])
    except ValueError:
        logging.error(
            "invalid auth_index '%s': expected an integer", sys.argv[3]
        )
        sys.exit(ERROR)

    db = get_database(config)

    c_key, c_secret, a_token, a_secret = get_credentials(config, auth_index)
    auth = tweepy.OAuthHandler(c_key, c_secret)
    auth.set_access_token(a_token, a_secret)

    if mode == 'stream':
        box = get_box(config)
        stream_listener = TwitterStreamListener(db)
        stream = tweepy.Stream(auth=auth, listener=stream_listener)
        stream.filter(locations=box)
    else:
        geo = get_geocode(config)
        api = tweepy.API(
            auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True
        )
        # NOTE(review): "*" is presumably the search query - confirm
        # against TwitterSearcher.
        searcher = TwitterSearcher(api, db, geo, "*")
        searcher.search()


if __name__ == "__main__":
    main()