-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathbitcrawl-script.py
executable file
·206 lines (184 loc) · 9.18 KB
/
bitcrawl-script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/python
"""Crawls the web looking for quantitative information about bitcoin popularity.
Calculates the correlation coefficient between several numerical values
gleaned from webpages.
The short sample set available for this contest entry revealed surprisingly
strong correlation among several parameters, like wikipedia Bitcoin article
visit/view rates and bitcoin price.
Examples (require Internet connection):
> > python bitcrawl-script.py
Standard Module Dependencies:
argparse ArgumentParser
urllib urlencode, urlopen,...
urllib2 HTTPRedirectHandler, HTTPCookieProcessor, ...
time sleep
datetime now()
httplib IncompleteRead
Nonstandard Module Dependencies:
tz Local
TODO:
1. deal with csv: http://www.google.com/trends/?q=bitcoin&ctab=0&geo=us&date=ytd&sort=0 ,
<a href='/trends/viz?q=bitcoin&date=ytd&geo=us&graph=all_csv&sort=0&scale=1&sa=N'>
other examples in comments below
2. poll domain name registries to determine the number of domain names with "bitcoin" in them or beginning with "bit" or having "bit" and "coin" in them
3. build website and REST to share bitcoin trend info, several domain names saved at bustaname under shopper@tg username
pairbit, bitpair, coinpair, paircoin, coorbit, bitcorr, bitcoinarbitrage, etc
4. generalize the search with AI and ML to identify good and bad trends for all common proper names--stock symbols, etc
a) write research paper to prove it does as good a job as a human stock analyst at predicting future price movements
b) write a browser plugin that allows a human to supervise the machine learning and identify useful/relevant quantitative data
5. implement the indexer and search engine for the double-star question 3 in CS101 and get quant data directly from the index
6. implement the Levenshtein distance algorithm from the CS101 exam for use in word-stemming and search term similarity estimate
7. add query to .forecast_data() & .plot_data(): "bitfloor.bids[0][0]"
8. add query to .forecast_data() & .plot_data(): "mean(bitfloor.bids[:][0])"
9. add query to .forecast_data() & .plot_data(): "diff(mtgox.last)"
:copyright: 2012 by Hobson Lane ([email protected]), see AUTHORS for details
:license: Creative Commons BY-NC-SA, see LICENSE for details"""
import bitcrawl as bc
from utils import size, size2, size3
from pprint import pprint
from argparse import ArgumentParser
from warnings import warn
def parse_args():
    """Parse the command line arguments.

    Builds an ``ArgumentParser`` whose description is this module's docstring
    and parses the process command line with it.

    Returns:
        The ``argparse`` namespace produced by ``ArgumentParser.parse_args()``,
        with the attributes ``bitfloor``, ``lead``, ``graph``, ``urls``,
        ``prefix``, ``regex``, ``verbose``, ``quiet``, ``nomine`` and ``path``.

    TODO:
        allow user to input a number format and prefix in some form other than python regexes
        add options or dictionary members to hold patterns for "unit-of-measure" and "suffix"
        generalize the format to allow user to ask the miner to count links at the url, rather than just extracting a literal value
    """
    # __doc__ is None when the interpreter discards docstrings (python -OO);
    # fall back to an empty description instead of crashing on None.strip().
    description = (__doc__ or '').strip()
    p = ArgumentParser(description=description)
    p.add_argument(
        '-b', '--bitfloor', '--bf',
        type=int,
        nargs='?',
        default=0,
        help='Retrieve this number of prices from the order book at bitfloor (TBD volume).',
    )
    p.add_argument(
        '-l', '--lead', '--lag',
        type=int,
        nargs='?',
        default=2,
        help='Compute the correlation coefficients for data that is lead by this amount of sample periods (days).',
    )
    p.add_argument(
        '-g', '--graph', '--plot',
        nargs='*',
        default=['mtgox.average', 'wikipedia_view_rate_Bitcoin.view_rate_Bitcoin'],
        help='List of values to plot.',
    )
    # The default is whatever object bitcrawl exposes as URLs; values typed on
    # the command line arrive as a list of strings instead.
    p.add_argument(
        '-u', '--urls', '--url',
        type=str,
        nargs='*',
        default=bc.URLs,
        help='URL to mine data. OR. List of dict of dict [{{,,},{,,,}},{{,,},{,,,}},,,] defining regexs and URLs to extract data.',
    )
    p.add_argument(
        '-p', '--prefix',
        type=str,
        nargs='*',
        default='',
        help='HTML that preceeds the desired numerical text.',
    )
    p.add_argument(
        '-r', '--regex', '--regexp', '--re',
        type=str,
        nargs='*',
        default='',
        help='Python/Perl regular expression to capture numerical string only.',
    )
    p.add_argument(
        '-v', '--verbose',
        action='store_true',
        default=False,
        help='Print out (to stdout) progress messages.',
    )
    p.add_argument(
        '-q', '--quiet',
        action='store_true',
        default=False,
        help="Don't output anything to stdout, not even the numerical values minded. Overrides verbose setting.",
    )
    p.add_argument(
        '-n', '--no-mine', '--no-mining', '--no-data', '--no-datamining', '--no-crawl', '--no-crawling',
        dest='nomine',
        action='store_true',
        default=False,
        help="Don't crawl the internet for numerical data.",
    )
    p.add_argument(
        '-f', '--path', '--filename',
        type=str,
        #nargs = '*', # other options '*','+', 2
        default=bc.FILEPATH,
        help='File to append the numerical data to (after converting to a string).',
    )
    return p.parse_args()
if __name__ == "__main__":
    # Script entry point. Two phases:
    #   1. unless quiet (or if verbose), load the logged data and display the
    #      correlation between the values named by --graph;
    #   2. unless --no-mine, crawl for fresh values and append them as one new
    #      record to the JSON array stored in the file named by --path.
    # NOTE(review): block nesting was reconstructed from the data flow
    # (json_string only exists inside the mining branch) -- confirm against
    # the authoritative copy of this file.
    o = parse_args()
    data = None

    if not o.quiet or o.verbose:
        data = bc.load_json(filepath=o.path, verbose=-2)
        print(o.graph)
        sites, values, datetimes = bc.parse_query(o.graph)
        # Single-argument print(...) produces the same output as the Python 2
        # print statement with two comma-separated items.
        print('sites %s' % (sites,))
        print('values %s' % (values,))
        rows = bc.retrieve_data(sites, values, datetimes)
        NM = size(rows)
        # sanity check on the dimensions reported by size() before correlating
        assert NM[1] >= NM[0], size(rows)
        #cols = bc.transpose_lists(rows)
        bc.display_correlation(rows=rows, labels=o.graph, leads=o.lead)
        #bc.plot_data(columns=cols,normalize=True)

    if not o.nomine:
        # mine hard-coded urls
        d = {}
        if isinstance(o.urls, dict):
            # TODO: this check and "iterification" of mine_data() should happen inside the function
            # check to see if all the dictionary keys look like urls
            if bc.are_all_urls(o.urls):
                for u, r in o.urls.items():
                    d[u] = bc.mine_data(url=u, prefixes=r, verbose=not o.quiet)
            # otherwise assume the new format where each dict key is a name, and 'url' is a key of the nested dict
            else:
                for name, r in o.urls.items():
                    # Pop 'url' from a shallow copy so the shared default
                    # (bc.URLs) is not mutated as a side effect.
                    prefixes = dict(r)
                    url = prefixes.pop('url')
                    d[name] = bc.mine_data(url=url, prefixes=prefixes, verbose=not o.quiet)
        else:
            raise ValueError('Invalid URL, prefix, or regex argument.')

        # CRAWLING and MINING done here, data stored as a list of lists of dicts of dicts
        data = [
            d,
            bc.bitfloor_book(verbose=not o.quiet),
            bc.wikipedia_view_rates(verbose=not o.quiet),
            bc.get_links(max_depth=0, verbose=not o.quiet),
        ]

        # compose a json string that can be appended to the end of a list within a json file (prefix = '')
        json_string = bc.join_json(data, prefix='', suffix='\n]\n')

        # TODO: make writable() check for a regex match (for formating and content verificaiton)
        # TODO: make writable() write the file with acceptable initial content
        if not bc.updateable(o.path, initial_content='[\n\n]\n'):
            print('ERROR! Unable to log data to "' + o.path + '". Printing to stdout instead...')
            print(json_string)
            raise RuntimeError('Unable to log data to "' + o.path + '".')

        # TODO: create a function in bitcrawl module for appending new json data to existing data, as this block does
        # see http://stackoverflow.com/a/1466036/623735 for definitions of file modes (write, read, update)
        #   + = update
        #   a+ = only allow you to seek and write after the end of the existing data
        #   w+ = truncate (delete existing data) before opening and updating/writing with new data
        #   r+ = leaves existing contents in tact and allows writing, reading, seeking anywhere (random access)
        with open(o.path, 'r+') as f:
            # 'r+' opens with the pointer at the start of the file, so move to the end explicitly
            f.seek(0, 2)  # go to position 0 relative to 2=EOF (1=current, 0=begin), not sure if this is required before the -3 seek
            f.seek(-3, 2)  # if you do this before seek(0,2) on a "a+" or "w+" file you get "[Errno 22] Invalid argument"
            # Writing from here overwrites the last 3 characters of the file,
            # i.e. the ']' that terminates the existing json array/list, so
            # the list can be continued with the new record.
            if f.tell() > 10:  # file isn't empty
                f.write(',\n')  # separate the new record from the previous one
            else:
                f.write('\n')  # first record: no separating comma needed
            f.write(json_string)

        if not o.quiet:
            print('Appended json records to "' + o.path + '"')
            try:
                print('MtGox price is ' + str(data[0]['mtgox']['average']))
            except KeyError:
                print('Unable to retrieve the MtGox price.  Network dropout? Format change at MtGox?')

        if o.verbose:
            # This header was a bare string expression (a no-op) before; print it.
            print('New data extracted from web pages...')
            print(json_string)