forked from fgregg/scraping-intro
direct_example.py
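# Scrape quarterly tax figures, broken out by kind of business, from the
# Illinois Department of Revenue's KOBReport pages and write them to
# taxes.csv: one row per tax type per quarter, tagged with municipality
# and county.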
import csv
import urllib2
import time

from BeautifulSoup import BeautifulSoup

column_names = ['municipality',
                'county',
                'number_taxpayers',
                'tax_type',
                'year',
                'quarter',
                'general_merchandise',
                'food',
                'drinking_and_eating_places',
                'apparel',
                'furniture_and_hh_and_radio',
                'lumber_bldg_hardware',
                'automotive_and_filling_stations',
                'drugs_and_misc_retail',
                'agriculture_and_all_others',
                'manufacturers',
                'total']

# write the header row once; data rows are appended inside the scraping loop below
with open('taxes.csv', 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(column_names)

for year in range(1999, 2013):    # every quarter from 1999 through 2012
    for quarter in range(1, 5):
        # the report is addressed by p = year and quarter concatenated (e.g. 19991)
        url = ('https://www.revenue.state.il.us/app/kob/KOBReport?r=Specific&p='
               + str(year)
               + str(quarter)
               + '&m=0160001')

        response = urllib2.urlopen(url)
        soup = BeautifulSoup(response.read())

        tables = soup.fetch('table', {'cellspacing': '3'})

        # the first table is the form, so skip it
        tax_info = []
        for table in tables[1:]:
            taxes = table.fetch('td', {'align': 'right', 'valign': 'top'})
            tax_list = []
            for tax in taxes:
                tax_name = tax.contents[1].text
                # strip markup and thousands separators, then split the cell
                # into one line per amount
                tax_rows = str(tax)
                tax_rows = tax_rows.replace('<br />', '')
                tax_rows = tax_rows.replace('</td>', '')
                tax_rows = tax_rows.replace(',', '')
                tax_rows = tax_rows.split('\n')
                amounts = tax_rows[2:]
                tax_list.append((tax_name, amounts))

            # the nested table (width 600) holds the municipality, county,
            # and number-of-taxpayers headings
            heading = table.fetch('table', {'width': '600'})[0]
            headers = []
            for header in heading.fetch('td'):
                header_text = header.text
                if 'Number of Taxpayers' in header_text:
                    num_tax_payers = header_text.split(':')
                    num_tax_payers = num_tax_payers[1]
                    num_tax_payers = num_tax_payers.strip()
                    num_tax_payers = num_tax_payers.replace(',', '')
                    headers.append(num_tax_payers)
                else:
                    headers.append(header_text)

            tax_info.append((headers, tax_list))

        print url
        print tax_info

        # append one row per tax type for this quarter
        with open('taxes.csv', 'a') as csvfile:
            csvwriter = csv.writer(csvfile)
            for tax_body in tax_info:
                muni, county, num_taxpayers = tax_body[0]
                for taxes in tax_body[1]:
                    tax_row = [muni, county, num_taxpayers, taxes[0], year, quarter]
                    tax_row.extend(taxes[1])
                    print tax_row
                    csvwriter.writerow(tax_row)

        # pause for a few seconds before making the next request
        time.sleep(5)