-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfrb_scraper.py
85 lines (74 loc) · 4.23 KB
/
frb_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import time
# Landing page that hosts the report-form drop-down menu.
_REPORT_FORMS_URL = "http://www.federalreserve.gov/apps/reportforms/default.aspx"

# Middle part of the "#MainContent_lbl_<field>_data" label ids read from each
# form's detail page.
_DETAIL_FIELDS = ('Description', 'OMB', 'Background', 'RespondentPanel',
                  'Frequency', 'PublicRelease')


def _load_form_options(driver):
    """Open the start page and return the <option> elements of the form menu."""
    driver.get(_REPORT_FORMS_URL)
    main_menu = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#MainContent_ddl_ReportForms")))
    return main_menu.find_elements(By.TAG_NAME, "option")


def _scrape_form_details(driver, form_id):
    """Collect the detail labels of the form page currently shown by ``driver``."""
    # Imported here so the block does not depend on a change to the file's
    # top-level import list.
    from selenium.common.exceptions import NoSuchElementException

    details = dict.fromkeys(_DETAIL_FIELDS)
    for field in _DETAIL_FIELDS:
        try:
            label = driver.find_element(
                By.CSS_SELECTOR, "#MainContent_lbl_" + field + "_data")
        except NoSuchElementException:
            # This form has no such label; the value stays None.
            continue
        details[field] = label.text
    details['FormNumber'] = form_id
    return details


def get_all_items():
    """Scrape the details of every report form listed in the drop-down menu.

    Starts a virtual display and a Firefox WebDriver, then, for each menu
    option after the first, selects it, presses the submit button and reads
    the detail labels from the resulting page.

    Returns:
        A list with one dict per form. Each dict has the key 'FormNumber'
        (the option's text) plus one key per entry of ``_DETAIL_FIELDS``;
        a field whose label is not present on the page maps to None.

    Raises:
        selenium.common.exceptions.TimeoutException: if the menu or the
            submit button does not appear within its wait timeout.
    """
    all_items = []

    display = Display(visible=0, size=(1024, 768))
    display.start()
    try:
        driver = webdriver.Firefox()
        try:
            form_options = _load_form_options(driver)
            option_count = len(form_options)

            # The range starts at 1, so the first option is never visited
            # (presumably a placeholder entry -- confirm against the page).
            for form_i in range(1, option_count):
                form = form_options[form_i]
                form.click()
                form_id = form.text

                submit_button = WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#MainContent_btn_GetForm")))
                submit_button.click()

                all_items.append(_scrape_form_details(driver, form_id))

                # The option elements found earlier go stale once the browser
                # has navigated away (StaleElementReferenceException), so the
                # start page is reloaded and the options are looked up again.
                form_options = _load_form_options(driver)
        finally:
            # quit() ends the whole WebDriver session; it runs even when
            # scraping fails so no browser process is left behind.
            driver.quit()
    finally:
        display.stop()

    return all_items
def main():
    """Scrape all report forms and write them to ``forms.csv``.

    The file is created in the current working directory with a header row
    followed by one row per form. Columns are, in order: FormNumber,
    Description, OMB, Background, RespondentPanel, Frequency, PublicRelease.
    """
    import csv
    import sys

    all_items = get_all_items()
    keys = ['FormNumber', 'Description', 'OMB', 'Background',
            'RespondentPanel', 'Frequency', 'PublicRelease']

    if sys.version_info[0] >= 3:
        # Python 3's csv module writes text: the file must be opened in text
        # mode, with newline='' so the writer controls line endings itself.
        output_file = open('forms.csv', 'w', newline='', encoding='utf-8')
    else:
        # Python 2's csv module writes bytes and cannot encode non-ASCII
        # unicode values on its own, so text is encoded as UTF-8 up front.
        text_type = type(u'')
        all_items = [
            dict((key, value.encode('utf-8')
                  if isinstance(value, text_type) else value)
                 for key, value in row.items())
            for row in all_items
        ]
        output_file = open('forms.csv', 'wb')

    with output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(all_items)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()