-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeed_bills_to_pandas.py
136 lines (105 loc) · 3.58 KB
/
feed_bills_to_pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import json
import logging
import multiprocessing
import os
import sys
from multiprocessing import Pool

import numpy as np
import pandas as pd

import pandas_lib as pl
# Root of the scraped congress data. Each immediate sub-directory is treated
# as one congress, and bills are read from "<DATA_DIR>/<congress>/bills".
DATA_DIR = "./data/congress"
# Output directory for CSV files. Not referenced by the visible code --
# presumably consumed by pandas_lib when saving; TODO confirm.
OUT_DIR = "./data/csv"
class Congress(object):
    """
    Container for the DataFrames built for a single congress.

    Attributes default to ``None`` at class level and are filled in by the
    code that crawls the congress's bills.
    """

    # Name of the congress (the directory name it was read from).
    name = None
    # DataFrame with one row per bill; assigned after crawling.
    legislation = None
    # DataFrame mapping sponsors to bills; assigned after crawling.
    sponsors = None

    def __init__(self, name):
        """
        :param name: identifier of the congress
        """
        self.name = name
# Keys read from a bill, in the column order of the legislation record.
_LEGISLATION_FIELDS = (
    'congress',
    'bill_id',
    'bill_type',
    'enacted_as',
    'active',
    'active_at',
    'awaiting_signature',
    'enacted',
    'vetoed',
    'introduced_at',
    'number',
    'official_title',
    'popular_title',
    'short_title',
    'status',
    'status_at',
    'top_subject',
    'updated_at',
)


def extract_legislation(bill):
    """
    Return the legislation fields needed for the legislation DataFrame.

    :param bill: dict parsed from a bill's data.json
    :return list: one value per entry of ``_LEGISLATION_FIELDS``, in that
        order; keys missing from ``bill`` yield ``None``
    """
    return [bill.get(field) for field in _LEGISLATION_FIELDS]
def extract_sponsor(bill):
    """
    Return the fields needed to map a sponsor to a bill.

    :param bill: dict parsed from a bill's data.json
    :return list: ``[sponsor type, sponsor thomas_id, bill_id]``, or an empty
        list when the bill has no (or an empty) ``sponsor`` entry
    """
    # Fetch the multiprocessing logger here instead of relying on a global
    # that only exists when the file is run as a script.
    logger = multiprocessing.get_logger()
    logger.debug("Extracting Sponsor")
    sponsor_map = []
    sponsor = bill.get('sponsor')
    if sponsor:
        sponsor_map = [
            sponsor.get('type'),
            sponsor.get('thomas_id'),
            bill.get('bill_id'),
        ]
    logger.debug("END Extracting Sponsor")
    return sponsor_map
def crawl_congress(congress):
    """
    Walk one congress's bills directory and hand its data to pandas_lib.

    Every ``data.json`` found under ``DATA_DIR/<congress>/bills`` whose path
    does not contain ``text-versions`` is parsed. The legislation record and
    the sponsor mapping of each bill are collected, turned into DataFrames on
    a ``Congress`` object and passed to ``pl.save_congress``.

    :param congress: name of the congress sub-directory inside ``DATA_DIR``
    :return None: nothing is returned; the result goes to ``pl.save_congress``
    """
    # log_to_stderr() attaches an additional handler on every call, which
    # would duplicate each log line once per congress handled by this worker.
    # Only install a handler when the logger does not have one yet.
    logger = multiprocessing.get_logger()
    if not logger.handlers:
        multiprocessing.log_to_stderr()
    logger.setLevel(logging.DEBUG)
    logger.info(congress)

    congress_obj = Congress(congress)

    # Rows are accumulated in plain lists and converted once at the end,
    # because growing a DataFrame row by row is expensive.
    # Only legislation and sponsors are extracted so far; other relationships
    # (cosponsors, committees, amendments, subjects, titles, actions) are not.
    legislation = []
    sponsors = []

    bills = "{0}/{1}/bills".format(DATA_DIR, congress)
    for root, _dirs, files in os.walk(bills):
        if "data.json" not in files or "text-versions" in root:
            continue
        file_path = os.path.join(root, "data.json")
        # Close the file deterministically instead of leaking the handle.
        with open(file_path, 'r') as handle:
            bill = json.load(handle)
        legislation.append(extract_legislation(bill))
        sponsors.append(extract_sponsor(bill))

    congress_obj.legislation = pd.DataFrame(legislation)
    congress_obj.sponsors = pd.DataFrame(sponsors)
    pl.save_congress(congress_obj)
    logger.info("%s - %d", congress, len(legislation))
if __name__ == '__main__':
    # Script entry point: crawl every congress directory in a process pool.
    # `logger` stays a module-level name because functions in this file may
    # look it up as a global.
    logger = multiprocessing.log_to_stderr()
    logger.setLevel(logging.DEBUG)

    # Immediate sub-directories of DATA_DIR, one per congress. The default
    # avoids StopIteration when DATA_DIR does not exist.
    congress_dirs = next(os.walk(DATA_DIR), (DATA_DIR, [], []))[1]

    p = Pool(12)
    try:
        # map_async(...).get(timeout) rather than map(): waiting with a
        # timeout keeps the parent responsive to Ctrl-C.
        p.map_async(crawl_congress, congress_dirs).get(999999)
    except KeyboardInterrupt:
        p.terminate()
        p.join()
        print("You cancelled the program!")
        sys.exit(1)
    else:
        p.close()
        p.join()