-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathGenerateTransactees.py
158 lines (152 loc) · 7.25 KB
/
GenerateTransactees.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/python3
##############################################################################
#
# File: GenerateTransactees.py
# Author: Matthew Leeds <[email protected]>
# License: GNU GPL <http://www.gnu.org/licenses/gpl.html>
# Purpose: This script reads the four data files from
# http://fcpa.alabamavotes.gov/PublicSite/DataDownload.aspx
# and makes a list of transactees (contributors, payees, and receipt sources)
# that can be geocoded. Transactees refer to people or organizations who do
# business with (or contribute to) political parties (PACs/Candidates).
# The txIDs are the unique identifiers for rows in the data files,
# so either ReceiptID, ExpenditureID, InKindContributionID, or ContributionID
# depending on the file. They are only unique within their own file.
# The data format is documented on the GitHub wiki. Configuration parameters
# will be read from 'config.ini' in the current directory.
#
##############################################################################
import sys
import json
import csv
from uuid import uuid4
from datetime import datetime
from configparser import ConfigParser
def main():
    """Build the transactee list for the configured year and write it as JSON.

    Settings are read from the GENERATE_TRANSACTEES section of 'config.ini'
    in the current directory. If the output file already exists, its records
    are loaded first so ids assigned in that earlier run can be reused, and
    they are merged back into the result before it is written out again.
    """
    global allTransactees
    global existingTransactees
    global transacteeIDs

    # Pull every setting from the single section this script owns.
    section = 'GENERATE_TRANSACTEES'
    config = ConfigParser()
    config.read('config.ini')
    YEAR = config.get(section, 'YEAR')
    DATA_DIR = config.get(section, 'DATA_DIR')
    DATAFILES = json.loads(config.get(section, 'DATAFILES'))
    OUTFILE = config.get(section, 'OUTFILE')
    PRETTY_PRINT = config.getboolean(section, 'PRETTY_PRINT')
    outPath = DATA_DIR + OUTFILE

    allTransactees = []  # master list of Transactees

    # (filename marker, (ID column, org type column, transactee type)).
    # Markers are tried in this order and the first one found wins.
    fileKinds = (
        ('CashContribution', ('ContributionID', 'ContributorType', 'Contributor')),
        ('Expenditure', ('ExpenditureID', '', 'Payee')),
        ('InKindContribution', ('InKindContributionID', 'ContributorType', 'Contributor')),
        ('OtherReceipts', ('ReceiptID', 'ReceiptSourceType', 'ReceiptSource')),
    )
    recordTypes = {}
    for filename in DATAFILES:
        for marker, columns in fileKinds:
            if marker in filename:
                recordTypes[filename] = columns
                break
        else:
            print('>> Unrecognized filename: ' + filename + '. Quitting.')
            sys.exit(1)

    # Pick up the output of a previous run, if any, so its id values survive.
    existingTransactees = []
    try:
        with open(outPath) as previousRun:
            existingTransactees = json.load(previousRun)
    except FileNotFoundError:
        # Fresh run: every transactee will receive a newly generated id.
        pass
    hadExisting = len(existingTransactees) > 0
    if hadExisting:
        print('>> Loaded ' + str(len(existingTransactees)) + ' records from ' + OUTFILE)

    # Index the known ids by year + transaction type + transaction id so that
    # process() can look them up in constant time.
    transacteeIDs = {}
    for entry in existingTransactees:
        txIDs = entry['transaction_ids']
        if isinstance(txIDs, str):  # Undo flattening
            txIDs = json.loads(txIDs.replace('\'', '"'))
        for txID in txIDs:
            transacteeIDs[entry['filed_year'] + entry['transaction_type'] + txID] = entry['id']

    # Feed each source file through process().
    for filename in DATAFILES:
        print('>> Loading data from ' + filename)
        with open(DATA_DIR + filename, 'r', errors='ignore', newline='') as csvfile:
            process(YEAR, csv.DictReader(csvfile), recordTypes[filename])

    # Fold the records from disk back in, skipping exact duplicates.
    numDuplicates = mergeExistingTransactees()
    if hadExisting:
        print('>> Merged transactees with records from the disk; there were ' + str(numDuplicates) + ' duplicates.')

    print('>> Writing ' + str(len(allTransactees)) + ' records to ' + OUTFILE)
    if PRETTY_PRINT:
        dumpOptions = {'sort_keys': True, 'indent': 4, 'separators': (',', ': ')}
    else:
        dumpOptions = {}
    with open(outPath, 'w') as datafile:
        json.dump(allTransactees, datafile, **dumpOptions)
def process(year, records, recordTypes):
    """Fold every row of one data file into the global allTransactees list.

    Rows that share the same normalized name and address within this call
    are merged into a single entry whose 'transaction_ids' list collects all
    of their transaction ids. Rows with an empty name are never merged; each
    one becomes its own entry. Rows whose ContributorType or
    ReceiptSourceType is 'PAC' are skipped.

    Args:
        year: the year the data is from, as a string.
        records: an iterable of dict rows, e.g. a csv.DictReader.
        recordTypes: (<id col name>, <org type col name>, <transactee type>).
            The org type column name may be '' when the file has none.

    Side effects:
        Appends to the global allTransactees and reads the global
        transacteeIDs to reuse ids generated in a previous run.
    """
    global transacteeIDs
    global allTransactees
    # idCol = ContributionID, ExpenditureID, InKindContributionID, or ReceiptID
    idCol = recordTypes[0]
    # orgTypeCol = ContributorType, ReceiptSourceType, or ''
    orgTypeCol = recordTypes[1]
    # transacteeType = Contributor, Payee, or ReceiptSource
    transacteeType = recordTypes[2]
    # The transaction type is the id column name minus its last two
    # characters (the trailing 'ID').
    transactionType = idCol[:-2]
    # Entries created by this call, keyed by (name, address). Matching is
    # limited to this call, so transactees of different files stay separate,
    # and the lookup is constant time instead of a scan of the list.
    seenThisFile = {}
    for record in records:
        if record.get('ReceiptSourceType') == 'PAC' or \
           record.get('ContributorType') == 'PAC':
            continue  # PACs are treated as Parties not Transactees
        nameParts = (record['FirstName'], record['MI'],
                     record['LastName'], record['Suffix'])
        # split()/join trims the ends and collapses runs of whitespace, so a
        # blank middle initial or suffix does not leave doubled spaces.
        name = ' '.join(' '.join(nameParts).split()).title()
        # title() lowercases roman-numeral suffixes; put them back.
        name = name.replace('Ii', 'II').replace('Iii', 'III').replace('IIi', 'III')
        addressParts = (record['Address1'], record['City'],
                        record['State'], record['Zip'])
        address = ' '.join(' '.join(addressParts).split())
        txID = record[idCol]
        # Files without an org type column (orgTypeCol == '') get ''.
        orgType = record.get(orgTypeCol, '')
        # treat each nameless person as unique
        if name:
            known = seenThisFile.get((name, address))
            if known is not None:
                # add this transaction id to the existing record
                known['transaction_ids'].append(txID)
                continue
        # we haven't seen them yet
        newOrg = {}
        newOrg['name'] = name
        newOrg['address'] = address
        newOrg['transactee_type'] = transacteeType
        newOrg['transaction_type'] = transactionType
        # If the transactee id was generated in a previous run, reuse it.
        try:
            newOrg['id'] = transacteeIDs[year + transactionType + txID]
        except KeyError:
            newOrg['id'] = str(uuid4()).upper()  # random unique id
        newOrg['API_status'] = ''  # will be used by geocoding script
        newOrg['organization_type'] = orgType
        newOrg['transaction_ids'] = [txID]
        newOrg['filed_year'] = year
        allTransactees.append(newOrg)
        if name:
            seenThisFile[(name, address)] = newOrg
def mergeExistingTransactees():
    """Append records loaded from disk that are not already in allTransactees.

    Each entry of the global existingTransactees is compared by equality
    against everything currently in the global allTransactees, including
    entries appended earlier in this same call. Entries that are already
    present are counted rather than appended.

    Returns:
        The number of existing records that were skipped as duplicates.
    """
    global allTransactees
    global existingTransactees
    duplicateCount = 0
    for previous in existingTransactees:
        if previous in allTransactees:
            duplicateCount += 1
            continue
        allTransactees.append(previous)
    return duplicateCount
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()