Commit

davidpeckham committed Jul 24, 2021
1 parent 58d1e6b commit 9bdeef4
Showing 2 changed files with 140 additions and 0 deletions.
139 changes: 139 additions & 0 deletions Data Pipeline/00 - Download Dataset.ipynb
@@ -0,0 +1,139 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.8 64-bit ('base': conda)"
},
"interpreter": {
"hash": "e245b9d4d52625933425f13c940396e11f2ad0cf135519173d3aca2cac5d4603"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"source": [
"# Download Election Dataset\n",
" \n",
"Download our State Board of Elections dataset from http://nc-campaign-finance-storage.s3-website-us-east-1.amazonaws.com/"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pip install requests python-dateutil"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime, timezone\n",
"from dateutil import parser\n",
"from os import utime\n",
"from pathlib import Path\n",
"import requests\n",
"\n",
"DATA_URL = \"http://nc-campaign-finance-storage.s3.amazonaws.com/sboe-raw-files\"\n",
"DATA_DIR = \"./data\"\n",
"\n",
"raw_files = [\n",
" \"raw_files/contributions/contributions_20100101-20101231.csv\",\n",
" \"raw_files/contributions/contributions_20110101-20111231.csv\",\n",
" \"raw_files/contributions/contributions_20120101-20121231.csv\",\n",
" \"raw_files/contributions/contributions_20130101-20131231.csv\",\n",
" \"raw_files/contributions/contributions_20140101-20141231.csv\",\n",
" \"raw_files/contributions/contributions_20150101-20151231.csv\",\n",
" \"raw_files/contributions/contributions_20160101-20161231.csv\",\n",
" \"raw_files/contributions/contributions_20170101-20171231.csv\",\n",
" \"raw_files/contributions/contributions_20180101-20181231.csv\",\n",
" \"raw_files/contributions/contributions_20190101-20191231.csv\",\n",
" \"raw_files/contributions/contributions_20200101_20200630.csv\",\n",
" \"raw_files/contributions/contributions_20200701_20201231.csv\",\n",
" \"raw_files/expenses/expenses_20100101_20101231.csv\",\n",
" \"raw_files/expenses/expenses_20110101_20111231.csv\",\n",
" \"raw_files/expenses/expenses_20120101_20121231.csv\",\n",
" \"raw_files/expenses/expenses_20130101_20131231.csv\",\n",
" \"raw_files/expenses/expenses_20140101_20141231.csv\",\n",
" \"raw_files/expenses/expenses_20150101_20151231.csv\",\n",
" \"raw_files/expenses/expenses_20160101_20161231.csv\",\n",
" \"raw_files/expenses/expenses_20170101_20171231.csv\",\n",
" \"raw_files/expenses/expenses_20180101_20181231.csv\",\n",
" \"raw_files/expenses/expenses_20190101_20191231.csv\",\n",
" \"raw_files/expenses/expenses_20200101_20201231.csv\"\n",
"]\n",
"\n",
"def download_file(url, path):\n",
" with requests.get(url, stream=True) as response:\n",
" response.raise_for_status()\n",
" remote_size = int(response.headers[\"Content-Length\"])\n",
" remote_mtime = parser.parse(response.headers[\"Last-Modified\"])\n",
" if path.exists():\n",
" stats = path.stat()\n",
" local_size = stats.st_size\n",
" local_mtime = datetime.fromtimestamp(stats.st_mtime, timezone.utc)\n",
"\n",
" if local_size == remote_size and local_mtime == remote_mtime:\n",
" print(f'{path} skipped (already downloaded)')\n",
" return True\n",
" else:\n",
" path.parent.mkdir(parents=True, exist_ok=True)\n",
"\n",
" try:\n",
" with open(path, 'wb') as f:\n",
" for chunk in response.iter_content(chunk_size=8192):\n",
" f.write(chunk)\n",
" remote_ts = remote_mtime.timestamp()\n",
" os.utime(path, times=(remote_ts, remote_ts))\n",
" print(f'{path} downloaded')\n",
" return True\n",
" except:\n",
" path.unlink()\n",
" print(f'{path} incomplete, deleted')\n",
" return False\n",
"\n",
"\n",
"interrupted = False\n",
"\n",
"for file in raw_files:\n",
" url = f\"{DATA_URL}/{file}\"\n",
" path = Path(DATA_DIR, file)\n",
" if not download_file(url, path):\n",
" interrupted = True\n",
" break\n",
"\n",
"if interrupted:\n",
" print('Downloads were interrupted')\n",
"else:\n",
" print(\"Downloads complete\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
]
}
1 change: 1 addition & 0 deletions README.md
@@ -18,6 +18,7 @@ The Python scripts are Jupyter Notebooks, but should be easily converted to an i

## The scripts are meant to be run in order

* 00 - Download Dataset - downloads the raw files
* 01 - Preprocess - imports the raw files, sets up the Postgres tables and preps the data for dedupe
* 02 - Dedupe - this is the part that goes over the entire universe of donors and payees and determines whether they are the same despite misspellings and missing information
* 03 - Post Dedupe - this creates the views, copies the canonical IDs to the transactions, and parses out the various sources of committee information to determine party, candidate, and active years