Skip to content

Commit 8a0a2c0

Browse files
authored
Merge pull request #316 from sgsmob/geo
Refactor usafacts to use geo utils
2 parents ca046a6 + 6521db3 commit 8a0a2c0

File tree

5 files changed

+64
-107
lines changed

5 files changed

+64
-107
lines changed

usafacts/delphi_usafacts/geo.py

Lines changed: 19 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -2,63 +2,10 @@
22
"""Functions for converting geocodes."""
33
import pandas as pd
44

5+
from delphi_utils import GeoMapper
56

67
INCIDENCE_BASE = 100000
7-
# https://code.activestate.com/recipes/577775-state-fips-codes-dict/
8-
STATE_TO_FIPS = {
9-
"WA": "53",
10-
"DE": "10",
11-
"DC": "11",
12-
"WI": "55",
13-
"WV": "54",
14-
"HI": "15",
15-
"FL": "12",
16-
"WY": "56",
17-
"PR": "72",
18-
"NJ": "34",
19-
"NM": "35",
20-
"TX": "48",
21-
"LA": "22",
22-
"NC": "37",
23-
"ND": "38",
24-
"NE": "31",
25-
"TN": "47",
26-
"NY": "36",
27-
"PA": "42",
28-
"AK": "02",
29-
"NV": "32",
30-
"NH": "33",
31-
"VA": "51",
32-
"CO": "08",
33-
"CA": "06",
34-
"AL": "01",
35-
"AR": "05",
36-
"VT": "50",
37-
"IL": "17",
38-
"GA": "13",
39-
"IN": "18",
40-
"IA": "19",
41-
"MA": "25",
42-
"AZ": "04",
43-
"ID": "16",
44-
"CT": "09",
45-
"ME": "23",
46-
"MD": "24",
47-
"OK": "40",
48-
"OH": "39",
49-
"UT": "49",
50-
"MO": "29",
51-
"MN": "27",
52-
"MI": "26",
53-
"RI": "44",
54-
"KS": "20",
55-
"MT": "30",
56-
"MS": "28",
57-
"SC": "45",
58-
"KY": "21",
59-
"OR": "41",
60-
"SD": "46",
61-
}
8+
629
SECONDARY_FIPS = [
6310
("51620", ["51093", "51175"]),
6411
("51685", ["51153"]),
@@ -77,6 +24,7 @@
7724
("46102", "46113"),
7825
]
7926

27+
8028
FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}
8129

8230
# Valid geographical resolutions output by this indicator.
@@ -109,7 +57,6 @@ def fips_to_state(fips: str) -> str:
10957
"""
11058
return FIPS_TO_STATE[fips[:2]]
11159

112-
11360
def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
11461
"""Disburse counts from POOLED_FIPS equally to the counties in FIPS_LIST.
11562
@@ -148,7 +95,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
14895
Columns: fips, timestamp, new_counts, cumulative_counts, population ...
14996
geo_res: str
15097
Geographic resolution to which to aggregate. Valid options:
151-
('county', 'state', 'msa', 'hrr').
98+
("county", "state", "msa", "hrr").
15299
map_df: pd.DataFrame
153100
Loaded from static file "fips_prop_pop.csv".
154101
sensor: str
@@ -164,23 +111,28 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
164111
if geo_res not in VALID_GEO_RES:
165112
raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
166113

167-
df_mega = df[df['fips'].astype(int) % 1000 == 0].copy()
114+
# State-level records unassigned to specific counties are coded as fake
115+
# counties with fips XX000.
116+
unassigned_counties = df[df["fips"].str.endswith("000")].copy()
168117

169-
df = df[df['fips'].astype(int) % 1000 != 0].copy()
118+
df = df[df["fips"].astype(int) % 1000 != 0].copy()
170119
# Disburse unallocated cases/deaths in NYC to NYC counties
171120
df = disburse(df, NYC_FIPS[0][0], NYC_FIPS[0][1])
172-
df = df[df['fips'] != NYC_FIPS[0][0]]
121+
df = df[df["fips"] != NYC_FIPS[0][0]]
173122

174123
if geo_res == "county":
175124
if sensor not in PROP_SENSORS:
176-
df = df.append(df_mega)
125+
# It is not clear how to calculate the proportion for unallocated
126+
# cases/deaths, so we exclude them for those sensors.
127+
df = df.append(unassigned_counties)
177128
df["geo_id"] = df["fips"]
178129
elif geo_res == "state":
179130
# Grab first two digits of fips
180131
# Map state fips to us postal code
181132
# Add unallocated cases/deaths
182-
df = df.append(df_mega)
183-
df["geo_id"] = df["fips"].apply(fips_to_state)
133+
df = df.append(unassigned_counties)
134+
geo_mapper = GeoMapper()
135+
df = geo_mapper.add_geocode(df, "fips", "state_id", new_col="geo_id")
184136
elif geo_res in ("msa", "hrr"):
185137
# Map "missing" secondary FIPS to those that are in our canonical set
186138
for fips, fips_list in SECONDARY_FIPS:
@@ -192,12 +144,14 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
192144
map_df["geo_id"] = map_df[colname].astype(int)
193145
df["fips"] = df["fips"].astype(int)
194146
merged = df.merge(map_df, on="fips")
195-
merged["cumulative_counts"] = merged["cumulative_counts"] * merged["pop_prop"]
147+
merged["cumulative_counts"] =\
148+
merged["cumulative_counts"] * merged["pop_prop"]
196149
merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
197150
merged["population"] = merged["population"] * merged["pop_prop"]
198151
df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
199152
df = df.drop("fips", axis=1)
200153
df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
201154
df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
202-
df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
155+
df["cumulative_prop"] =\
156+
df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
203157
return df

usafacts/delphi_usafacts/pull.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"""Functions for pulling data from the USAFacts website."""
33
import numpy as np
44
import pandas as pd
5+
from delphi_utils import GeoMapper
56

67
# Columns to drop from the data frame.
78
DROP_COLUMNS = [
@@ -12,7 +13,7 @@
1213
]
1314

1415

15-
def pull_usafacts_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFrame:
16+
def pull_usafacts_data(base_url: str, metric: str, geo_mapper: GeoMapper) -> pd.DataFrame:
1617
"""Pulls the latest USA Facts data, and conforms it into a dataset
1718
1819
The output dataset has:
@@ -44,8 +45,8 @@ def pull_usafacts_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.D
4445
Base URL for pulling the USA Facts data
4546
metric: str
4647
One of 'confirmed' or 'deaths'. The keys of base_url.
47-
pop_df: pd.DataFrame
48-
Read from static file "fips_population.csv".
48+
geo_mapper: GeoMapper
49+
GeoMapper object with population info.
4950
5051
Returns
5152
-------
@@ -82,6 +83,19 @@ def pull_usafacts_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.D
8283

8384
# Conform FIPS
8485
df["fips"] = df["FIPS"].apply(lambda x: f"{int(x):05d}")
86+
87+
# The FIPS code 00001 is a dummy for unallocated NYC data. It doesn't have
88+
# a corresponding population entry in the GeoMapper so it will be dropped
89+
# in the call to `add_population_column()`. We pull it out here to
90+
# reinsert it after the population data is added.
91+
nyc_dummy_row = df[df["fips"] == "00001"]
92+
assert len(nyc_dummy_row) == 1
93+
94+
# Merge in population LOWERCASE, consistent across confirmed and deaths
95+
# Population for unassigned cases/deaths is NAN
96+
df = geo_mapper.add_population_column(df, "fips")
97+
df = df.append(nyc_dummy_row, ignore_index=True)
98+
8599
# Drop unnecessary columns (state is pre-encoded in fips)
86100
try:
87101
df.drop(DROP_COLUMNS, axis=1, inplace=True)

usafacts/delphi_usafacts/run.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
read_params,
1616
create_export_csv,
1717
S3ArchiveDiffer,
18+
GeoMapper
1819
)
1920

2021
from .geo import geo_map
@@ -88,12 +89,10 @@ def run_module():
8889
map_df = pd.read_csv(
8990
join(static_file_dir, "fips_prop_pop.csv"), dtype={"fips": int}
9091
)
91-
pop_df = pd.read_csv(
92-
join(static_file_dir, "fips_population.csv"),
93-
dtype={"fips": float, "population": float},
94-
).rename({"fips": "FIPS"}, axis=1)
9592

96-
dfs = {metric: pull_usafacts_data(base_url, metric, pop_df) for metric in METRICS}
93+
geo_mapper = GeoMapper()
94+
95+
dfs = {metric: pull_usafacts_data(base_url, metric, geo_mapper) for metric in METRICS}
9796
for metric, geo_res, sensor, smoother in product(
9897
METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
9998
print(geo_res, metric, sensor, smoother)

usafacts/tests/test_geo.py

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,22 @@
1-
import pytest
2-
31
from os.path import join
42

3+
import pytest
4+
55
import numpy as np
66
import pandas as pd
7-
from delphi_usafacts.geo import fips_to_state, disburse, geo_map
7+
from delphi_usafacts.geo import disburse, geo_map
88

99
MAP_DF = pd.read_csv(
1010
join("..", "static", "fips_prop_pop.csv"),
1111
dtype={"fips": int}
1212
)
1313

14-
sensor = "new_counts"
15-
class TestFipsToState:
16-
17-
def test_normal(self):
18-
19-
assert fips_to_state("53003") == "wa"
20-
assert fips_to_state("48027") == "tx"
21-
assert fips_to_state("12003") == "fl"
22-
assert fips_to_state("50103") == "vt"
23-
assert fips_to_state("15003") == "hi"
24-
14+
SENSOR = "new_counts"
2515

2616
class TestDisburse:
17+
"""Tests for the `geo.disburse()` function."""
2718
def test_even(self):
28-
19+
"""Tests that values are disbursed evenly across recipients."""
2920
df = pd.DataFrame(
3021
{
3122
"fips": ["51093", "51175", "51620"],
@@ -43,8 +34,9 @@ def test_even(self):
4334

4435

4536
class TestGeoMap:
37+
"""Tests for `geo.geo_map()`."""
4638
def test_incorrect_geo(self):
47-
39+
"""Tests that an invalid resolution raises an error."""
4840
df = pd.DataFrame(
4941
{
5042
"fips": ["53003", "48027", "50103"],
@@ -56,10 +48,10 @@ def test_incorrect_geo(self):
5648
)
5749

5850
with pytest.raises(ValueError):
59-
geo_map(df, "département", MAP_DF, sensor)
51+
geo_map(df, "département", MAP_DF, SENSOR)
6052

6153
def test_county(self):
62-
54+
"""Tests that values are correctly aggregated at the county level."""
6355
df = pd.DataFrame(
6456
{
6557
"fips": ["53003", "48027", "50103"],
@@ -70,7 +62,7 @@ def test_county(self):
7062
}
7163
)
7264

73-
new_df = geo_map(df, "county", MAP_DF, sensor)
65+
new_df = geo_map(df, "county", MAP_DF, SENSOR)
7466

7567
exp_incidence = df["new_counts"] / df["population"] * 100000
7668
exp_cprop = df["cumulative_counts"] / df["population"] * 100000
@@ -81,7 +73,7 @@ def test_county(self):
8173
assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
8274

8375
def test_state(self):
84-
76+
"""Tests that values are correctly aggregated at the state level."""
8577
df = pd.DataFrame(
8678
{
8779
"fips": ["04001", "04003", "04009", "25023"],
@@ -92,7 +84,7 @@ def test_state(self):
9284
}
9385
)
9486

95-
new_df = geo_map(df, "state", MAP_DF, sensor)
87+
new_df = geo_map(df, "state", MAP_DF, SENSOR)
9688

9789
exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
9890
exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
@@ -106,7 +98,7 @@ def test_state(self):
10698
assert (new_df["cumulative_prop"].values == exp_cprop).all()
10799

108100
def test_hrr(self):
109-
101+
"""Tests that values are correctly aggregated at the HRR level."""
110102
df = pd.DataFrame(
111103
{
112104
"fips": ["13009", "13017", "13021", "09015"],
@@ -117,7 +109,7 @@ def test_hrr(self):
117109
}
118110
)
119111

120-
new_df = geo_map(df, "hrr", MAP_DF, sensor)
112+
new_df = geo_map(df, "hrr", MAP_DF, SENSOR)
121113

122114
exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
123115
exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000
@@ -131,7 +123,7 @@ def test_hrr(self):
131123
assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop)
132124

133125
def test_msa(self):
134-
126+
"""Tests that values are correctly aggregated at the MSA level."""
135127
df = pd.DataFrame(
136128
{
137129
"fips": ["13009", "13017", "13021", "09015"],
@@ -142,7 +134,7 @@ def test_msa(self):
142134
}
143135
)
144136

145-
new_df = geo_map(df, "msa", MAP_DF, sensor)
137+
new_df = geo_map(df, "msa", MAP_DF, SENSOR)
146138

147139
exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
148140
exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000

usafacts/tests/test_pull.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,9 @@
33
from os.path import join
44

55
import pandas as pd
6+
from delphi_utils import GeoMapper
67
from delphi_usafacts.pull import pull_usafacts_data
78

8-
pop_df = pd.read_csv(
9-
join("..", "static", "fips_population.csv"),
10-
dtype={"fips": float, "population": float}
11-
).rename({"fips": "FIPS"}, axis=1)
12-
139
base_url_good = "test_data/small_{metric}.csv"
1410

1511
base_url_bad = {
@@ -18,11 +14,13 @@
1814
"extra_cols": "test_data/bad_{metric}_extra_cols.csv"
1915
}
2016

17+
geo_mapper = GeoMapper()
18+
2119

2220
class TestPullUSAFacts:
2321
def test_good_file(self):
2422
metric = "deaths"
25-
df = pull_usafacts_data(base_url_good, metric, pop_df)
23+
df = pull_usafacts_data(base_url_good, metric, geo_mapper)
2624

2725
assert (
2826
df.columns.values
@@ -34,21 +32,21 @@ def test_missing_days(self):
3432
metric = "confirmed"
3533
with pytest.raises(ValueError):
3634
df = pull_usafacts_data(
37-
base_url_bad["missing_days"], metric, pop_df
35+
base_url_bad["missing_days"], metric, geo_mapper
3836
)
3937

4038
def test_missing_cols(self):
4139

4240
metric = "confirmed"
4341
with pytest.raises(ValueError):
4442
df = pull_usafacts_data(
45-
base_url_bad["missing_cols"], metric, pop_df
43+
base_url_bad["missing_cols"], metric, geo_mapper
4644
)
4745

4846
def test_extra_cols(self):
4947

5048
metric = "confirmed"
5149
with pytest.raises(ValueError):
5250
df = pull_usafacts_data(
53-
base_url_bad["extra_cols"], metric, pop_df
51+
base_url_bad["extra_cols"], metric, geo_mapper
5452
)

0 commit comments

Comments
 (0)