diff --git a/analysis/db/us_map/choroplethMap.py b/analysis/db/us_map/choroplethMap.py index 0bdd092..d5cd0e3 100644 --- a/analysis/db/us_map/choroplethMap.py +++ b/analysis/db/us_map/choroplethMap.py @@ -1,26 +1,26 @@ -import plotly.figure_factory as ff import numpy as np import plotly.io as pio from pyprojroot import here +import seaborn as sns +import matplotlib.pyplot as plt import plotly.express as px import pandas as pd from urllib.request import urlopen -import json import pathlib as pl +import json with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response: counties = json.load(response) -# TODO: Change range_color numbers in fig = px.choropleth() -# TODO: Build the map for a specific date -# TODO: Build line/bar graphs to check case numbers per state over a period of time -# TODO: Draw another version of this map, but accounts for population density per county (per capita count) -# TODO: See if rate is changing, counts over time (a 14 day sliding window count) -# Choropleth map with time slider and hover text - confirmed_df = pd.read_csv('https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/' 'csse_covid_19_time_series/time_series_covid19_confirmed_US.csv') +# Resource for State_FIPS: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697 loc_df = pd.read_excel(here('./data/db/original/maps/State_FIPS.xlsx')) +# Resource for PopulationEstimates: https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/ +pop_df = pd.read_excel(here('./data/db/original/maps/PopulationEstimates.xls')) # population dataset for 2019 + +pop_df['fips_str'] = pop_df['FIPStxt'].apply(lambda x: f'{x:05.0f}') +pop_df = pop_df[['fips_str', 'Area_Name', 'POP_ESTIMATE_2019']] merged_df = pd.merge(loc_df, confirmed_df, right_on='Admin2', left_on='Name') @@ -33,83 +33,28 @@ molten_df['date_iso'] = pd.to_datetime(molten_df['date'], format="%m/%d/%y") # change date to 
ISO8601 standard format -fips = molten_df['fips_str'].tolist() -# max_val = molten_df['value'].max() +molten_pop_df = pd.merge(molten_df, pop_df, on='fips_str') # add population per county +grouped_by = molten_pop_df.groupby(['fips_str', 'date_iso', 'State', 'Admin2', 'POP_ESTIMATE_2019'])['value'].sum().reset_index() +grouped_by['total_per_cap'] = grouped_by['value'] / grouped_by['POP_ESTIMATE_2019'] # get per capita value +plot_data = grouped_by[grouped_by.date_iso == '2020-04-01'] # confirmed cases on a specific day +value = 'value' # 'value' = raw count, 'total_per_cap' = per capita -fig = px.choropleth(molten_df, +# confirmed cases per capita/raw count +fig = px.choropleth(plot_data, geojson=counties, - locations=fips, - color='value', - animation_frame='date', - hover_data=['State', 'value'], - color_continuous_scale="Viridis", - range_color=(0, 300), + locations=plot_data.fips_str, + color=value, + # animation_frame='date', + hover_data=['State', 'Admin2', value, 'POP_ESTIMATE_2019'], + color_continuous_scale='viridis_r', + range_color=(0, plot_data[value].max()), scope="usa", title='Confirmed cases', labels={'value': 'confirmed cases'} ) fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0}) -# fig.show() - -# save out figure -# save out working data pl.Path(here("./output/maps", warn=False)).mkdir(parents=True, exist_ok=True) pio.write_html(fig, file=str(here("./output/maps/choropleth_us_cases.html", warn=False)), - auto_open=False) - - -# ChoroplethMap using FIPS from merged data -''' -confirmed_df = pd.read_csv('https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/' - 'csse_covid_19_time_series/time_series_covid19_confirmed_US.csv') - -loc_df = pd.read_excel(here('./data/db/original/maps/State_FIPS.xlsx')) -merged_df = pd.merge(loc_df, confirmed_df, right_on='Admin2', left_on='Name') - -merged_df['fips_str'] = merged_df['FIPS_x'].apply(lambda x: f'{x:05.0f}') # left pad with 0 for 5 digits - -colorscale = ["#f7fbff", "#ebf3fb", 
"#deebf7", "#d2e3f3", "#c6dbef", "#b3d2e9", "#9ecae1", - "#85bcdb", "#6baed6", "#57a0ce", "#4292c6", "#3082be", "#2171b5", "#1361a9", - "#08519c", "#0b4083", "#08306b"] - -molten_df = merged_df.melt( - id_vars=['FIPS_x', 'Name', 'State', 'UID', 'iso2', 'iso3', 'code3', 'FIPS_y', 'Admin2', - 'Province_State', 'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'fips_str'], - var_name=['date'] -) - -molten_df['date_iso'] = pd.to_datetime(molten_df['date'], format="%m/%d/%y") # change date to ISO8601 standard format - -endpts = list(np.linspace(0, 3000, len(colorscale) - 1)) -fips = molten_df['fips_str'].tolist() -values = molten_df['date_iso'].tolist() - -fig = ff.create_choropleth( - fips=fips, - # values=values, - scope=['usa', - # 'Alaska', - # 'Puerto Rico', - # 'American Samoa', - # 'Commonwealth of the Northern Mariana Islands', 'Guam', - # 'United States Virgin Islands' - ], - binning_endpoints=endpts, - colorscale=colorscale, - show_state_data=True, - show_hover=True, - centroid_marker={'opacity': 0}, - asp=2.9, - title='Confirmed cases on April 22', - legend_title='# confirmed cases', - text=molten_df['fips'] -) - -fig.layout.template = None -fig.show() -''' - - - + auto_open=True) diff --git a/analysis/db/us_map/graphs.py b/analysis/db/us_map/graphs.py new file mode 100644 index 0000000..7034e4a --- /dev/null +++ b/analysis/db/us_map/graphs.py @@ -0,0 +1,47 @@ +import matplotlib.pyplot as plt +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import numpy as np +from pyprojroot import here +import plotly.io as pio +import seaborn as sns +from urllib.request import urlopen +import pathlib as pl + + +confirmed_df = pd.read_csv('https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/' + 'csse_covid_19_time_series/time_series_covid19_confirmed_US.csv') +loc_df = pd.read_excel(here('./data/db/original/maps/State_FIPS.xlsx')) + +merged_df = pd.merge(loc_df, confirmed_df, right_on='Admin2', left_on='Name') + 
+merged_df['fips_str'] = merged_df['FIPS_x'].apply(lambda x: f'{x:05.0f}') # left pad with 0 for 5 digits +molten_df = merged_df.melt( + id_vars=['FIPS_x', 'Name', 'State', 'UID', 'iso2', 'iso3', 'code3', 'FIPS_y', 'Admin2', + 'Province_State', 'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'fips_str'], + var_name=['date'] +) + +molten_df['date_iso'] = pd.to_datetime(molten_df['date'], format="%m/%d/%y") # change date to ISO8601 standard format +# state = molten_df.loc[molten_df.Province_State == 'Virginia', molten_df.Province_State == 'New York'] +# molten_df['date_iso'] = molten_df.loc[molten_df.date_iso == '2020-04-01', molten_df.date_iso == '2020-04-05'] +state1 = 'Washington' +state2 = 'California' +subset = molten_df.loc[(molten_df.Province_State.isin([state1, state2])), + ['Province_State', 'Admin2', 'value', 'date_iso']] +# subset = molten_df[['Province_State', 'Admin2', 'value', 'date_iso']] +# subset = subset[subset.date_iso == '2020-04-01'] # for violinplot to show on certain date +grouped_counts = subset.groupby(['date_iso', 'Province_State', 'Admin2'])['value'].sum().reset_index() + +# Noninteractive graphs +''' +# ax = sns.lineplot(x="date_iso", y="value", hue='Province_State', data=grouped_counts) # show cases per state monthly +# ax = sns.stripplot(x="date_iso", y="value", hue='Province_State', data=grouped_counts) +# ax = sns.violinplot(x='date_iso', y='value', hue='Province_State', data=grouped_counts, palette="Set2", split=True, +# scale="count", inner="quartile") +# ax = sns.countplot(x="date_iso", hue='Province_State', data=grouped_counts) # works better if there are certain dates +# plt.tight_layout() +# plt.show() +''' +