Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding CTFipsQuery class and demo #138

Merged
merged 9 commits into from
Aug 14, 2024
Merged
118 changes: 118 additions & 0 deletions cities/queries/fips_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import plotly.graph_objects as go

from cities.utils.data_grabber import (
CTDataGrabberCSV,
DataGrabber,
MSADataGrabber,
check_if_tensed,
Expand Down Expand Up @@ -795,3 +796,120 @@ def __init__(
self.outcome_with_percentiles["percentile"] * 100, 2
)
self.outcome_percentile_range = outcome_percentile_range


class CTFipsQuery(FipsQuery):
    """Census-tract-level variant of ``FipsQuery``.

    The constructor does not call ``FipsQuery.__init__``; it sets up every
    attribute itself, loading data through ``CTDataGrabberCSV`` and using
    ``"population"`` as the baseline variable.
    """

    def __init__(
        self,
        fips,
        outcome_var=None,
        feature_groups_with_weights=None,
        lag=0,
        top=5,
        time_decay=1.08,
        outcome_comparison_period=None,
        outcome_percentile_range=None,
        ct_time_period: str = "pre_2020",  # "pre_2020" or "post_2020"
    ):
        """Validate the query parameters and load the required features.

        Args:
            fips: Identifier of the location of interest; it must occur in
                the ``GeoFIPS`` column of the standardized population data.
            outcome_var: Optional name of the outcome feature. When given,
                its standardized data must pass ``check_if_tensed``.
            feature_groups_with_weights: Mapping from feature-group name to
                an integer weight between -4 and 4. Defaults to
                ``{outcome_var: 4}`` when omitted and ``outcome_var`` is
                set. The mapping passed in is not modified.
            lag: Integer between 0 and 5; only allowed to be positive when
                ``outcome_var`` is given.
            top: Positive integer smaller than 100.
            time_decay: Stored on the instance as ``time_decay``.
            outcome_comparison_period: Only accepted when ``lag == 0`` and
                ``outcome_var`` is given; stored on the instance.
            outcome_percentile_range: Only accepted when ``outcome_var`` is
                given; stored on the instance.
            ct_time_period: Forwarded to ``list_available_features`` and
                ``CTDataGrabberCSV``.

        Raises:
            AssertionError: If any of the validations above fails.
            KeyError: If ``outcome_var`` is given together with a
                ``feature_groups_with_weights`` that does not contain it.
        """
        if feature_groups_with_weights is None:
            # With no outcome variable this stays empty, so the
            # "at least one feature group" assertion below reports the
            # problem instead of an AttributeError on ``None``.
            feature_groups_with_weights = {outcome_var: 4} if outcome_var else {}

        if outcome_var:
            # Move the outcome variable to the front of the mapping. Work on
            # a copy so the caller's dictionary is left untouched.
            remaining = dict(feature_groups_with_weights)
            outcome_weight = remaining.pop(outcome_var)
            feature_groups_with_weights = {outcome_var: outcome_weight, **remaining}

        assert not (
            lag > 0 and outcome_var is None
        ), "Lag will be idle with no outcome variable"

        assert not (
            lag > 0 and outcome_comparison_period is not None
        ), "outcome_comparison_period is only used when lag = 0"

        assert not (
            outcome_var is None and outcome_comparison_period is not None
        ), "outcome_comparison_period requires an outcome variable"

        assert not (
            outcome_var is None and outcome_percentile_range is not None
        ), "outcome_percentile_range requires an outcome variable"

        self.all_available_features = list_available_features(
            level="census_tract", ct_time_period=ct_time_period
        )

        feature_groups = list(feature_groups_with_weights.keys())

        assert feature_groups, "You need to specify at least one feature group"

        assert all(
            isinstance(value, int) and -4 <= value <= 4
            for value in feature_groups_with_weights.values()
        ), "Feature weights need to be integers between -4 and 4"

        self.feature_groups_with_weights = feature_groups_with_weights
        self.feature_groups = feature_groups
        self.data = CTDataGrabberCSV(ct_time_period=ct_time_period)
        self.repo_root = self.data.repo_root
        self.fips = fips
        self.lag = lag
        self.top = top

        # Census-tract queries use population (rather than GDP) as the
        # default variable.
        self.population_var = "population"
        # Alias kept for the sake of using `find_euclidean_kins`, which
        # presumably reads `gdp_var` (defined outside this block).
        self.gdp_var = self.population_var

        # It is fine for these to stay None (their defaults).
        self.outcome_var = outcome_var
        self.outcome_comparison_period = outcome_comparison_period

        self.time_decay = time_decay

        # Population is always loaded, because the FIPS lookup below needs it.
        if self.population_var not in self.feature_groups:
            self.all_features = [self.population_var] + feature_groups
        else:
            self.all_features = feature_groups

        self.data.get_features_std_wide(self.all_features)
        self.data.get_features_wide(self.all_features)

        assert (
            fips in self.data.std_wide[self.population_var]["GeoFIPS"].values
        ), "FIPS not found in the data set."
        self.name = self.data.std_wide[self.population_var]["GeoName"][
            self.data.std_wide[self.population_var]["GeoFIPS"] == self.fips
        ].values[0]

        assert (
            self.lag >= 0 and self.lag < 6 and isinstance(self.lag, int)
        ), "lag must be an iteger between 0 and 5"
        assert (
            self.top > 0
            and isinstance(self.top, int)
            and self.top
            < 100  # TODO Make sure the number makes sense once we add all datasets we need
        ), "top must be a positive integer smaller than the number of locations in the dataset"

        if outcome_var:
            assert check_if_tensed(
                self.data.std_wide[self.outcome_var]
            ), "Outcome needs to be a time series."

            self.outcome_with_percentiles = self.data.std_wide[self.outcome_var].copy()
            most_recent_outcome = self.data.wide[self.outcome_var].iloc[:, -1].values

            # For each location, count how many locations have a strictly
            # smaller most recent outcome. Sorting once and binary-searching
            # yields the same counts as comparing every pair, without
            # materialising an n-by-n boolean matrix.
            sorted_outcome = np.sort(most_recent_outcome)
            n_below = np.searchsorted(sorted_outcome, most_recent_outcome, side="left")
            # NaN compares False against everything, so a NaN outcome has no
            # values "below" it; searchsorted would otherwise rank it last.
            n_below = np.where(np.isnan(most_recent_outcome), 0, n_below)

            self.outcome_with_percentiles["percentile"] = (
                n_below / most_recent_outcome.shape[0]
            )
            self.outcome_with_percentiles["percentile"] = round(
                self.outcome_with_percentiles["percentile"] * 100, 2
            )
            self.outcome_percentile_range = outcome_percentile_range
248 changes: 248 additions & 0 deletions docs/experimental_notebooks/CTfipsQuery_performance.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from cities.queries.fips_query import CTFipsQuery\n",
"from cities.utils.data_grabber import CTDataGrabberCSV, find_repo_root, list_available_features\n",
"import time\n",
"\n",
"root = find_repo_root()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Defining the Functions Used for the Comparison\n",
"\n",
"The goal is to demonstrate that the size of the census-tract-level variables is responsible for the slow execution of `CTFipsQuery`. To show this, we compare the time needed to construct a `CTFipsQuery` and run `find_euclidean_kins()` on the full `population` census-tract-level variable, and on copies containing only the first half and the first quarter of its rows.\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def save_first_half_of_population_data(data, feature_type, fraction, file_path, pre_or_post='pre_2020'):\n",
"\n",
" if fraction not in [0.25, 0.5]:\n",
" raise ValueError(\"Fraction must be 0.25 or 0.5.\")\n",
"\n",
" data = CTDataGrabberCSV(ct_time_period = pre_or_post)\n",
"\n",
" if feature_type == 'std_wide':\n",
" data.get_features_std_wide(['population'])\n",
" pop = data.std_wide['population']\n",
" elif feature_type == 'wide':\n",
" data.get_features_wide(['population'])\n",
" pop = data.wide['population']\n",
" else:\n",
" raise ValueError(\"Invalid feature_type. Must be 'std_wide' or 'wide'.\")\n",
" \n",
" num_rows = len(pop)\n",
" num_to_save = int(num_rows * fraction)\n",
" \n",
" selected_data = pop.iloc[:num_to_save]\n",
"\n",
" os.makedirs(os.path.dirname(file_path), exist_ok=True)\n",
"\n",
" selected_data.to_csv(file_path, index=False)\n",
"\n",
"\n",
"\n",
"def delete_file(file_path):\n",
" if os.path.isfile(file_path):\n",
" os.remove(file_path)\n",
" print(f\"Deleted: {file_path}\")\n",
" else:\n",
" print(f\"File not found: {file_path}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"def measure_time(command):\n",
"\n",
" start_time = time.time()\n",
" command()\n",
" end_time = time.time()\n",
" return end_time - start_time\n",
"\n",
"def run_query(fips, population_type, ct_time_period):\n",
"\n",
" f = CTFipsQuery(\n",
" fips,\n",
" population_type,\n",
" ct_time_period= ct_time_period,\n",
" )\n",
" f.find_euclidean_kins()\n",
"\n",
"def main(fips, ct_time = \"pre_2020\"):\n",
"\n",
" time_population = measure_time(lambda: run_query(fips, \"population\", ct_time))\n",
" time_population_half = measure_time(lambda: run_query(fips, \"populationHalf\", ct_time))\n",
" time_population_quarter = measure_time(lambda: run_query(fips, \"populationQuarter\", ct_time))\n",
"\n",
" print(f\"Time for 'population': {time_population:.4f} seconds\")\n",
" print(f\"Time for 'populationHalf': {time_population_half:.4f} seconds\")\n",
" print(f\"Time for 'populationQuarter': {time_population_quarter:.4f} seconds\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The test for `pre_2020` population variable"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time for 'population': 47.8251 seconds\n",
"Time for 'populationHalf': 2.6128 seconds\n",
"Time for 'populationQuarter': 1.0029 seconds\n",
"Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_pre2020_CT_std_wide.csv\n",
"Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_pre2020_CT_wide.csv\n",
"Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_pre2020_CT_std_wide.csv\n",
"Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_pre2020_CT_wide.csv\n"
]
}
],
"source": [
"# halfs\n",
"\n",
"std_wide_path = f\"{root}/data/Census_tract_level/populationHalf_pre2020_CT_std_wide.csv\"\n",
"wide_path = f\"{root}/data/Census_tract_level/populationHalf_pre2020_CT_wide.csv\"\n",
"\n",
"save_first_half_of_population_data(data, 'std_wide', 0.5, std_wide_path, pre_or_post='pre_2020')\n",
"save_first_half_of_population_data(data, 'wide', 0.5, wide_path, pre_or_post='pre_2020')\n",
"\n",
"# quarters\n",
"\n",
"std_wide_pathQ = f\"{root}/data/Census_tract_level/populationQuarter_pre2020_CT_std_wide.csv\"\n",
"wide_pathQ = f\"{root}/data/Census_tract_level/populationQuarter_pre2020_CT_wide.csv\"\n",
"\n",
"save_first_half_of_population_data(data, 'std_wide', 0.25, std_wide_pathQ, pre_or_post='pre_2020')\n",
"save_first_half_of_population_data(data, 'wide', 0.25, wide_pathQ, pre_or_post='pre_2020')\n",
"\n",
"# the test\n",
"\n",
"fips_number = 1003010705\n",
"main(fips_number, \"pre_2020\")\n",
"\n",
"# deleting the files created for the comparison\n",
"file_paths = [std_wide_path, wide_path, std_wide_pathQ, wide_pathQ]\n",
"\n",
"for file_path in file_paths:\n",
" delete_file(file_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The test for `post_2020` population variable"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time for 'population': 81.2661 seconds\n",
"Time for 'populationHalf': 3.4648 seconds\n",
"Time for 'populationQuarter': 1.0445 seconds\n",
"Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_post2020_CT_std_wide.csv\n",
"Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_post2020_CT_wide.csv\n",
"Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_post2020_CT_std_wide.csv\n",
"Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_post2020_CT_wide.csv\n"
]
}
],
"source": [
"# halfs\n",
"\n",
"std_wide_path = f\"{root}/data/Census_tract_level/populationHalf_post2020_CT_std_wide.csv\"\n",
"wide_path = f\"{root}/data/Census_tract_level/populationHalf_post2020_CT_wide.csv\"\n",
"\n",
"save_first_half_of_population_data(data, 'std_wide', 0.5, std_wide_path, pre_or_post='post_2020')\n",
"save_first_half_of_population_data(data, 'wide', 0.5, wide_path, pre_or_post='post_2020')\n",
"\n",
"# quarters\n",
"\n",
"std_wide_pathQ = f\"{root}/data/Census_tract_level/populationQuarter_post2020_CT_std_wide.csv\"\n",
"wide_pathQ = f\"{root}/data/Census_tract_level/populationQuarter_post2020_CT_wide.csv\"\n",
"\n",
"save_first_half_of_population_data(data, 'std_wide', 0.25, std_wide_pathQ, pre_or_post='post_2020')\n",
"save_first_half_of_population_data(data, 'wide', 0.25, wide_pathQ, pre_or_post='post_2020')\n",
"\n",
"# the test\n",
"\n",
"fips_number = 12117021203\n",
"main(fips_number, \"post_2020\")\n",
"\n",
"# deleting the files created for the comparison\n",
"file_paths = [std_wide_path, wide_path, std_wide_pathQ, wide_pathQ]\n",
"\n",
"for file_path in file_paths:\n",
" delete_file(file_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading
Loading