From 52d70586dc6b305b546e6cc24ceb2661b0767842 Mon Sep 17 00:00:00 2001 From: JialuJialu Date: Fri, 9 Aug 2024 17:16:21 -0400 Subject: [PATCH 01/10] simplify the selection of years in the training_pipeline --- cities/modeling/modeling_utils.py | 51 +++++-------------------- cities/modeling/tau_caching_pipeline.py | 6 +-- cities/modeling/training_pipeline.py | 4 +- cities/queries/fips_query.py | 12 +++--- 4 files changed, 21 insertions(+), 52 deletions(-) diff --git a/cities/modeling/modeling_utils.py b/cities/modeling/modeling_utils.py index 5ca0e543..74464e47 100644 --- a/cities/modeling/modeling_utils.py +++ b/cities/modeling/modeling_utils.py @@ -91,49 +91,17 @@ def prep_wide_data_for_inference( assert f_covariates_joint["GeoFIPS"].equals(intervention["GeoFIPS"]) # extract data for which intervention and outcome overlap - year_min = max( - intervention.columns[2:].astype(int).min(), - outcome.columns[2:].astype(int).min(), - ) - - year_max = min( - intervention.columns[2:].astype(int).max(), - outcome.columns[2:].astype(int).max(), - ) - - assert all(intervention["GeoFIPS"] == outcome["GeoFIPS"]) - - outcome_years_to_keep = [ - year - for year in outcome.columns[2:] - if year_min <= int(year) <= year_max + forward_shift - ] - - outcome_years_to_keep = [ - year for year in outcome_years_to_keep if year in intervention.columns[2:] - ] - - outcome = outcome[outcome_years_to_keep] - - # shift outcome `forward_shift` steps ahead - # for the prediction task - outcome_shifted = outcome.copy() - - for i in range(len(outcome_years_to_keep) - forward_shift): - outcome_shifted.iloc[:, i] = outcome_shifted.iloc[:, i + forward_shift] - - years_to_drop = [ - f"{year}" for year in range(year_max - forward_shift + 1, year_max + 1) - ] - outcome_shifted.drop(columns=years_to_drop, inplace=True) - + outcome.drop(columns=["GeoFIPS", "GeoName"], inplace=True) intervention.drop(columns=["GeoFIPS", "GeoName"], inplace=True) - intervention = intervention[outcome_shifted.columns] + outcome_shifted = outcome.rename(lambda x: str(int(x) - forward_shift), axis=1) + years_available = [ + year for year in intervention.columns if year in outcome_shifted.columns + ] + intervention = intervention[years_available] + outcome_shifted = outcome_shifted[years_available] assert intervention.shape == outcome_shifted.shape - years_available = outcome_shifted.columns.astype(int).values - unit_index = pd.factorize(f_covariates_joint["GeoFIPS"].values)[0] state_index = pd.factorize(f_covariates_joint["GeoFIPS"].values // 1000)[0] @@ -163,13 +131,14 @@ def prep_wide_data_for_inference( model_args = (N_t, N_cov, N_s, N_u, state_index, unit_index) + int_year_available = [int(year) for year in years_available] return { "model_args": model_args, "x": x, "t": t, "y": y, - "years_available": years_available, - "outcome_years": outcome_years_to_keep, + "years_available": int_year_available, + "outcome_years": [str(year + forward_shift) for year in int_year_available], } diff --git a/cities/modeling/tau_caching_pipeline.py b/cities/modeling/tau_caching_pipeline.py index b517d522..29e3a51b 100644 --- a/cities/modeling/tau_caching_pipeline.py +++ b/cities/modeling/tau_caching_pipeline.py @@ -42,8 +42,8 @@ num_files = len(files) logging.info( - f"{(num_files-2)} sample dictionaries already exist. " - f"Starting to obtain {N_combinations_samples - (num_files -2)}" + f"{(num_files - 2)} sample dictionaries already exist. " + f"Starting to obtain {N_combinations_samples - (num_files - 2)}" f" out of {N_combinations_samples} sample dictionaries needed." ) remaining = N_combinations_samples - (num_files - 2) @@ -84,5 +84,5 @@ logging.info( f"All samples are now available." - f"Sampling took {session_ends - session_start:.2f} seconds, or {(session_ends - session_start)/60:.2f} minutes." + f"Sampling took {session_ends - session_start:.2f} seconds, or {(session_ends - session_start) / 60:.2f} minutes." ) diff --git a/cities/modeling/training_pipeline.py b/cities/modeling/training_pipeline.py index 38b019ed..e1f67f75 100644 --- a/cities/modeling/training_pipeline.py +++ b/cities/modeling/training_pipeline.py @@ -42,8 +42,8 @@ logging.info( - f"{(num_files-2)/2} guides already exist. " - f"Starting to train {N_combinations - (num_files -2)/2} out of {N_combinations} guides needed." + f"{(num_files - 2) / 2} guides already exist. " + f"Starting to train {N_combinations - (num_files - 2) / 2} out of {N_combinations} guides needed." ) remaining = N_combinations - (num_files - 2) / 2 diff --git a/cities/queries/fips_query.py b/cities/queries/fips_query.py index 2878f31d..5d6a14f3 100644 --- a/cities/queries/fips_query.py +++ b/cities/queries/fips_query.py @@ -467,9 +467,9 @@ def find_euclidean_kins(self): if col.endswith(feature) ] if _selected: - atemporal_aggregated_dict[ - feature - ] = atemporal_featurewise_contributions_df[_selected].sum(axis=1) + atemporal_aggregated_dict[feature] = ( + atemporal_featurewise_contributions_df[_selected].sum(axis=1) + ) aggregated_atemporal_featurewise_contributions_df = pd.DataFrame( atemporal_aggregated_dict @@ -489,9 +489,9 @@ def find_euclidean_kins(self): axis=1, ) columns_to_normalize = self.aggregated_featurewise_contributions.iloc[:, 3:] - self.aggregated_featurewise_contributions.iloc[ - :, 3: - ] = columns_to_normalize.div(columns_to_normalize.sum(axis=1), axis=0) + self.aggregated_featurewise_contributions.iloc[:, 3:] = ( + columns_to_normalize.div(columns_to_normalize.sum(axis=1), axis=0) + ) # some sanity checks count = sum([1 for distance in distances if distance == 0]) From 141cc30e5d9ddbd6ae200140ce30e85670a0a03c Mon Sep 17 00:00:00 2001 From: JialuJialu Date: Mon, 12 Aug 2024 16:01:59 -0400 Subject: [PATCH 02/10] reformat and change Adam to ClippedAdam --- cities/modeling/modeling_utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cities/modeling/modeling_utils.py b/cities/modeling/modeling_utils.py index 74464e47..76aaef19 100644 --- a/cities/modeling/modeling_utils.py +++ b/cities/modeling/modeling_utils.py @@ -6,7 +6,7 @@ import torch from pyro.infer import SVI, Trace_ELBO from pyro.infer.autoguide import AutoNormal -from pyro.optim import Adam +from pyro.optim.optim import ClippedAdam from cities.utils.data_grabber import ( DataGrabber, @@ -36,8 +36,8 @@ def prep_wide_data_for_inference( 4. Loads the required transformed features. 5. Merges fixed covariates into a joint dataframe based on a common ID column. 6. Ensures that the GeoFIPS (geographical identifier) is consistent across datasets. - 7. Extracts common years for which both intervention and outcome data are available. - 8. Shifts the outcome variable forward by the specified number of time steps. + 7. Shifts the outcome variable forward by the specified number of time steps determined by forward_shift. + 8. Extracts common years for which both intervention and outcome data are available. 9. Prepares tensors for input features (x), interventions (t), and outcomes (y). 10. Creates indices for states and units, preparing them as tensors. 11. Validates the shapes of the tensors. @@ -156,7 +156,10 @@ def train_interactions_model( guide = AutoNormal(conditioned_model) svi = SVI( - model=conditioned_model, guide=guide, optim=Adam({"lr": lr}), loss=Trace_ELBO() + model=conditioned_model, + guide=guide, + optim=ClippedAdam({"lr": lr}), + loss=Trace_ELBO(), ) losses = [] From c81e044ece0c8145db8e92983915379c4f0c6ce7 Mon Sep 17 00:00:00 2001 From: JialuJialu Date: Thu, 22 Aug 2024 14:59:42 -0400 Subject: [PATCH 03/10] reformat --- cities/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cities/__init__.py b/cities/__init__.py index 9db96c01..b76bbcc9 100644 --- a/cities/__init__.py +++ b/cities/__init__.py @@ -2,4 +2,5 @@ Project short description. """ -__version__ = "0.0.1" + +__version__ = "0.0.1" \ No newline at end of file From d59ad3b2307767558a4bfed373b1c1d75993466e Mon Sep 17 00:00:00 2001 From: JialuJialu Date: Thu, 22 Aug 2024 15:07:30 -0400 Subject: [PATCH 04/10] reformat again --- cities/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cities/__init__.py b/cities/__init__.py index b76bbcc9..f993e182 100644 --- a/cities/__init__.py +++ b/cities/__init__.py @@ -3,4 +3,4 @@ Project short description. """ -__version__ = "0.0.1" \ No newline at end of file +__version__ = "0.0.1" From 4256351711004b3c1241b710ce1f66728f710d6e Mon Sep 17 00:00:00 2001 From: rfl-urbaniak Date: Mon, 9 Sep 2024 09:05:17 -0400 Subject: [PATCH 05/10] update gitignore --- .gitignore | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.gitignore b/.gitignore index 89fa2675..bbeb945f 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,16 @@ tests/.coverage .vscode/launch.json data/sql/counties_database.db data/sql/msa_database.db +.Rproj.user +**/*.RData +**/*.Rhistory + +# data +data/minneapolis/processed/values_long.csv +data/minneapolis/processed/values_with_parking.csv +data/minneapolis/sourced/demographic/** +data/minneapolis/preds/** +data/minneapolis/sourced/parcel_to_census_tract_mappings/** +data/minneapolis/sourced/parcel_to_parking_info_mappings/** + +data/minneapolis/.pgpass From 4c6cb8ac474f12d5e7af1157e21d44e07adbee69 Mon Sep 17 00:00:00 2001 From: rfl-urbaniak Date: Mon, 9 Sep 2024 11:06:24 -0400 Subject: [PATCH 06/10] update scripts --- scripts/clean.sh | 21 +++++++++++++++------ scripts/lint.sh | 6 +++--- scripts/test_notebooks.sh | 2 +- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/scripts/clean.sh b/scripts/clean.sh index fe727a37..6918545f 100755 --- a/scripts/clean.sh +++ b/scripts/clean.sh @@ -1,13 +1,22 @@ #!/bin/bash set -euxo pipefail -# isort suspended till the CI-vs-local issue is resolved -# isort cities/ tests/ +# isort suspended as conflicting with black +# nbqa isort docs/guides/ + + +# this sometimes conflicts with black but does some +# preliminary import sorting +# and is then overriden by black +isort cities/ tests/ + +black ./cities/ ./tests/ ./docs/guides/ + +black docs/guides/ -black cities/ tests/ autoflake --remove-all-unused-imports --in-place --recursive ./cities ./tests -nbqa autoflake --remove-all-unused-imports --recursive --in-place docs/guides/ -# nbqa isort docs/guides/ -nbqa black docs/guides/ +nbqa autoflake --nbqa-shell --remove-all-unused-imports --recursive --in-place docs/guides/ + +#nbqa black docs/guides/ diff --git a/scripts/lint.sh b/scripts/lint.sh index 538aeeb1..5e5b9abe 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -3,10 +3,10 @@ set -euxo pipefail mypy --ignore-missing-imports cities/ #isort --check --diff cities/ tests/ -black --check cities/ tests/ +black --check cities/ tests/ docs/guides/ flake8 cities/ tests/ --ignore=E203,W503 --max-line-length=127 -nbqa autoflake -v --recursive --check docs/guides/ +nbqa autoflake --nbqa-shell -v --recursive --check docs/guides/ #nbqa isort --check docs/guides/ -nbqa black --check docs/guides/ + diff --git a/scripts/test_notebooks.sh b/scripts/test_notebooks.sh index b31a8820..f5defc99 100755 --- a/scripts/test_notebooks.sh +++ b/scripts/test_notebooks.sh @@ -1,5 +1,5 @@ #!/bin/bash -INCLUDED_NOTEBOOKS="docs/guides/ docs/testing_notebooks/" +INCLUDED_NOTEBOOKS="docs/guides/ " # docs/testing_notebooks/" will revert when the pyro-ppl 1.9 bug is fixed CI=1 pytest -v --nbval-lax --dist loadscope -n auto $INCLUDED_NOTEBOOKS From 51ce1238651080bd4badf49ce8ba8046e46fe27c Mon Sep 17 00:00:00 2001 From: JialuJialu Date: Wed, 18 Sep 2024 14:27:55 -0400 Subject: [PATCH 07/10] change back the optimizer and add back the downstream variable --- cities/modeling/modeling_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cities/modeling/modeling_utils.py b/cities/modeling/modeling_utils.py index 336da760..c9f5d008 100644 --- a/cities/modeling/modeling_utils.py +++ b/cities/modeling/modeling_utils.py @@ -124,6 +124,9 @@ def prep_wide_data_for_inference( assert f_covariates_joint["GeoFIPS"].equals(intervention["GeoFIPS"]) + # This is for the downstream variable + outcome_years_to_keep = [year for year in outcome.columns[2:] if year - forward_shift in intervention.columns[2:]] + # extract data for which intervention and outcome overlap outcome.drop(columns=["GeoFIPS", "GeoName"], inplace=True) intervention.drop(columns=["GeoFIPS", "GeoName"], inplace=True) @@ -193,7 +196,7 @@ def train_interactions_model( svi = SVI( model=conditioned_model, guide=guide, - optim=ClippedAdam({"lr": lr}), + optim=Adam({"lr": lr}), # type: ignore loss=Trace_ELBO(), ) From 0870b857001be272e495fd229cba6406c13a5ec8 Mon Sep 17 00:00:00 2001 From: JialuJialu Date: Wed, 18 Sep 2024 15:04:29 -0400 Subject: [PATCH 08/10] reformat --- cities/modeling/modeling_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cities/modeling/modeling_utils.py b/cities/modeling/modeling_utils.py index c9f5d008..db644ada 100644 --- a/cities/modeling/modeling_utils.py +++ b/cities/modeling/modeling_utils.py @@ -125,7 +125,11 @@ def prep_wide_data_for_inference( assert f_covariates_joint["GeoFIPS"].equals(intervention["GeoFIPS"]) # This is for the downstream variable - outcome_years_to_keep = [year for year in outcome.columns[2:] if year - forward_shift in intervention.columns[2:]] + outcome_years_to_keep = [ + year + for year in outcome.columns[2:] + if year - forward_shift in intervention.columns[2:] + ] # extract data for which intervention and outcome overlap outcome.drop(columns=["GeoFIPS", "GeoName"], inplace=True) @@ -196,7 +200,7 @@ def train_interactions_model( svi = SVI( model=conditioned_model, guide=guide, - optim=Adam({"lr": lr}), # type: ignore + optim=Adam({"lr": lr}), # type: ignore loss=Trace_ELBO(), ) From 2c05ea0486e3ba3d602ef7a70d2fbc7505592cf5 Mon Sep 17 00:00:00 2001 From: JialuJialu Date: Wed, 18 Sep 2024 17:10:14 -0400 Subject: [PATCH 09/10] fix type error --- cities/modeling/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cities/modeling/modeling_utils.py b/cities/modeling/modeling_utils.py index db644ada..a1be3e47 100644 --- a/cities/modeling/modeling_utils.py +++ b/cities/modeling/modeling_utils.py @@ -128,7 +128,7 @@ def prep_wide_data_for_inference( outcome_years_to_keep = [ year for year in outcome.columns[2:] - if year - forward_shift in intervention.columns[2:] + if str(int(year) - forward_shift) in intervention.columns[2:] ] # extract data for which intervention and outcome overlap From 8824f7f03c4a42ca33d1eecc3f909ec968b5b300 Mon Sep 17 00:00:00 2001 From: JialuJialu Date: Wed, 18 Sep 2024 17:24:40 -0400 Subject: [PATCH 10/10] fix unused variables --- cities/modeling/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cities/modeling/modeling_utils.py b/cities/modeling/modeling_utils.py index a1be3e47..30ea4d48 100644 --- a/cities/modeling/modeling_utils.py +++ b/cities/modeling/modeling_utils.py @@ -178,7 +178,7 @@ def prep_wide_data_for_inference( "x": x, "t": t, "y": y, - "years_available": years_available, + "years_available": int_year_available, "outcome_years": outcome_years_to_keep, "covariates_df": f_covariates_joint, }