Merge branch 'main' into ru-sql
rfl-urbaniak authored Nov 15, 2024
2 parents 5fc7161 + 21f59c6 commit 6c16d14
Showing 9 changed files with 153 additions and 74 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,59 @@
name: Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main, staging-* ]
  workflow_dispatch:

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.10']
        os: [ubuntu-latest] # , macos-latest]

    steps:
      - uses: actions/checkout@v2
      - name: Ubuntu cache
        uses: actions/cache@v1
        if: startsWith(matrix.os, 'ubuntu')
        with:
          path: ~/.cache/pip
          key:
            ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }}
          restore-keys: |
            ${{ matrix.os }}-${{ matrix.python-version }}-
      - name: macOS cache
        uses: actions/cache@v1
        if: startsWith(matrix.os, 'macOS')
        with:
          path: ~/Library/Caches/pip
          key:
            ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }}
          restore-keys: |
            ${{ matrix.os }}-${{ matrix.python-version }}-
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .[dev]
      # - name: Generate databases
      #   run: python cities/utils/csv_to_db_pipeline.py

      - name: Test
        run: python -m pytest tests/

      - name: Test Notebooks
        run: |
          ./scripts/test_notebooks.sh
14 changes: 14 additions & 0 deletions .gitignore
@@ -28,3 +28,17 @@ docs/experimental_notebooks/zoning/population_preds.dill
docs/experimental_notebooks/zoning/waic_dict_7.pkl
docs/experimental_notebooks/zoning/waic_dict_13.pkl
docs/experimental_notebooks/zoning/waic_dict_14.pkl

.Rproj.user
**/*.RData
**/*.Rhistory

# data
data/minneapolis/processed/values_long.csv
data/minneapolis/processed/values_with_parking.csv
data/minneapolis/sourced/demographic/**
data/minneapolis/preds/**
data/minneapolis/sourced/parcel_to_census_tract_mappings/**
data/minneapolis/sourced/parcel_to_parking_info_mappings/**

data/minneapolis/.pgpass
64 changes: 55 additions & 9 deletions README.md
@@ -3,19 +3,52 @@
</p>


## Evaluating Policy Transfer via Similarity Analysis and Causal Inference
# Evaluating Policy Transfer via Similarity Analysis and Causal Inference


## Getting started


Welcome to the repository for [polis](http://polis.basis.ai/), developed by [Basis Research Institute](https://www.basis.ai/) for [The Opportunity Project (TOP)](https://opportunity.census.gov/) 2023 in collaboration with the U.S. Department of Commerce. The primary goal of this project is to enhance access to data for local policymakers, facilitating more informed decision-making.

This is the backend repository for more advanced users. For a more pleasant frontend experience and more information, please use the [app](http://polis.basis.ai/).


Installation
------------

**Basic Setup:**

```sh
git clone [email protected]:BasisResearch/cities.git
cd cities
git checkout main
python -m venv venv
source venv/bin/activate
pip install .   # or `pip install -e .` for an editable install
cd tests && python -m pytest
```

The above will install the minimal version that's ported to [polis.basis.ai](http://polis.basis.ai).

**Dev Setup:**

To install the dev dependencies needed to run models, train models, and run all the tests, use the following command:

```sh
pip install -e .'[dev]'
```

For details of which packages are included in which setup, see `setup.py`.
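
The authoritative split lives in `setup.py`; as a rough, hypothetical sketch of what that declaration might look like (the package names here are assumptions, loosely based on the tools used elsewhere in this repository), it could resemble:

```python
# Hypothetical sketch only -- the actual package lists are defined in setup.py.
from setuptools import find_packages, setup

setup(
    name="cities",
    packages=find_packages(),
    # core dependencies (assumed): enough to load data and run the ported models
    install_requires=["numpy", "pandas", "torch", "pyro-ppl"],
    # dev extra (assumed): testing, formatting, and notebook tooling used by the Makefile targets
    extras_require={
        "dev": [
            "pytest",
            "nbval",
            "black",
            "isort",
            "autoflake",
            "flake8",
            "mypy",
            "nbqa",
        ]
    },
)
```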

**Contributing:**

Before submitting a pull request, please autoformat the code and ensure that the unit tests pass locally:

```sh
make lint # linting
make format # runs black and isort, including on notebooks in the docs/ folder
make tests # linting, unit and notebook tests
```


### The repository is structured as follows:
@@ -36,11 +69,24 @@ This is the backend repository for more advanced users. For a more pleasant frontend
└── tests
```

**WARNING:** during beta testing, the most recent version lives on the `staging-county-data` git branch, as do the most recent versions of the notebooks. Please switch to this branch before inspecting the notebooks.

If you're interested in downloading the data or exploring advanced features beyond the frontend, check out the `guides` folder in the `docs` directory. There, you'll find:
- `data_sources.ipynb` for information on data sources,
- `similarity-conceptual.ipynb` for a conceptual account of how similarity comparison works,
- `counterfactual-explained.ipynb` for a rough explanation of how our causal model works,
- `similarity_demo.ipynb` demonstrating the use of the `DataGrabber` class for easy data access, and of our `FipsQuery` class, which is the key tool in the similarity-focused part of the project,
- `causal_insights_demo.ipynb` for an overview of how the `CausalInsight` class can be used to explore the influence of a range of intervention variables, thanks to the causal inference tools we employed. [WIP]

Feel free to dive into these resources to gain deeper insights into the capabilities of the Polis project, or to reach out if you have any comments or suggestions.
## Interested? We'd love to hear from you.

[polis](http://polis.basis.ai/) is a research tool under very active development, and we are eager to hear feedback from users in the policymaking and public administration spaces to accelerate its benefit.

If you have feature requests, recommendations for new data sources, tips for how to resolve missing data issues, find bugs in the tool (they certainly exist!), or anything else, please do not hesitate to contact us at [email protected].

To stay up to date on our latest features, you can subscribe to our [mailing list](https://dashboard.mailerlite.com/forms/102625/110535550672308121/share). In the near-term, we will send out a notice about our upcoming batch of improvements (including performance speedups, support for mobile, and more comprehensive tutorials), as well as an interest form for users who would like to work closely with us on case studies to make the tool most useful in their work.

Lastly, we emphasize that this website is still in beta testing, and hence all predictions should be taken with a grain of salt.

Acknowledgments: polis was built by Basis, a non-profit AI research organization dedicated to creating automated reasoning technology that helps solve society's most intractable problems. To learn more about us, visit https://basis.ai.

57 changes: 18 additions & 39 deletions cities/modeling/modeling_utils.py
@@ -68,8 +68,8 @@ def prep_wide_data_for_inference(
4. Loads the required transformed features.
5. Merges fixed covariates into a joint dataframe based on a common ID column.
6. Ensures that the GeoFIPS (geographical identifier) is consistent across datasets.
7. Extracts common years for which both intervention and outcome data are available.
8. Shifts the outcome variable forward by the specified number of time steps.
7. Shifts the outcome variable forward by `forward_shift` time steps (see the sketch after this list).
8. Extracts common years for which both intervention and outcome data are available.
9. Prepares tensors for input features (x), interventions (t), and outcomes (y).
10. Creates indices for states and units, preparing them as tensors.
11. Validates the shapes of the tensors.
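
As a minimal, self-contained sketch of the shift-and-align logic in steps 7-9 (toy wide-format data; column names are assumed to be year strings, as in the real dataframes):

```python
import pandas as pd

# Toy wide-format data: one row per unit, one column per year.
intervention = pd.DataFrame({"2010": [0.1, 0.2], "2011": [0.3, 0.4], "2012": [0.5, 0.6]})
outcome = pd.DataFrame({"2010": [1.0, 2.0], "2011": [3.0, 4.0], "2012": [5.0, 6.0]})

forward_shift = 1

# Relabel outcome columns so that column "Y" holds the outcome observed
# forward_shift years after year Y (with forward_shift=1, "2010" holds the 2011 outcomes).
outcome_shifted = outcome.rename(lambda c: str(int(c) - forward_shift), axis=1)

# Keep only the years present in both the intervention and the shifted outcome.
years_available = [y for y in intervention.columns if y in outcome_shifted.columns]
intervention = intervention[years_available]
outcome_shifted = outcome_shifted[years_available]

assert intervention.shape == outcome_shifted.shape
print(years_available)  # ['2010', '2011'] -- each intervention year is paired with the next year's outcome
```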
@@ -124,50 +124,25 @@ def prep_wide_data_for_inference(

assert f_covariates_joint["GeoFIPS"].equals(intervention["GeoFIPS"])

# extract data for which intervention and outcome overlap
year_min = max(
intervention.columns[2:].astype(int).min(),
outcome.columns[2:].astype(int).min(),
)

year_max = min(
intervention.columns[2:].astype(int).max(),
outcome.columns[2:].astype(int).max(),
)

assert all(intervention["GeoFIPS"] == outcome["GeoFIPS"])

# This is for the downstream variable
outcome_years_to_keep = [
year
for year in outcome.columns[2:]
if year_min <= int(year) <= year_max + forward_shift
if str(int(year) - forward_shift) in intervention.columns[2:]
]

outcome_years_to_keep = [
year for year in outcome_years_to_keep if year in intervention.columns[2:]
]

outcome = outcome[outcome_years_to_keep]

# shift outcome `forward_shift` steps ahead
# for the prediction task
outcome_shifted = outcome.copy()

for i in range(len(outcome_years_to_keep) - forward_shift):
outcome_shifted.iloc[:, i] = outcome_shifted.iloc[:, i + forward_shift]

years_to_drop = [
f"{year}" for year in range(year_max - forward_shift + 1, year_max + 1)
]
outcome_shifted.drop(columns=years_to_drop, inplace=True)

# extract data for which intervention and outcome overlap
outcome.drop(columns=["GeoFIPS", "GeoName"], inplace=True)
intervention.drop(columns=["GeoFIPS", "GeoName"], inplace=True)
intervention = intervention[outcome_shifted.columns]
outcome_shifted = outcome.rename(lambda x: str(int(x) - forward_shift), axis=1)
years_available = [
year for year in intervention.columns if year in outcome_shifted.columns
]
intervention = intervention[years_available]
outcome_shifted = outcome_shifted[years_available]

assert intervention.shape == outcome_shifted.shape

years_available = outcome_shifted.columns.astype(int).values

unit_index = pd.factorize(f_covariates_joint["GeoFIPS"].values)[0]
state_index = pd.factorize(f_covariates_joint["GeoFIPS"].values // 1000)[0]
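
The unit and state indices above exploit the structure of five-digit county FIPS codes: the first two digits identify the state, so integer division by 1000 maps each county to its state. A small sketch with toy FIPS values (assumed data, not from the repository):

```python
import numpy as np
import pandas as pd

geofips = np.array([27053, 27123, 17031, 17043])  # two Minnesota (27xxx) and two Illinois (17xxx) counties

unit_index = pd.factorize(geofips)[0]            # array([0, 1, 2, 3]) -- one index per county
state_index = pd.factorize(geofips // 1000)[0]   # array([0, 0, 1, 1]) -- one index per state
```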

@@ -197,12 +172,13 @@ def prep_wide_data_for_inference(

model_args = (N_t, N_cov, N_s, N_u, state_index, unit_index)

int_year_available = [int(year) for year in years_available]
return {
"model_args": model_args,
"x": x,
"t": t,
"y": y,
"years_available": years_available,
"years_available": int_year_available,
"outcome_years": outcome_years_to_keep,
"covariates_df": f_covariates_joint,
}
@@ -222,7 +198,10 @@ def train_interactions_model(
guide = AutoNormal(conditioned_model)

svi = SVI(
model=conditioned_model, guide=guide, optim=Adam({"lr": lr}), loss=Trace_ELBO()
model=conditioned_model,
guide=guide,
optim=Adam({"lr": lr}), # type: ignore
loss=Trace_ELBO(),
)

losses = []
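
For context, the SVI setup above follows the standard Pyro training pattern; a minimal, self-contained sketch with a toy model (not the repository's actual interactions model) looks like this:

```python
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.infer.autoguide import AutoNormal
from pyro.optim import Adam


def model(y):
    # Toy model: a scalar location with a normal prior and a normal likelihood.
    loc = pyro.sample("loc", dist.Normal(0.0, 10.0))
    with pyro.plate("data", y.shape[0]):
        pyro.sample("obs", dist.Normal(loc, 1.0), obs=y)


y = torch.randn(100) + 3.0
guide = AutoNormal(model)
svi = SVI(model=model, guide=guide, optim=Adam({"lr": 0.01}), loss=Trace_ELBO())

losses = []
for step in range(1000):
    # Each call runs one gradient step on the guide parameters and returns the ELBO loss.
    losses.append(svi.step(y))
```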
6 changes: 3 additions & 3 deletions cities/modeling/tau_caching_pipeline.py
@@ -42,8 +42,8 @@
num_files = len(files)

logging.info(
f"{(num_files-2)} sample dictionaries already exist. "
f"Starting to obtain {N_combinations_samples - (num_files -2)}"
f"{(num_files - 2)} sample dictionaries already exist. "
f"Starting to obtain {N_combinations_samples - (num_files - 2)}"
f" out of {N_combinations_samples} sample dictionaries needed."
)
remaining = N_combinations_samples - (num_files - 2)
@@ -84,5 +84,5 @@

logging.info(
f"All samples are now available."
f"Sampling took {session_ends - session_start:.2f} seconds, or {(session_ends - session_start)/60:.2f} minutes."
f"Sampling took {session_ends - session_start:.2f} seconds, or {(session_ends - session_start) / 60:.2f} minutes."
)
4 changes: 2 additions & 2 deletions cities/modeling/training_pipeline.py
@@ -42,8 +42,8 @@


logging.info(
f"{(num_files-2)/2} guides already exist. "
f"Starting to train {N_combinations - (num_files -2)/2} out of {N_combinations} guides needed."
f"{(num_files - 2) / 2} guides already exist. "
f"Starting to train {N_combinations - (num_files - 2) / 2} out of {N_combinations} guides needed."
)

remaining = N_combinations - (num_files - 2) / 2
10 changes: 0 additions & 10 deletions scripts/clean.sh
@@ -1,23 +1,13 @@
#!/bin/bash
set -euxo pipefail

<<<<<<< HEAD
isort --profile="black" cities/ tests/
black cities/ tests/
autoflake --remove-all-unused-imports --in-place --recursive ./cities ./tests

nbqa --nbqa-shell autoflake --remove-all-unused-imports --recursive --in-place docs/guides/ docs/testing_notebooks
nbqa --nbqa-shell isort --profile="black" docs/guides/ docs/testing_notebooks
black docs/guides/ docs/testing_notebooks
=======
# isort suspended till the CI-vs-local issue is resolved
# isort cities/ tests/

black cities/ tests/
autoflake --remove-all-unused-imports --in-place --recursive ./cities ./tests

nbqa autoflake --remove-all-unused-imports --recursive --in-place docs/guides/
# nbqa isort docs/guides/
nbqa black docs/guides/
>>>>>>> e3a66ed4029913c0706d064001cdfede0cc6f413

11 changes: 1 addition & 10 deletions scripts/lint.sh
@@ -2,21 +2,12 @@
set -euxo pipefail

mypy --ignore-missing-imports cities/
<<<<<<< HEAD

isort --profile="black" --check --diff cities/ tests/
=======
#isort --check --diff cities/ tests/
>>>>>>> e3a66ed4029913c0706d064001cdfede0cc6f413
black --check cities/ tests/
flake8 cities/ tests/ --ignore=E203,W503 --max-line-length=127


<<<<<<< HEAD
nbqa --nbqa-shell autoflake -v --recursive --check docs/guides/
nbqa --nbqa-shell isort --profile="black" --check docs/guides/
black --check docs/guides/
=======
nbqa autoflake -v --recursive --check docs/guides/
#nbqa isort --check docs/guides/
nbqa black --check docs/guides/
>>>>>>> e3a66ed4029913c0706d064001cdfede0cc6f413
2 changes: 1 addition & 1 deletion scripts/test_notebooks.sh
@@ -1,5 +1,5 @@
#!/bin/bash

INCLUDED_NOTEBOOKS="docs/guides/ docs/testing_notebooks/"
INCLUDED_NOTEBOOKS="docs/guides/ " # docs/testing_notebooks/" will revert when the pyro-ppl 1.9 bug is fixed

CI=1 pytest -v --nbval-lax --dist loadscope -n auto $INCLUDED_NOTEBOOKS
