BasisResearch · rfl-urbaniak · Aug 14, 2024 · Jul 25, 2024 · Jul 26, 2024 · Jul 26, 2024
diff --git a/cities/queries/fips_query.py b/cities/queries/fips_query.py
@@ -3,6 +3,7 @@
 import plotly.graph_objects as go
 
 from cities.utils.data_grabber import (
+    CTDataGrabberCSV,
     DataGrabber,
     MSADataGrabber,
     check_if_tensed,
@@ -795,3 +796,120 @@ def __init__(
                 self.outcome_with_percentiles["percentile"] * 100, 2
             )
             self.outcome_percentile_range = outcome_percentile_range
+
+
+class CTFipsQuery(FipsQuery):  # census tract FipsQuery
+    """FipsQuery variant that reads census-tract data via ``CTDataGrabberCSV``."""
+
+    def __init__(
+        self,
+        fips,
+        outcome_var=None,
+        feature_groups_with_weights=None,
+        lag=0,
+        top=5,
+        time_decay=1.08,
+        outcome_comparison_period=None,
+        outcome_percentile_range=None,
+        ct_time_period: str = "pre_2020",  # "pre_2020" or "post_2020"
+    ):
+        """Validate the arguments, then load tract data and rank the outcome.
+
+        ``feature_groups_with_weights`` maps feature names to integer weights
+        in [-4, 4]; if omitted, ``outcome_var`` alone is used with weight 4.
+        The caller's dict is never modified. Invalid arguments (checked before
+        any data is loaded) or an unknown ``fips`` raise ``AssertionError``.
+        """
+        if feature_groups_with_weights is None:
+            feature_groups_with_weights = {outcome_var: 4} if outcome_var else {}
+        # Copy first: popping from the caller's dict would drop the outcome
+        # entry and make a second query built from the same dict fail.
+        feature_groups_with_weights = dict(feature_groups_with_weights)
+        if outcome_var:
+            # Re-insert the outcome first; KeyError if the given weights omit it.
+            outcome_var_dict = {
+                outcome_var: feature_groups_with_weights.pop(outcome_var)
+            }
+            outcome_var_dict.update(feature_groups_with_weights)
+            feature_groups_with_weights = outcome_var_dict
+
+        assert not (
+            lag > 0 and outcome_var is None
+        ), "Lag will be idle with no outcome variable"
+        assert not (
+            lag > 0 and outcome_comparison_period is not None
+        ), "outcome_comparison_period is only used when lag = 0"
+        assert not (
+            outcome_var is None and outcome_comparison_period is not None
+        ), "outcome_comparison_period requires an outcome variable"
+        assert not (
+            outcome_var is None and outcome_percentile_range is not None
+        ), "outcome_percentile_range requires an outcome variable"
+        assert (
+            lag >= 0 and lag < 6 and isinstance(lag, int)
+        ), "lag must be  an iteger between 0 and 5"
+        assert (
+            top > 0
+            and isinstance(top, int)
+            and top
+            < 100  # TODO Make sure the number makes sense once we add all datasets we need
+        ), "top must be a positive integer smaller than the number of locations in the dataset"
+
+        feature_groups = list(feature_groups_with_weights.keys())
+        assert feature_groups, "You need to specify at least one feature group"
+        assert all(
+            isinstance(value, int) and -4 <= value <= 4
+            for value in feature_groups_with_weights.values()
+        ), "Feature weights need to be integers between -4 and 4"
+
+        self.all_available_features = list_available_features(
+            level="census_tract", ct_time_period=ct_time_period
+        )
+        self.feature_groups_with_weights = feature_groups_with_weights
+        self.feature_groups = feature_groups
+        self.data = CTDataGrabberCSV(ct_time_period=ct_time_period)
+        self.repo_root = self.data.repo_root
+        self.fips = fips
+        self.lag = lag
+        self.top = top
+        # population replaces gdp as the default variable; gdp_var aliases it
+        # so the inherited find_euclidean_kins keeps working.
+        self.population_var = "population"
+        self.gdp_var = self.population_var
+        # it's fine if they're None (by default)
+        self.outcome_var = outcome_var
+        self.outcome_comparison_period = outcome_comparison_period
+        self.outcome_percentile_range = outcome_percentile_range
+        self.time_decay = time_decay
+
+        if self.population_var not in self.feature_groups:
+            self.all_features = [self.population_var] + feature_groups
+        else:
+            self.all_features = feature_groups
+
+        self.data.get_features_std_wide(self.all_features)
+        self.data.get_features_wide(self.all_features)
+
+        assert (
+            fips in self.data.std_wide[self.population_var]["GeoFIPS"].values
+        ), "FIPS not found in the data set."
+        self.name = self.data.std_wide[self.population_var]["GeoName"][
+            self.data.std_wide[self.population_var]["GeoFIPS"] == self.fips
+        ].values[0]
+
+        if outcome_var:
+            assert check_if_tensed(
+                self.data.std_wide[self.outcome_var]
+            ), "Outcome needs to be a time series."
+
+            self.outcome_with_percentiles = self.data.std_wide[self.outcome_var].copy()
+            most_recent_outcome = self.data.wide[self.outcome_var].iloc[:, -1].values
+            # Count of values strictly below each entry via sort + binary search;
+            # a pairwise n x n comparison needs O(n^2) memory at tract scale.
+            n_below = np.searchsorted(
+                np.sort(most_recent_outcome), most_recent_outcome, side="left"
+            )
+            n_below[most_recent_outcome != most_recent_outcome] = 0  # NaN rows
+            self.outcome_with_percentiles["percentile"] = np.round(
+                n_below / most_recent_outcome.shape[0] * 100, 2
+            )
diff --git a/docs/experimental_notebooks/CTfipsQuery_performance.ipynb b/docs/experimental_notebooks/CTfipsQuery_performance.ipynb
@@ -0,0 +1,248 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from cities.queries.fips_query import CTFipsQuery\n",
+    "from cities.utils.data_grabber import CTDataGrabberCSV, find_repo_root, list_available_features\n",
+    "import time\n",
+    "\n",
+    "root = find_repo_root()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Defining the Functions Used for the Comparison\n",
+    "\n",
+    "The goal is to demonstrate that the size of the census-tract-level variables is responsible for the slow execution of `CTFipsQuery`. To do so, we time constructing a `CTFipsQuery` and calling `find_euclidean_kins` on the full `population` census-tract-level variable, and then on the first half and the first quarter of its rows.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def save_first_half_of_population_data(data, feature_type, fraction, file_path, pre_or_post='pre_2020'):\n",
+    "\n",
+    "    if fraction not in [0.25, 0.5]:\n",
+    "        raise ValueError(\"Fraction must be 0.25 or 0.5.\")\n",
+    "\n",
+    "    data = CTDataGrabberCSV(ct_time_period = pre_or_post)\n",
+    "\n",
+    "    if feature_type == 'std_wide':\n",
+    "        data.get_features_std_wide(['population'])\n",
+    "        pop = data.std_wide['population']\n",
+    "    elif feature_type == 'wide':\n",
+    "        data.get_features_wide(['population'])\n",
+    "        pop = data.wide['population']\n",
+    "    else:\n",
+    "        raise ValueError(\"Invalid feature_type. Must be 'std_wide' or 'wide'.\")\n",
+    "    \n",
+    "    num_rows = len(pop)\n",
+    "    num_to_save = int(num_rows * fraction)\n",
+    "    \n",
+    "    selected_data = pop.iloc[:num_to_save]\n",
+    "\n",
+    "    os.makedirs(os.path.dirname(file_path), exist_ok=True)\n",
+    "\n",
+    "    selected_data.to_csv(file_path, index=False)\n",
+    "\n",
+    "\n",
+    "\n",
+    "def delete_file(file_path):\n",
+    "    if os.path.isfile(file_path):\n",
+    "        os.remove(file_path)\n",
+    "        print(f\"Deleted: {file_path}\")\n",
+    "    else:\n",
+    "        print(f\"File not found: {file_path}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def measure_time(command):\n",
+    "\n",
+    "    start_time = time.time()\n",
+    "    command()\n",
+    "    end_time = time.time()\n",
+    "    return end_time - start_time\n",
+    "\n",
+    "def run_query(fips, population_type, ct_time_period):\n",
+    "\n",
+    "    f = CTFipsQuery(\n",
+    "        fips,\n",
+    "        population_type,\n",
+    "        ct_time_period= ct_time_period,\n",
+    "    )\n",
+    "    f.find_euclidean_kins()\n",
+    "\n",
+    "def main(fips, ct_time = \"pre_2020\"):\n",
+    "\n",
+    "    time_population = measure_time(lambda: run_query(fips, \"population\", ct_time))\n",
+    "    time_population_half = measure_time(lambda: run_query(fips, \"populationHalf\", ct_time))\n",
+    "    time_population_quarter = measure_time(lambda: run_query(fips, \"populationQuarter\", ct_time))\n",
+    "\n",
+    "    print(f\"Time for 'population': {time_population:.4f} seconds\")\n",
+    "    print(f\"Time for 'populationHalf': {time_population_half:.4f} seconds\")\n",
+    "    print(f\"Time for 'populationQuarter': {time_population_quarter:.4f} seconds\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test for the `pre_2020` population variable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Time for 'population': 47.8251 seconds\n",
+      "Time for 'populationHalf': 2.6128 seconds\n",
+      "Time for 'populationQuarter': 1.0029 seconds\n",
+      "Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_pre2020_CT_std_wide.csv\n",
+      "Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_pre2020_CT_wide.csv\n",
+      "Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_pre2020_CT_std_wide.csv\n",
+      "Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_pre2020_CT_wide.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# halves\n",
+    "\n",
+    "std_wide_path = f\"{root}/data/Census_tract_level/populationHalf_pre2020_CT_std_wide.csv\"\n",
+    "wide_path = f\"{root}/data/Census_tract_level/populationHalf_pre2020_CT_wide.csv\"\n",
+    "\n",
+    "save_first_half_of_population_data(data, 'std_wide', 0.5, std_wide_path, pre_or_post='pre_2020')\n",
+    "save_first_half_of_population_data(data, 'wide', 0.5, wide_path, pre_or_post='pre_2020')\n",
+    "\n",
+    "# quarters\n",
+    "\n",
+    "std_wide_pathQ = f\"{root}/data/Census_tract_level/populationQuarter_pre2020_CT_std_wide.csv\"\n",
+    "wide_pathQ = f\"{root}/data/Census_tract_level/populationQuarter_pre2020_CT_wide.csv\"\n",
+    "\n",
+    "save_first_half_of_population_data(data, 'std_wide', 0.25, std_wide_pathQ, pre_or_post='pre_2020')\n",
+    "save_first_half_of_population_data(data, 'wide', 0.25, wide_pathQ, pre_or_post='pre_2020')\n",
+    "\n",
+    "# the test\n",
+    "\n",
+    "fips_number = 1003010705\n",
+    "main(fips_number, \"pre_2020\")\n",
+    "\n",
+    "# deleting the files created for the comparison\n",
+    "file_paths = [std_wide_path, wide_path, std_wide_pathQ, wide_pathQ]\n",
+    "\n",
+    "for file_path in file_paths:\n",
+    "    delete_file(file_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test for the `post_2020` population variable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Time for 'population': 81.2661 seconds\n",
+      "Time for 'populationHalf': 3.4648 seconds\n",
+      "Time for 'populationQuarter': 1.0445 seconds\n",
+      "Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_post2020_CT_std_wide.csv\n",
+      "Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_post2020_CT_wide.csv\n",
+      "Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_post2020_CT_std_wide.csv\n",
+      "Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_post2020_CT_wide.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# halves\n",
+    "\n",
+    "std_wide_path = f\"{root}/data/Census_tract_level/populationHalf_post2020_CT_std_wide.csv\"\n",
+    "wide_path = f\"{root}/data/Census_tract_level/populationHalf_post2020_CT_wide.csv\"\n",
+    "\n",
+    "save_first_half_of_population_data(data, 'std_wide', 0.5, std_wide_path, pre_or_post='post_2020')\n",
+    "save_first_half_of_population_data(data, 'wide', 0.5, wide_path, pre_or_post='post_2020')\n",
+    "\n",
+    "# quarters\n",
+    "\n",
+    "std_wide_pathQ = f\"{root}/data/Census_tract_level/populationQuarter_post2020_CT_std_wide.csv\"\n",
+    "wide_pathQ = f\"{root}/data/Census_tract_level/populationQuarter_post2020_CT_wide.csv\"\n",
+    "\n",
+    "save_first_half_of_population_data(data, 'std_wide', 0.25, std_wide_pathQ, pre_or_post='post_2020')\n",
+    "save_first_half_of_population_data(data, 'wide', 0.25, wide_pathQ, pre_or_post='post_2020')\n",
+    "\n",
+    "# the test\n",
+    "\n",
+    "fips_number = 12117021203\n",
+    "main(fips_number, \"post_2020\")\n",
+    "\n",
+    "# deleting the files created for the comparison\n",
+    "file_paths = [std_wide_path, wide_path, std_wide_pathQ, wide_pathQ]\n",
+    "\n",
+    "for file_path in file_paths:\n",
+    "    delete_file(file_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}