Skip to content

Commit

Permalink
Merge pull request CodeForPhilly#900 from CodeForPhilly/lebovits/issu…
Browse files Browse the repository at this point in the history
…851-trim-fields

Lebovits/issu851 trim fields
  • Loading branch information
nlebovits authored Sep 19, 2024
2 parents 0320257 + 3b57542 commit 3478c56
Show file tree
Hide file tree
Showing 12 changed files with 199 additions and 109 deletions.
52 changes: 33 additions & 19 deletions data/src/classes/featurelayer.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,49 @@
import logging as log
import os
import subprocess
import traceback
import sqlalchemy as sa
import logging as log

import geopandas as gpd
import pandas as pd
import requests
import sqlalchemy as sa
from config.config import (
FORCE_RELOAD,
USE_CRS,
log_level,
min_tiles_file_size_in_bytes,
write_production_tiles_file,
)
from config.psql import conn, local_engine
from esridump.dumper import EsriDumper
from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from shapely import Point, wkb

from config.config import FORCE_RELOAD, USE_CRS, write_production_tiles_file, min_tiles_file_size_in_bytes, log_level

log.basicConfig(level=log_level)


def google_cloud_bucket() -> Bucket:
    """Return a handle to the Google Cloud Storage bucket used for uploads.

    The bucket name is read from the GOOGLE_CLOUD_BUCKET_NAME environment
    variable, falling back to "cleanandgreenphl". The service-account key is
    expected at /app/service-account-key.json and is exported via
    GOOGLE_APPLICATION_CREDENTIALS so the storage client can authenticate.

    Raises:
        FileNotFoundError: if the service-account key file is missing.

    Returns:
        Bucket: the gcp bucket
    """
    key_path = os.path.expanduser("/app/service-account-key.json")
    if not os.path.exists(key_path):
        raise FileNotFoundError(f"Credentials file not found at {key_path}")

    # Make the credentials discoverable by the google-cloud client library.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path

    name = os.getenv("GOOGLE_CLOUD_BUCKET_NAME", "cleanandgreenphl")
    client = storage.Client(project="clean-and-green-philly")
    return client.bucket(name)


bucket = google_cloud_bucket()


class FeatureLayer:
"""
FeatureLayer is a class to represent a GIS dataset. It can be initialized with a URL to an Esri Feature Service, a SQL query to Carto, or a GeoDataFrame.
Expand All @@ -50,7 +59,7 @@ def __init__(
force_reload=FORCE_RELOAD,
from_xy=False,
use_wkb_geom_field=None,
cols: list[str] = None
cols: list[str] = None,
):
self.name = name
self.esri_rest_urls = (
Expand Down Expand Up @@ -230,7 +239,7 @@ def spatial_join(self, other_layer, how="left", predicate="intersects"):
self.gdf.drop_duplicates(inplace=True)

# Coerce opa_id to integer and drop rows where opa_id is null or non-numeric
self.gdf["opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
self.gdf.loc[:, "opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
self.gdf = self.gdf.dropna(subset=["opa_id"])

def opa_join(self, other_df, opa_column):
Expand All @@ -239,11 +248,13 @@ def opa_join(self, other_df, opa_column):
"""

# Coerce opa_column to integer and drop rows where opa_column is null or non-numeric
other_df[opa_column] = pd.to_numeric(other_df[opa_column], errors="coerce")
other_df.loc[:, opa_column] = pd.to_numeric(
other_df[opa_column], errors="coerce"
)
other_df = other_df.dropna(subset=[opa_column])

# Coerce opa_id to integer and drop rows where opa_id is null or non-numeric
self.gdf["opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
self.gdf.loc[:, "opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
self.gdf = self.gdf.dropna(subset=["opa_id"])

# Perform the merge
Expand All @@ -253,13 +264,14 @@ def opa_join(self, other_df, opa_column):

# Check if 'geometry' column exists in both dataframes and clean up
if "geometry_x" in joined.columns and "geometry_y" in joined.columns:
joined = joined.drop(columns=["geometry_y"])
joined = joined.drop(columns=["geometry_y"]).copy() # Ensure a full copy
joined = joined.rename(columns={"geometry_x": "geometry"})

if opa_column != "opa_id":
joined = joined.drop(columns=[opa_column])

self.gdf = joined
# Assign the joined DataFrame to self.gdf as a full copy
self.gdf = joined.copy()
self.rebuild_gdf()

def rebuild_gdf(self):
Expand All @@ -270,7 +282,7 @@ def create_centroid_gdf(self):
Convert the geometry of the GeoDataFrame to centroids.
"""
self.centroid_gdf = self.gdf.copy()
self.centroid_gdf["geometry"] = self.gdf["geometry"].centroid
self.centroid_gdf.loc[:, "geometry"] = self.gdf["geometry"].centroid

def build_and_publish_pmtiles(self, tileset_id):
zoom_threshold = 13
Expand Down Expand Up @@ -336,17 +348,19 @@ def build_and_publish_pmtiles(self, tileset_id):
subprocess.run(command)

write_files = [f"{tileset_id}_staging.pmtiles"]

if write_production_tiles_file:
write_files.append(f"{tileset_id}.pmtiles")

# check whether the temp saved tiles files is big enough.
# If not then it might be corrupted so log error and don't upload to gcp.
file_size = os.stat(temp_merged_pmtiles).st_size
if file_size < min_tiles_file_size_in_bytes:
raise ValueError(f"{temp_merged_pmtiles} is {file_size} bytes in size but should be at least {min_tiles_file_size_in_bytes}. Therefore, we are not uploading any files to the GCP bucket. The file may be corrupt or incomplete.")

raise ValueError(
f"{temp_merged_pmtiles} is {file_size} bytes in size but should be at least {min_tiles_file_size_in_bytes}. Therefore, we are not uploading any files to the GCP bucket. The file may be corrupt or incomplete."
)

# Upload to Google Cloud Storage
for file in write_files:
blob = bucket.blob(file)
blob.upload_from_filename(temp_merged_pmtiles)
blob.upload_from_filename(temp_merged_pmtiles)
50 changes: 35 additions & 15 deletions data/src/data_utils/city_owned_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,46 @@ def city_owned_properties(primary_featurelayer):
"sideyardeligible": "side_yard_eligible",
}
primary_featurelayer.gdf.rename(columns=rename_columns, inplace=True)

primary_featurelayer.gdf.loc[primary_featurelayer.gdf['owner_1'].isin(["PHILADELPHIA HOUSING AUTH", "PHILADELPHIA LAND BANK", "REDEVELOPMENT AUTHORITY", "PHILA REDEVELOPMENT AUTH"]), 'city_owner_agency'] = primary_featurelayer.gdf["owner_1"].replace({
"PHILADELPHIA HOUSING AUTH": "PHA",
"PHILADELPHIA LAND BANK": "Land Bank (PHDC)",
"REDEVELOPMENT AUTHORITY": "PRA",
"PHILA REDEVELOPMENT AUTH": "PRA"
})

primary_featurelayer.gdf.loc[
(primary_featurelayer.gdf['owner_1'] == "CITY OF PHILA") &
(primary_featurelayer.gdf['owner_2'].str.contains("PUBLIC PROP|PUBLC PROP", na=False)),
'city_owner_agency'
primary_featurelayer.gdf["owner_1"].isin(
[
"PHILADELPHIA HOUSING AUTH",
"PHILADELPHIA LAND BANK",
"REDEVELOPMENT AUTHORITY",
"PHILA REDEVELOPMENT AUTH",
]
),
"city_owner_agency",
] = primary_featurelayer.gdf["owner_1"].replace(
{
"PHILADELPHIA HOUSING AUTH": "PHA",
"PHILADELPHIA LAND BANK": "Land Bank (PHDC)",
"REDEVELOPMENT AUTHORITY": "PRA",
"PHILA REDEVELOPMENT AUTH": "PRA",
}
)

primary_featurelayer.gdf.loc[
(primary_featurelayer.gdf["owner_1"] == "CITY OF PHILA")
& (
primary_featurelayer.gdf["owner_2"].str.contains(
"PUBLIC PROP|PUBLC PROP", na=False
)
),
"city_owner_agency",
] = "DPP"

primary_featurelayer.gdf.loc[
primary_featurelayer.gdf['owner_1'].isin(["CITY OF PHILADELPHIA", "CITY OF PHILA"]) &
primary_featurelayer.gdf['owner_2'].isna(),
'city_owner_agency'
primary_featurelayer.gdf["owner_1"].isin(
["CITY OF PHILADELPHIA", "CITY OF PHILA"]
)
& primary_featurelayer.gdf["owner_2"].isna(),
"city_owner_agency",
] = "City of Philadelphia"

primary_featurelayer.gdf["side_yard_eligible"].fillna("No", inplace=True)
primary_featurelayer.gdf.loc[:, "side_yard_eligible"] = primary_featurelayer.gdf[
"side_yard_eligible"
].fillna("No")

return primary_featurelayer
4 changes: 3 additions & 1 deletion data/src/data_utils/deliquencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ def deliquencies(primary_featurelayer):
"opa_number",
)

primary_featurelayer.gdf["sheriff_sale"].fillna("N", inplace=True)
primary_featurelayer.gdf.loc[:, "sheriff_sale"] = primary_featurelayer.gdf[
"sheriff_sale"
].fillna("N")

return primary_featurelayer
43 changes: 29 additions & 14 deletions data/src/data_utils/drug_crimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
import rasterio
from awkde.awkde import GaussianKDE
from classes.featurelayer import FeatureLayer
from config.config import USE_CRS
from constants.services import DRUGCRIME_SQL_QUERY
from rasterio.transform import Affine

from config.config import USE_CRS


def drug_crimes(primary_featurelayer):
# Initialize gun_crimes object
Expand All @@ -30,9 +29,10 @@ def drug_crimes(primary_featurelayer):

# Generate grid for plotting
grid_length = 2500

x_grid, y_grid = np.linspace(x.min(), x.max(), grid_length), np.linspace(
y.min(), y.max(), grid_length

x_grid, y_grid = (
np.linspace(x.min(), x.max(), grid_length),
np.linspace(y.min(), y.max(), grid_length),
)
xx, yy = np.meshgrid(x_grid, y_grid)
grid_points = np.array([xx.ravel(), yy.ravel()]).T
Expand Down Expand Up @@ -90,21 +90,36 @@ def drug_crimes(primary_featurelayer):

primary_featurelayer.gdf["drugcrime_density"] = sampled_values

percentile_breaks = list(range(101)) # [0, 1, 2, ..., 100]

drugcrime_classifier = mapclassify.Percentiles(
primary_featurelayer.gdf["drugcrime_density"], pct=[50, 75, 90, 95, 99, 100]
primary_featurelayer.gdf["drugcrime_density"], pct=percentile_breaks
)
primary_featurelayer.gdf["drugcrime_density"] = primary_featurelayer.gdf[

primary_featurelayer.gdf["drugcrime_density_percentile"] = primary_featurelayer.gdf[
"drugcrime_density"
].apply(drugcrime_classifier)
primary_featurelayer.gdf["drugcrime_density"] = primary_featurelayer.gdf[
"drugcrime_density"

def label_percentile(value):
    """Return a human-readable ordinal label for a percentile value.

    Uses correct English ordinal suffixes for every value, e.g.
    1 -> "1st Percentile", 21 -> "21st Percentile", 12 -> "12th Percentile".
    (The original only special-cased 1, 2, and 3, so 21/22/23/31/... were
    mislabeled as "21th Percentile", etc.)
    """
    # 11, 12, 13 (and 111, 112, ...) always take "th".
    if value % 100 in (11, 12, 13):
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(value % 10, "th")
    return f"{value}{suffix} Percentile"

primary_featurelayer.gdf["drugcrime_density_label"] = primary_featurelayer.gdf[
"drugcrime_density_percentile"
].apply(label_percentile)

primary_featurelayer.gdf["drugcrime_density_percentile"] = primary_featurelayer.gdf[
"drugcrime_density_percentile"
].astype(float)

primary_featurelayer.gdf["drugcrime_density"] = primary_featurelayer.gdf[
"drugcrime_density"
].replace(
[0, 1, 2, 3, 4, 5],
["Bottom 50%", "Top 50%", "Top 25%", "Top 10%", "Top 5%", "Top 1%"],
primary_featurelayer.gdf = primary_featurelayer.gdf.drop(
columns=["drugcrime_density"]
)

return primary_featurelayer
43 changes: 29 additions & 14 deletions data/src/data_utils/gun_crimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
import rasterio
from awkde.awkde import GaussianKDE
from classes.featurelayer import FeatureLayer
from config.config import USE_CRS
from constants.services import GUNCRIME_SQL_QUERY
from rasterio.transform import Affine

from config.config import USE_CRS


def gun_crimes(primary_featurelayer):
# Initialize gun_crimes object
Expand All @@ -28,9 +27,10 @@ def gun_crimes(primary_featurelayer):

# Generate grid for plotting
grid_length = 2500

x_grid, y_grid = np.linspace(x.min(), x.max(), grid_length), np.linspace(
y.min(), y.max(), grid_length

x_grid, y_grid = (
np.linspace(x.min(), x.max(), grid_length),
np.linspace(y.min(), y.max(), grid_length),
)
xx, yy = np.meshgrid(x_grid, y_grid)
grid_points = np.array([xx.ravel(), yy.ravel()]).T
Expand Down Expand Up @@ -88,21 +88,36 @@ def gun_crimes(primary_featurelayer):

primary_featurelayer.gdf["guncrime_density"] = sampled_values

percentile_breaks = list(range(101)) # [0, 1, 2, ..., 100]

guncrime_classifier = mapclassify.Percentiles(
primary_featurelayer.gdf["guncrime_density"], pct=[50, 75, 90, 95, 99, 100]
primary_featurelayer.gdf["guncrime_density"], pct=percentile_breaks
)
primary_featurelayer.gdf["guncrime_density"] = primary_featurelayer.gdf[

primary_featurelayer.gdf["guncrime_density_percentile"] = primary_featurelayer.gdf[
"guncrime_density"
].apply(guncrime_classifier)
primary_featurelayer.gdf["guncrime_density"] = primary_featurelayer.gdf[
"guncrime_density"

def label_percentile(value):
    """Return a human-readable ordinal label for a percentile value.

    Uses correct English ordinal suffixes for every value, e.g.
    1 -> "1st Percentile", 21 -> "21st Percentile", 12 -> "12th Percentile".
    (The original only special-cased 1, 2, and 3, so 21/22/23/31/... were
    mislabeled as "21th Percentile", etc.)
    """
    # 11, 12, 13 (and 111, 112, ...) always take "th".
    if value % 100 in (11, 12, 13):
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(value % 10, "th")
    return f"{value}{suffix} Percentile"

primary_featurelayer.gdf["guncrime_density_label"] = primary_featurelayer.gdf[
"guncrime_density_percentile"
].apply(label_percentile)

primary_featurelayer.gdf["guncrime_density_percentile"] = primary_featurelayer.gdf[
"guncrime_density_percentile"
].astype(float)

primary_featurelayer.gdf["guncrime_density"] = primary_featurelayer.gdf[
"guncrime_density"
].replace(
[0, 1, 2, 3, 4, 5],
["Bottom 50%", "Top 50%", "Top 25%", "Top 10%", "Top 5%", "Top 1%"],
primary_featurelayer.gdf = primary_featurelayer.gdf.drop(
columns=["guncrime_density"]
)

return primary_featurelayer
10 changes: 6 additions & 4 deletions data/src/data_utils/imm_dang_buildings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,19 @@ def imm_dang_buildings(primary_featurelayer):
cols=["opa_account_num"],
)

imm_dang_buildings.gdf["imm_dang_building"] = "Y"
imm_dang_buildings.gdf.loc[:, "imm_dang_building"] = "Y"

imm_dang_buildings.gdf.rename(
columns={"opa_account_num": "opa_number"}, inplace=True
imm_dang_buildings.gdf = imm_dang_buildings.gdf.rename(
columns={"opa_account_num": "opa_number"}
)

primary_featurelayer.opa_join(
imm_dang_buildings.gdf,
"opa_number",
)

primary_featurelayer.gdf["imm_dang_building"].fillna("N", inplace=True)
primary_featurelayer.gdf.loc[:, "imm_dang_building"] = primary_featurelayer.gdf[
"imm_dang_building"
].fillna("N")

return primary_featurelayer
Loading

0 comments on commit 3478c56

Please sign in to comment.