Commit
trim unneeded fields; fix issues with .iloc
nlebovits committed Sep 19, 2024
1 parent 4e4ae40 commit 3b57542
Showing 10 changed files with 141 additions and 81 deletions.
52 changes: 33 additions & 19 deletions data/src/classes/featurelayer.py
@@ -1,40 +1,49 @@
+import logging as log
 import os
 import subprocess
 import traceback
-import sqlalchemy as sa
-import logging as log
 
 import geopandas as gpd
 import pandas as pd
 import requests
+import sqlalchemy as sa
+from config.config import (
+    FORCE_RELOAD,
+    USE_CRS,
+    log_level,
+    min_tiles_file_size_in_bytes,
+    write_production_tiles_file,
+)
 from config.psql import conn, local_engine
 from esridump.dumper import EsriDumper
 from google.cloud import storage
 from google.cloud.storage.bucket import Bucket
 from shapely import Point, wkb
 
-from config.config import FORCE_RELOAD, USE_CRS, write_production_tiles_file, min_tiles_file_size_in_bytes, log_level
 
 log.basicConfig(level=log_level)


 def google_cloud_bucket() -> Bucket:
     """Build the google cloud bucket with name configured in your environ or default of cleanandgreenphl
     Returns:
         Bucket: the gcp bucket
     """
     credentials_path = os.path.expanduser("/app/service-account-key.json")
 
     if not os.path.exists(credentials_path):
         raise FileNotFoundError(f"Credentials file not found at {credentials_path}")
 
     os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
     bucket_name = os.getenv("GOOGLE_CLOUD_BUCKET_NAME", "cleanandgreenphl")
 
     storage_client = storage.Client(project="clean-and-green-philly")
     return storage_client.bucket(bucket_name)
 
 
 bucket = google_cloud_bucket()


 class FeatureLayer:
     """
     FeatureLayer is a class to represent a GIS dataset. It can be initialized with a URL to an Esri Feature Service, a SQL query to Carto, or a GeoDataFrame.
@@ -50,7 +59,7 @@ def __init__(
         force_reload=FORCE_RELOAD,
         from_xy=False,
         use_wkb_geom_field=None,
-        cols: list[str] = None
+        cols: list[str] = None,
     ):
         self.name = name
         self.esri_rest_urls = (
@@ -230,7 +239,7 @@ def spatial_join(self, other_layer, how="left", predicate="intersects"):
         self.gdf.drop_duplicates(inplace=True)
 
         # Coerce opa_id to integer and drop rows where opa_id is null or non-numeric
-        self.gdf["opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
+        self.gdf.loc[:, "opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
         self.gdf = self.gdf.dropna(subset=["opa_id"])
 
     def opa_join(self, other_df, opa_column):
@@ -239,11 +248,13 @@ def opa_join(self, other_df, opa_column):
         """
 
         # Coerce opa_column to integer and drop rows where opa_column is null or non-numeric
-        other_df[opa_column] = pd.to_numeric(other_df[opa_column], errors="coerce")
+        other_df.loc[:, opa_column] = pd.to_numeric(
+            other_df[opa_column], errors="coerce"
+        )
         other_df = other_df.dropna(subset=[opa_column])
 
         # Coerce opa_id to integer and drop rows where opa_id is null or non-numeric
-        self.gdf["opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
+        self.gdf.loc[:, "opa_id"] = pd.to_numeric(self.gdf["opa_id"], errors="coerce")
         self.gdf = self.gdf.dropna(subset=["opa_id"])
 
         # Perform the merge
@@ -253,13 +264,14 @@
 
         # Check if 'geometry' column exists in both dataframes and clean up
         if "geometry_x" in joined.columns and "geometry_y" in joined.columns:
-            joined = joined.drop(columns=["geometry_y"])
+            joined = joined.drop(columns=["geometry_y"]).copy()  # Ensure a full copy
             joined = joined.rename(columns={"geometry_x": "geometry"})
 
         if opa_column != "opa_id":
             joined = joined.drop(columns=[opa_column])
 
-        self.gdf = joined
+        # Assign the joined DataFrame to self.gdf as a full copy
+        self.gdf = joined.copy()
         self.rebuild_gdf()
 
     def rebuild_gdf(self):
@@ -270,7 +282,7 @@ def create_centroid_gdf(self):
         Convert the geometry of the GeoDataFrame to centroids.
         """
         self.centroid_gdf = self.gdf.copy()
-        self.centroid_gdf["geometry"] = self.gdf["geometry"].centroid
+        self.centroid_gdf.loc[:, "geometry"] = self.gdf["geometry"].centroid
 
     def build_and_publish_pmtiles(self, tileset_id):
         zoom_threshold = 13
@@ -336,17 +348,19 @@ def build_and_publish_pmtiles(self, tileset_id):
         subprocess.run(command)
 
         write_files = [f"{tileset_id}_staging.pmtiles"]
 
         if write_production_tiles_file:
             write_files.append(f"{tileset_id}.pmtiles")
 
         # check whether the temp saved tiles files is big enough.
         # If not then it might be corrupted so log error and don't upload to gcp.
         file_size = os.stat(temp_merged_pmtiles).st_size
         if file_size < min_tiles_file_size_in_bytes:
-            raise ValueError(f"{temp_merged_pmtiles} is {file_size} bytes in size but should be at least {min_tiles_file_size_in_bytes}. Therefore, we are not uploading any files to the GCP bucket. The file may be corrupt or incomplete.")
-
+            raise ValueError(
+                f"{temp_merged_pmtiles} is {file_size} bytes in size but should be at least {min_tiles_file_size_in_bytes}. Therefore, we are not uploading any files to the GCP bucket. The file may be corrupt or incomplete."
+            )
+
         # Upload to Google Cloud Storage
         for file in write_files:
             blob = bucket.blob(file)
-            blob.upload_from_filename(temp_merged_pmtiles)
+            blob.upload_from_filename(temp_merged_pmtiles)
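
Most of the changes in this file, and in the data_utils files below, replace pandas chained assignment (patterns like df["col"] = ... on a slice, or .fillna(..., inplace=True)) with .loc writes against an explicit .copy(). Writing through .loc on an owned copy avoids pandas' SettingWithCopyWarning and makes it unambiguous which frame receives the write. A minimal sketch of the pattern, using made-up data rather than the project's real tables:

    import pandas as pd

    df = pd.DataFrame({"opa_id": ["123", "abc", "456"], "value": [1, 2, None]})

    # Chained assignment on a filtered slice may write to a temporary and
    # trigger pandas' SettingWithCopyWarning:
    #   subset = df[df["value"].notna()]
    #   subset["opa_id"] = pd.to_numeric(subset["opa_id"], errors="coerce")

    # Explicit copy, then a .loc write: the target frame is unambiguous.
    subset = df.loc[df["value"].notna()].copy()
    subset.loc[:, "opa_id"] = pd.to_numeric(subset["opa_id"], errors="coerce")
    subset = subset.dropna(subset=["opa_id"])
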
50 changes: 35 additions & 15 deletions data/src/data_utils/city_owned_properties.py
@@ -18,26 +18,46 @@ def city_owned_properties(primary_featurelayer):
         "sideyardeligible": "side_yard_eligible",
     }
     primary_featurelayer.gdf.rename(columns=rename_columns, inplace=True)
 
-    primary_featurelayer.gdf.loc[primary_featurelayer.gdf['owner_1'].isin(["PHILADELPHIA HOUSING AUTH", "PHILADELPHIA LAND BANK", "REDEVELOPMENT AUTHORITY", "PHILA REDEVELOPMENT AUTH"]), 'city_owner_agency'] = primary_featurelayer.gdf["owner_1"].replace({
-        "PHILADELPHIA HOUSING AUTH": "PHA",
-        "PHILADELPHIA LAND BANK": "Land Bank (PHDC)",
-        "REDEVELOPMENT AUTHORITY": "PRA",
-        "PHILA REDEVELOPMENT AUTH": "PRA"
-    })
-
     primary_featurelayer.gdf.loc[
-        (primary_featurelayer.gdf['owner_1'] == "CITY OF PHILA") &
-        (primary_featurelayer.gdf['owner_2'].str.contains("PUBLIC PROP|PUBLC PROP", na=False)),
-        'city_owner_agency'
+        primary_featurelayer.gdf["owner_1"].isin(
+            [
+                "PHILADELPHIA HOUSING AUTH",
+                "PHILADELPHIA LAND BANK",
+                "REDEVELOPMENT AUTHORITY",
+                "PHILA REDEVELOPMENT AUTH",
+            ]
+        ),
+        "city_owner_agency",
+    ] = primary_featurelayer.gdf["owner_1"].replace(
+        {
+            "PHILADELPHIA HOUSING AUTH": "PHA",
+            "PHILADELPHIA LAND BANK": "Land Bank (PHDC)",
+            "REDEVELOPMENT AUTHORITY": "PRA",
+            "PHILA REDEVELOPMENT AUTH": "PRA",
+        }
+    )
+
+    primary_featurelayer.gdf.loc[
+        (primary_featurelayer.gdf["owner_1"] == "CITY OF PHILA")
+        & (
+            primary_featurelayer.gdf["owner_2"].str.contains(
+                "PUBLIC PROP|PUBLC PROP", na=False
+            )
+        ),
+        "city_owner_agency",
     ] = "DPP"
 
     primary_featurelayer.gdf.loc[
-        primary_featurelayer.gdf['owner_1'].isin(["CITY OF PHILADELPHIA", "CITY OF PHILA"]) &
-        primary_featurelayer.gdf['owner_2'].isna(),
-        'city_owner_agency'
+        primary_featurelayer.gdf["owner_1"].isin(
+            ["CITY OF PHILADELPHIA", "CITY OF PHILA"]
+        )
+        & primary_featurelayer.gdf["owner_2"].isna(),
+        "city_owner_agency",
     ] = "City of Philadelphia"
 
-    primary_featurelayer.gdf["side_yard_eligible"].fillna("No", inplace=True)
+    primary_featurelayer.gdf.loc[:, "side_yard_eligible"] = primary_featurelayer.gdf[
+        "side_yard_eligible"
+    ].fillna("No")
 
     return primary_featurelayer
4 changes: 3 additions & 1 deletion data/src/data_utils/deliquencies.py
@@ -24,6 +24,8 @@ def deliquencies(primary_featurelayer):
         "opa_number",
     )
 
-    primary_featurelayer.gdf["sheriff_sale"].fillna("N", inplace=True)
+    primary_featurelayer.gdf.loc[:, "sheriff_sale"] = primary_featurelayer.gdf[
+        "sheriff_sale"
+    ].fillna("N")
 
     return primary_featurelayer
10 changes: 6 additions & 4 deletions data/src/data_utils/imm_dang_buildings.py
@@ -10,17 +10,19 @@ def imm_dang_buildings(primary_featurelayer):
         cols=["opa_account_num"],
     )
 
-    imm_dang_buildings.gdf["imm_dang_building"] = "Y"
+    imm_dang_buildings.gdf.loc[:, "imm_dang_building"] = "Y"
 
-    imm_dang_buildings.gdf.rename(
-        columns={"opa_account_num": "opa_number"}, inplace=True
+    imm_dang_buildings.gdf = imm_dang_buildings.gdf.rename(
+        columns={"opa_account_num": "opa_number"}
     )
 
     primary_featurelayer.opa_join(
         imm_dang_buildings.gdf,
         "opa_number",
     )
 
-    primary_featurelayer.gdf["imm_dang_building"].fillna("N", inplace=True)
+    primary_featurelayer.gdf.loc[:, "imm_dang_building"] = primary_featurelayer.gdf[
+        "imm_dang_building"
+    ].fillna("N")
 
     return primary_featurelayer
52 changes: 31 additions & 21 deletions data/src/data_utils/negligent_devs.py
@@ -40,41 +40,51 @@ def standardize_street(street):
 
 def create_standardized_address(row):
     parts = [
-        row["mailing_address_1"].strip(),
-        row["mailing_address_2"].strip(),
-        row["mailing_street"].strip(),
-        row["mailing_city_state"].strip(),
-        row["mailing_zip"].strip(),
+        row["mailing_address_1"].strip()
+        if pd.notnull(row["mailing_address_1"])
+        else "",
+        row["mailing_address_2"].strip()
+        if pd.notnull(row["mailing_address_2"])
+        else "",
+        row["mailing_street"].strip() if pd.notnull(row["mailing_street"]) else "",
+        row["mailing_city_state"].strip()
+        if pd.notnull(row["mailing_city_state"])
+        else "",
+        row["mailing_zip"].strip() if pd.notnull(row["mailing_zip"]) else "",
     ]
-    standardized_address = ", ".join(part for part in parts if part)
+    standardized_address = ", ".join([part for part in parts if part])
     return standardized_address.lower()
 
 
 def negligent_devs(primary_featurelayer):
     devs = primary_featurelayer.gdf
-    city_owners = devs[~devs["city_owner_agency"].isna()]
-    non_city_owners = devs[devs["city_owner_agency"].isna()]
+    city_owners = devs.loc[~devs["city_owner_agency"].isna()].copy()
+    non_city_owners = devs.loc[devs["city_owner_agency"].isna()].copy()
 
-    non_city_owners["mailing_street"] = (
+    non_city_owners.loc[:, "mailing_street"] = (
         non_city_owners["mailing_street"].astype(str).apply(standardize_street)
     )
 
     for term in ["ST", "AVE", "RD", "BLVD"]:
-        non_city_owners["mailing_street"] = non_city_owners["mailing_street"].replace(
-            regex={f"{term}.*": term}
-        )
-    non_city_owners["mailing_address_1"] = non_city_owners["mailing_address_1"].fillna(
-        ""
-    )
-    non_city_owners["mailing_address_2"] = non_city_owners["mailing_address_2"].fillna(
-        ""
-    )
-    non_city_owners["mailing_street"] = non_city_owners["mailing_street"].fillna("")
-    non_city_owners["mailing_city_state"] = non_city_owners[
+        non_city_owners.loc[:, "mailing_street"] = non_city_owners[
+            "mailing_street"
+        ].replace(regex={f"{term}.*": term})
+
+    non_city_owners.loc[:, "mailing_address_1"] = non_city_owners[
+        "mailing_address_1"
+    ].fillna("")
+    non_city_owners.loc[:, "mailing_address_2"] = non_city_owners[
+        "mailing_address_2"
+    ].fillna("")
+    non_city_owners.loc[:, "mailing_street"] = non_city_owners["mailing_street"].fillna(
+        ""
+    )
+    non_city_owners.loc[:, "mailing_city_state"] = non_city_owners[
         "mailing_city_state"
     ].fillna("")
-    non_city_owners["mailing_zip"] = non_city_owners["mailing_zip"].fillna("")
+    non_city_owners.loc[:, "mailing_zip"] = non_city_owners["mailing_zip"].fillna("")
 
-    non_city_owners["standardized_address"] = non_city_owners.apply(
+    non_city_owners.loc[:, "standardized_address"] = non_city_owners.apply(
         create_standardized_address, axis=1
     )
@@ -107,7 +117,7 @@ def negligent_devs(primary_featurelayer):
     primary_featurelayer.gdf.rename(
         columns={"property_count": "n_properties_owned"}, inplace=True
     )
-    primary_featurelayer.gdf["negligent_dev"] = (
+    primary_featurelayer.gdf.loc[:, "negligent_dev"] = (
         primary_featurelayer.gdf["n_properties_owned"] > 5
     ) & (primary_featurelayer.gdf["city_owner_agency"].isna())
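
The reworked create_standardized_address strips each mailing field only when it is non-null, so missing values become empty strings instead of raising AttributeError on NaN. A small illustrative check of that behavior, using a made-up row:

    import pandas as pd

    row = pd.Series(
        {
            "mailing_address_1": "  UNIT 2 ",
            "mailing_address_2": None,  # missing field becomes ""
            "mailing_street": " 123 MAIN ST ",
            "mailing_city_state": "PHILADELPHIA PA",
            "mailing_zip": "19107",
        }
    )

    parts = [
        row[f].strip() if pd.notnull(row[f]) else ""
        for f in [
            "mailing_address_1",
            "mailing_address_2",
            "mailing_street",
            "mailing_city_state",
            "mailing_zip",
        ]
    ]
    print(", ".join([p for p in parts if p]).lower())
    # unit 2, 123 main st, philadelphia pa, 19107
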
6 changes: 4 additions & 2 deletions data/src/data_utils/phs_properties.py
@@ -7,14 +7,16 @@ def phs_properties(primary_featurelayer):
         name="PHS Properties", esri_rest_urls=PHS_LAYERS_TO_LOAD, cols=["BRT_ID"]
    )
 
-    phs_properties.gdf["phs_partner_agency"] = "PHS"
+    phs_properties.gdf.loc[:, "phs_partner_agency"] = "PHS"
 
     primary_featurelayer.opa_join(
         phs_properties.gdf,
         "brt_id",
     )
 
-    primary_featurelayer.gdf["phs_partner_agency"].fillna("None", inplace=True)
+    primary_featurelayer.gdf.loc[:, "phs_partner_agency"] = primary_featurelayer.gdf[
+        "phs_partner_agency"
+    ].fillna("None")
 
     primary_featurelayer.rebuild_gdf()
13 changes: 7 additions & 6 deletions data/src/data_utils/priority_level.py
@@ -4,20 +4,21 @@ def priority_level(dataset):
         priority_level = ""
 
         # Decision Points
-        guncrime_density = row["guncrime_density"]
+        guncrime_density_percentile = row["guncrime_density_percentile"]
         in_phs_landcare = row["phs_partner_agency"] == "PHS"
         has_li_complaint_or_violation = (
             row["li_complaints"] is not None
             and float(row["all_violations_past_year"]) > 0
         )
         very_low_tree_canopy = row["tree_canopy_gap"] >= 0.3
 
-        if guncrime_density == "Bottom 50%":
-            # Low Gun Crime Density
+        # Updated logic based on percentile values
+        if guncrime_density_percentile <= 50:
+            # Low Gun Crime Density (Bottom 50%)
             priority_level = "Low"
 
-        elif guncrime_density in ["Top 25%", "Top 10%", "Top 5%", "Top 1%"]:
-            # High Gun Crime Density
+        elif guncrime_density_percentile > 75:
+            # High Gun Crime Density (Top 25%)
 
             if has_li_complaint_or_violation:
                 priority_level = "High"
@@ -31,7 +32,7 @@
                 priority_level = "High"
 
         else:
-            # Medium Gun Crime Density
+            # Medium Gun Crime Density (Between 50% and 75%)
             if has_li_complaint_or_violation:
                 if in_phs_landcare:
                     priority_level = "Medium"
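
The revised prioritization reads a numeric guncrime_density_percentile instead of matching the old string labels ("Bottom 50%", "Top 25%", and so on). A condensed, hypothetical helper showing just the new thresholds; the real function above also weighs PHS LandCare status, L&I complaints and violations, and tree canopy gap:

    def guncrime_bucket(guncrime_density_percentile: float) -> str:
        if guncrime_density_percentile <= 50:
            return "Low"  # formerly the "Bottom 50%" label
        elif guncrime_density_percentile > 75:
            return "High"  # formerly "Top 25%" / "Top 10%" / "Top 5%" / "Top 1%"
        else:
            return "Medium"  # between the 50th and 75th percentiles

    assert guncrime_bucket(40) == "Low"
    assert guncrime_bucket(80) == "High"
    assert guncrime_bucket(60) == "Medium"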