
Commit

Merge pull request #966 from CodeForPhilly/lebovits/fix-phs-and-duplicates

Lebovits/fix phs and duplicates
nlebovits authored Oct 19, 2024
2 parents a38843c + 3d89cc7 commit d66fc4f
Showing 6 changed files with 30 additions and 43 deletions.
30 changes: 1 addition & 29 deletions data/src/classes/featurelayer.py
@@ -314,35 +314,7 @@ def build_and_publish(self, tiles_file_id_prefix: str) -> None:
self.centroid_gdf["geometry"] = self.centroid_gdf["geometry"].centroid
self.centroid_gdf = self.centroid_gdf.to_crs(epsg=4326)
self.centroid_gdf.to_file(temp_geojson_points, driver="GeoJSON")

# Load the GeoJSON from the polygons, drop geometry, and save as Parquet
gdf_polygons = gpd.read_file(temp_geojson_polygons)
df_no_geom = gdf_polygons.drop(columns=["geometry"])

# Check if the DataFrame has fewer than 25,000 rows
num_rows, num_cols = df_no_geom.shape
if num_rows < 25000:
print(
f"Parquet file has {num_rows} rows, which is fewer than 25,000. Skipping upload."
)
return

# Save the DataFrame as Parquet
df_no_geom.to_parquet(temp_parquet)

# Upload Parquet to Google Cloud Storage
blob_parquet = bucket.blob(f"{tiles_file_id_prefix}.parquet")
try:
blob_parquet.upload_from_filename(temp_parquet)
parquet_size = os.stat(temp_parquet).st_size
parquet_size_mb = parquet_size / (1024 * 1024)
print(
f"Parquet upload successful! Size: {parquet_size} bytes ({parquet_size_mb:.2f} MB), Dimensions: {num_rows} rows, {num_cols} columns."
)
except Exception as e:
print(f"Parquet upload failed: {e}")
return


# Command for generating PMTiles for points up to zoom level zoom_threshold
points_command: list[str] = [
"tippecanoe",
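The rest of this hunk (collapsed in the diff) assembles the tippecanoe command that generates the points PMTiles up to zoom_threshold. The exact flags are not visible here, so the following is only a minimal sketch of how such a command might be built and run; the flag choices, paths, and zoom value are assumptions, not the project's actual settings.

# Minimal sketch only: the real tippecanoe flags used by build_and_publish are
# collapsed in this diff, so the options, paths, and zoom value below are assumed.
import subprocess

zoom_threshold = 14                          # assumed value
temp_geojson_points = "tmp/points.geojson"   # assumed temp input path
temp_pmtiles_points = "tmp/points.pmtiles"   # assumed output path

points_command: list[str] = [
    "tippecanoe",
    "-o", temp_pmtiles_points,               # output PMTiles file
    f"--maximum-zoom={zoom_threshold}",      # generate tiles only up to this zoom
    "--force",                               # overwrite an existing output file
    temp_geojson_points,
]
subprocess.run(points_command, check=True)   # raise if tippecanoe fails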
7 changes: 1 addition & 6 deletions data/src/data_utils/access_process.py
@@ -39,10 +39,5 @@ def access_process(dataset: Any) -> Any:
access_processes.append(access_process)

dataset.gdf["access_process"] = access_processes

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)


return dataset
4 changes: 2 additions & 2 deletions data/src/data_utils/phs_properties.py
@@ -22,10 +22,10 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
primary_featurelayer.spatial_join(phs_properties)

# Initialize 'phs_care_program' column with default "no" for all rows
primary_featurelayer.gdf["phs_care_program"] = "no"
primary_featurelayer.gdf["phs_care_program"] = "No"

# Set 'phs_care_program' to "yes" for matched rows
primary_featurelayer.gdf.loc[primary_featurelayer.gdf["phs_care_program"] != "no", "phs_care_program"] = "yes"
primary_featurelayer.gdf.loc[primary_featurelayer.gdf["program"].notna(), "phs_care_program"] = "Yes"

# Rebuild the GeoDataFrame after updates
primary_featurelayer.rebuild_gdf()
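This is the core PHS fix: the old code set every row's phs_care_program to "no" and then immediately tested phs_care_program != "no", a condition that could never be true, so no parcel was ever flagged. The new code keys off the program column contributed by the spatial join instead. Below is a minimal standalone sketch of the corrected logic, using hypothetical toy data.

# Minimal sketch of the corrected flag logic; the toy data is hypothetical, and it
# assumes the spatial join adds a "program" column that is non-null only for
# parcels matched to a PHS property.
import pandas as pd

gdf = pd.DataFrame(
    {
        "opa_id": ["0001", "0002", "0003"],
        "program": ["LandCare", None, "LandCare"],  # hypothetical join result
    }
)

gdf["phs_care_program"] = "No"                               # default for every parcel
gdf.loc[gdf["program"].notna(), "phs_care_program"] = "Yes"  # flip matched parcels

print(gdf["phs_care_program"].tolist())  # ['Yes', 'No', 'Yes']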
1 change: 1 addition & 0 deletions data/src/data_utils/priority_level.py
@@ -47,4 +47,5 @@ def priority_level(dataset):
priority_levels.append(priority_level)

dataset.gdf["priority_level"] = priority_levels

return dataset
11 changes: 5 additions & 6 deletions data/src/data_utils/vacant_properties.py
@@ -145,13 +145,7 @@ def vacant_properties() -> FeatureLayer:
vacant_properties.gdf, geometry="geometry"
)

print(
f"Vacant properties data size before dropping NAs: {len(vacant_properties.gdf)} rows."
)
vacant_properties.gdf.dropna(subset=["opa_id"], inplace=True)
print(
f"Vacant properties data size after dropping NAs: {len(vacant_properties.gdf)} rows."
)

# Final null value check before returning
check_null_percentage(vacant_properties.gdf)
@@ -184,4 +178,9 @@ def vacant_properties() -> FeatureLayer:
# Ensure concatenated data is still a GeoDataFrame
vacant_properties.gdf = gpd.GeoDataFrame(vacant_properties.gdf, geometry="geometry")

before_drop = vacant_properties.gdf.shape[0]
vacant_properties.gdf = vacant_properties.gdf.drop_duplicates(subset="opa_id")
after_drop = vacant_properties.gdf.shape[0]
print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")

return vacant_properties
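The added block above removes rows that share an opa_id and reports how many were dropped. A small self-contained demonstration of the same pattern, using hypothetical data:

# Self-contained demonstration of the deduplication pattern added above.
# The parcels below are hypothetical; drop_duplicates keeps the first row per opa_id.
import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame(
    {"opa_id": ["1001", "1001", "1002"]},
    geometry=[Point(0, 0), Point(0, 0), Point(1, 1)],
    crs="EPSG:4326",
)

before_drop = gdf.shape[0]
gdf = gdf.drop_duplicates(subset="opa_id")
after_drop = gdf.shape[0]
print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")  # prints 1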
20 changes: 20 additions & 0 deletions data/src/script.py
@@ -80,12 +80,32 @@
for service in services:
dataset = service(dataset)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}")

# Add Priority Level
dataset = priority_level(dataset)

# Print the distribution of "priority_level"
distribution = dataset.gdf["priority_level"].value_counts()
print("Distribution of priority level:")
print(distribution)

# Add Access Process
dataset = access_process(dataset)

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate final dataset rows droppeds: {before_drop - after_drop}")

# back up old tiles file whether we are reloading data or not
if backup is None:
backup = BackupArchiveDatabase()
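script.py now repeats the same drop_duplicates-and-report step before priority_level and again after access_process, alongside the relocated distribution prints. A hypothetical helper (not part of this PR) that captures the repeated step could look like this:

# Hypothetical helper, not part of this PR: wraps the drop-duplicates-and-report
# step that script.py now performs at two points in the pipeline.
import geopandas as gpd


def drop_duplicate_opa_ids(gdf: gpd.GeoDataFrame, label: str) -> gpd.GeoDataFrame:
    """Drop rows with repeated opa_id values and report how many were removed."""
    before_drop = gdf.shape[0]
    deduped = gdf.drop_duplicates(subset="opa_id")
    after_drop = deduped.shape[0]
    print(f"Duplicate {label} rows dropped: {before_drop - after_drop}")
    return deduped


# Usage mirroring the two call sites added in script.py:
# dataset.gdf = drop_duplicate_opa_ids(dataset.gdf, "dataset")
# dataset.gdf = drop_duplicate_opa_ids(dataset.gdf, "final dataset")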
