
Commit

Merge pull request #966 from CodeForPhilly/lebovits/fix-phs-and-duplicates

Lebovits/fix phs and duplicates
nlebovits authored Oct 19, 2024
2 parents a38843c + 3d89cc7 commit d66fc4f
Showing 6 changed files with 30 additions and 43 deletions.
30 changes: 1 addition & 29 deletions data/src/classes/featurelayer.py
@@ -314,35 +314,7 @@ def build_and_publish(self, tiles_file_id_prefix: str) -> None:
self.centroid_gdf["geometry"] = self.centroid_gdf["geometry"].centroid
self.centroid_gdf = self.centroid_gdf.to_crs(epsg=4326)
self.centroid_gdf.to_file(temp_geojson_points, driver="GeoJSON")

# Load the GeoJSON from the polygons, drop geometry, and save as Parquet
gdf_polygons = gpd.read_file(temp_geojson_polygons)
df_no_geom = gdf_polygons.drop(columns=["geometry"])

# Check if the DataFrame has fewer than 25,000 rows
num_rows, num_cols = df_no_geom.shape
if num_rows < 25000:
print(
f"Parquet file has {num_rows} rows, which is fewer than 25,000. Skipping upload."
)
return

# Save the DataFrame as Parquet
df_no_geom.to_parquet(temp_parquet)

# Upload Parquet to Google Cloud Storage
blob_parquet = bucket.blob(f"{tiles_file_id_prefix}.parquet")
try:
blob_parquet.upload_from_filename(temp_parquet)
parquet_size = os.stat(temp_parquet).st_size
parquet_size_mb = parquet_size / (1024 * 1024)
print(
f"Parquet upload successful! Size: {parquet_size} bytes ({parquet_size_mb:.2f} MB), Dimensions: {num_rows} rows, {num_cols} columns."
)
except Exception as e:
print(f"Parquet upload failed: {e}")
return


# Command for generating PMTiles for points up to zoom level zoom_threshold
points_command: list[str] = [
"tippecanoe",
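The rest of this hunk (collapsed in the diff) assembles the tippecanoe command that generates the points PMTiles up to zoom_threshold. The exact flags are not visible here, so the following is only a minimal sketch of how such a command might be built and run; the flag choices, paths, and zoom value are assumptions, not the project's actual settings.

# Minimal sketch only: the real tippecanoe flags used by build_and_publish are
# collapsed in this diff, so the options, paths, and zoom value below are assumed.
import subprocess

zoom_threshold = 14                          # assumed value
temp_geojson_points = "tmp/points.geojson"   # assumed temp input path
temp_pmtiles_points = "tmp/points.pmtiles"   # assumed output path

points_command: list[str] = [
    "tippecanoe",
    "-o", temp_pmtiles_points,               # output PMTiles file
    f"--maximum-zoom={zoom_threshold}",      # generate tiles only up to this zoom
    "--force",                               # overwrite an existing output file
    temp_geojson_points,
]
subprocess.run(points_command, check=True)   # raise if tippecanoe fails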
7 changes: 1 addition & 6 deletions data/src/data_utils/access_process.py
@@ -39,10 +39,5 @@ def access_process(dataset: Any) -> Any:
access_processes.append(access_process)

dataset.gdf["access_process"] = access_processes

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)


return dataset
4 changes: 2 additions & 2 deletions data/src/data_utils/phs_properties.py
@@ -22,10 +22,10 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
primary_featurelayer.spatial_join(phs_properties)

# Initialize 'phs_care_program' column with default "no" for all rows
primary_featurelayer.gdf["phs_care_program"] = "no"
primary_featurelayer.gdf["phs_care_program"] = "No"

# Set 'phs_care_program' to "yes" for matched rows
primary_featurelayer.gdf.loc[primary_featurelayer.gdf["phs_care_program"] != "no", "phs_care_program"] = "yes"
primary_featurelayer.gdf.loc[primary_featurelayer.gdf["program"].notna(), "phs_care_program"] = "Yes"

# Rebuild the GeoDataFrame after updates
primary_featurelayer.rebuild_gdf()
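This is the core PHS fix: the old code set every row's phs_care_program to "no" and then immediately tested phs_care_program != "no", a condition that could never be true, so no parcel was ever flagged. The new code keys off the program column contributed by the spatial join instead. Below is a minimal standalone sketch of the corrected logic, using hypothetical toy data.

# Minimal sketch of the corrected flag logic; the toy data is hypothetical, and it
# assumes the spatial join adds a "program" column that is non-null only for
# parcels matched to a PHS property.
import pandas as pd

gdf = pd.DataFrame(
    {
        "opa_id": ["0001", "0002", "0003"],
        "program": ["LandCare", None, "LandCare"],  # hypothetical join result
    }
)

gdf["phs_care_program"] = "No"                               # default for every parcel
gdf.loc[gdf["program"].notna(), "phs_care_program"] = "Yes"  # flip matched parcels

print(gdf["phs_care_program"].tolist())  # ['Yes', 'No', 'Yes']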
1 change: 1 addition & 0 deletions data/src/data_utils/priority_level.py
@@ -47,4 +47,5 @@ def priority_level(dataset):
priority_levels.append(priority_level)

dataset.gdf["priority_level"] = priority_levels

return dataset
11 changes: 5 additions & 6 deletions data/src/data_utils/vacant_properties.py
@@ -145,13 +145,7 @@ def vacant_properties() -> FeatureLayer:
vacant_properties.gdf, geometry="geometry"
)

print(
f"Vacant properties data size before dropping NAs: {len(vacant_properties.gdf)} rows."
)
vacant_properties.gdf.dropna(subset=["opa_id"], inplace=True)
print(
f"Vacant properties data size after dropping NAs: {len(vacant_properties.gdf)} rows."
)

# Final null value check before returning
check_null_percentage(vacant_properties.gdf)
@@ -184,4 +178,9 @@ def vacant_properties() -> FeatureLayer:
# Ensure concatenated data is still a GeoDataFrame
vacant_properties.gdf = gpd.GeoDataFrame(vacant_properties.gdf, geometry="geometry")

before_drop = vacant_properties.gdf.shape[0]
vacant_properties.gdf = vacant_properties.gdf.drop_duplicates(subset="opa_id")
after_drop = vacant_properties.gdf.shape[0]
print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")

return vacant_properties
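The added block above removes rows that share an opa_id and reports how many were dropped. A small self-contained demonstration of the same pattern, using hypothetical data:

# Self-contained demonstration of the deduplication pattern added above.
# The parcels below are hypothetical; drop_duplicates keeps the first row per opa_id.
import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame(
    {"opa_id": ["1001", "1001", "1002"]},
    geometry=[Point(0, 0), Point(0, 0), Point(1, 1)],
    crs="EPSG:4326",
)

before_drop = gdf.shape[0]
gdf = gdf.drop_duplicates(subset="opa_id")
after_drop = gdf.shape[0]
print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")  # prints 1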
20 changes: 20 additions & 0 deletions data/src/script.py
@@ -80,12 +80,32 @@
for service in services:
dataset = service(dataset)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}")

# Add Priority Level
dataset = priority_level(dataset)

# Print the distribution of "priority_level"
distribution = dataset.gdf["priority_level"].value_counts()
print("Distribution of priority level:")
print(distribution)

# Add Access Process
dataset = access_process(dataset)

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate final dataset rows droppeds: {before_drop - after_drop}")

# back up old tiles file whether we are reloading data or not
if backup is None:
backup = BackupArchiveDatabase()
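script.py now repeats the same drop_duplicates-and-report step before priority_level and again after access_process, alongside the relocated distribution prints. A hypothetical helper (not part of this PR) that captures the repeated step could look like this:

# Hypothetical helper, not part of this PR: wraps the drop-duplicates-and-report
# step that script.py now performs at two points in the pipeline.
import geopandas as gpd


def drop_duplicate_opa_ids(gdf: gpd.GeoDataFrame, label: str) -> gpd.GeoDataFrame:
    """Drop rows with repeated opa_id values and report how many were removed."""
    before_drop = gdf.shape[0]
    deduped = gdf.drop_duplicates(subset="opa_id")
    after_drop = deduped.shape[0]
    print(f"Duplicate {label} rows dropped: {before_drop - after_drop}")
    return deduped


# Usage mirroring the two call sites added in script.py:
# dataset.gdf = drop_duplicate_opa_ids(dataset.gdf, "dataset")
# dataset.gdf = drop_duplicate_opa_ids(dataset.gdf, "final dataset")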
