Commit 05675d0
Merge pull request #314 from perfectly-preserved-pie/dev
Switch from serverside to clientside callbacks
perfectly-preserved-pie authored Dec 28, 2024
2 parents 7d05ea4 + ef3b44a commit 05675d0
Showing 10 changed files with 5,440 additions and 559 deletions.
4,601 changes: 4,601 additions & 0 deletions assets/datasets/lease.geojson

Large diffs are not rendered by default.

389 changes: 385 additions & 4 deletions assets/javascript/clientside_callbacks.js

Large diffs are not rendered by default.

465 changes: 214 additions & 251 deletions assets/javascript/popup.js

Large diffs are not rendered by default.

108 changes: 108 additions & 0 deletions functions/dataframe_utils.py
@@ -103,4 +103,112 @@ def update_dataframe_with_listing_data(
            df.at[row.Index, 'listing_url'] = webscrape[2]
        except Exception as e:
            logger.error(f"Error processing MLS {mls_number} at index {row.Index}: {e}")
    return df

def categorize_laundry_features(feature) -> str:
    """
    Map a raw MLS laundry-features value to one of a few broad buckets:
    'In Unit', 'Shared', 'Hookups', 'Included Appliances', 'Location Specific',
    'None', 'Unknown', or 'Other'.
    :param feature: The raw laundry feature value (a string, or NaN if missing).
    :return: The category label as a string.
    """
    # If it's NaN, treat as unknown
    if pd.isna(feature):
        return 'Unknown'

    # Convert to string, lowercase, and strip whitespace
    feature_str = str(feature).lower().strip()

    # If it's empty or literally 'unknown', just call it 'Unknown'
    if feature_str in ['', 'unknown']:
        return 'Unknown'

    # Split on commas
    tokens = [token.strip() for token in feature_str.split(',')]

    # True if any keyword appears as a substring of any token
    has_any = lambda keywords: any(any_kw in t for t in tokens for any_kw in keywords)

    if has_any(['in closet', 'in kitchen', 'in garage', 'inside', 'individual room']):
        return 'In Unit'
    elif has_any(['community laundry', 'common area', 'shared']):
        return 'Shared'
    elif has_any(['hookup', 'electric dryer hookup', 'gas dryer hookup', 'washer hookup']):
        return 'Hookups'
    elif has_any(['dryer included', 'dryer', 'washer included', 'washer']):
        return 'Included Appliances'
    elif has_any(['outside', 'upper level', 'in carport']):
        return 'Location Specific'
    elif feature_str == 'none':
        return 'None'
    else:
        return 'Other'
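A quick usage sketch (an editor's illustration with invented sample strings, not part of the diff) showing how the branch order buckets a few representative values; hookups are checked before included appliances, so 'Gas Dryer Hookup' lands in 'Hookups' rather than 'Included Appliances':

# Hypothetical example values; results follow the branch order above
samples = pd.Series(['Inside, Individual Room', 'Community Laundry', 'Gas Dryer Hookup', 'Washer Included', None])
samples.apply(categorize_laundry_features).tolist()
# -> ['In Unit', 'Shared', 'Hookups', 'Included Appliances', 'Unknown']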

def flatten_subtype_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten the 'subtype' column in-place by mapping attached/detached abbreviations
    (e.g. 'SFR/A', 'SFR/D', 'CONDO/A', etc.) to a simplified label
    (e.g. 'Single Family', 'Condominium', etc.).
    :param df: A pandas DataFrame with a column named 'subtype'.
    :return: The same DataFrame (df) with its 'subtype' column flattened.
    """

    # Create a mapping from various raw subtype strings → flattened label
    subtype_map = {
        # Single Family
        "SFR": "Single Family",
        "SFR/A": "Single Family",
        "SFR/D": "Single Family",

        # Condominium
        "CONDO": "Condominium",
        "CONDO/A": "Condominium",
        "CONDO/D": "Condominium",

        # Apartment
        "APT": "Apartment",
        "APT/A": "Apartment",
        "APT/D": "Apartment",

        # Townhouse
        "TWNHS": "Townhouse",
        "TWNHS/A": "Townhouse",
        "TWNHS/D": "Townhouse",

        # Duplex
        "DPLX": "Duplex",
        "DPLX/A": "Duplex",
        "DPLX/D": "Duplex",

        # Triplex
        "TPLX": "Triplex",
        "TPLX/A": "Triplex",
        "TPLX/D": "Triplex",

        # Quadplex
        "QUAD": "Quadplex",
        "QUAD/A": "Quadplex",
        "QUAD/D": "Quadplex",

        # Lofts
        "LOFT": "Loft",
        "LOFT/A": "Loft",

        # Studios
        "STUD": "Studio",
        "STUD/A": "Studio",
        "STUD/D": "Studio",

        # Room for Rent
        "RMRT/A": "Room For Rent",
        "RMRT/D": "Room For Rent",

        # Cabin
        "CABIN": "Cabin",
        "CABIN/A": "Cabin",
        "CABIN/D": "Cabin",

        # Commercial Residential
        "COMRES/A": "Commercial Residential",
        "COMRES/D": "Commercial Residential",
        "Combo - Res & Com": "Commercial Residential",
    }

    # Apply the mapping: where a key is found, replace with its value; otherwise leave as is
    df["subtype"] = df["subtype"].map(subtype_map).fillna(df["subtype"])

    return df
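A similar sketch (an editor's illustration, not part of the diff; 'Mobile Home' is an invented unmapped value) showing the map-then-fillna pass-through, where unmapped subtypes survive unchanged:

df_demo = pd.DataFrame({'subtype': ['SFR/D', 'TWNHS/A', 'Mobile Home']})
flatten_subtype_column(df_demo)['subtype'].tolist()
# -> ['Single Family', 'Townhouse', 'Mobile Home']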
13 changes: 7 additions & 6 deletions functions/mls_image_processing_utils.py
@@ -2,6 +2,7 @@
 from imagekitio.models.UploadFileRequestOptions import UploadFileRequestOptions
 from loguru import logger
 from typing import Optional, List, Generator, Set
+import geopandas as gpd
 import pandas as pd
 import sys

@@ -75,26 +76,26 @@ def chunked_list(lst: List, chunk_size: int) -> Generator[List, None, None]:
     for i in range(0, len(lst), chunk_size):
         yield lst[i:i + chunk_size]

-def reclaim_imagekit_space(df_path: str, imagekit_instance: ImageKit) -> None:
+def reclaim_imagekit_space(geojson_path: str, imagekit_instance: ImageKit) -> None:
     """
-    This function reclaims space in ImageKit by deleting images in bulk that are not referenced in the dataframe.
+    This function reclaims space in ImageKit by deleting images in bulk that are not referenced in the GeoJSON.
     Parameters:
-    df_path (str): The path to the dataframe stored in a parquet file.
+    geojson_path (str): The path to the GeoJSON file.
     imagekit_instance (ImageKit): An instance of ImageKit initialized with the appropriate credentials.
     Returns:
     None
     """
-    # Load the dataframe
-    df = pd.read_parquet(df_path)
+    # Load the GeoJSON file
+    gdf = gpd.read_file(geojson_path)

     # Get the list of files
     list_files_response = imagekit_instance.list_files()
     list_files: list = list_files_response.list if hasattr(list_files_response, 'list') else []

     # Create a set of referenced mls numbers for faster searching
-    referenced_mls_numbers: Set[str] = set(df['mls_number'].astype(str))
+    referenced_mls_numbers: Set[str] = set(gdf['mls_number'].astype(str))

     # Initialize a list for file IDs to delete
     file_ids_for_deletion: List[str] = [
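The rendered diff is truncated at this point. For orientation only, a hedged sketch of how such a filter could conclude, assuming each entry returned by list_files exposes file_id and name attributes (as in the ImageKit Python SDK); this is an editor's guess standing in for the elided code:

# Editor's sketch, not the repository's actual code:
# a file is slated for deletion when no referenced MLS number appears in its name
file_ids_for_deletion: List[str] = [
    file.file_id
    for file in list_files
    if not any(mls_number in file.name for mls_number in referenced_mls_numbers)
]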
1 change: 1 addition & 0 deletions lease.json

Large diffs are not rendered by default.

30 changes: 18 additions & 12 deletions lease_dataframe.py
@@ -1,5 +1,5 @@
 from dotenv import load_dotenv, find_dotenv
-from functions.dataframe_utils import remove_inactive_listings, update_dataframe_with_listing_data
+from functions.dataframe_utils import remove_inactive_listings, update_dataframe_with_listing_data, categorize_laundry_features, flatten_subtype_column
 from functions.geocoding_utils import *
 from functions.mls_image_processing_utils import *
 from functions.noise_level_utils import *
@@ -8,7 +8,7 @@
 from geopy.geocoders import GoogleV3
 from imagekitio import ImageKit
 from loguru import logger
-import asyncio
+import geopandas as gpd
 import glob
 import os
 import pandas as pd
@@ -196,6 +196,9 @@
     regex=True
 )

+# Flatten the subtype column
+df = flatten_subtype_column(df)
+
 # Convert the listed date into DateTime and use the "mixed" format to handle the different date formats
 # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
 df['listed_date'] = pd.to_datetime(df['listed_date'], errors='raise', format='mixed')
@@ -211,13 +214,15 @@

 # Save the dataframe for later ingestion by app.py
 # Read in the old dataframe
-df_old = pd.read_parquet(path='https://github.com/perfectly-preserved-pie/larentals/raw/master/assets/datasets/lease.parquet')
+df_old = gpd.read_file('https://github.com/perfectly-preserved-pie/larentals/raw/master/assets/datasets/lease.geojson')
 # Combine both old and new dataframes
 df_combined = pd.concat([df, df_old], ignore_index=True)
 # Drop any dupes again
 df_combined = df_combined.drop_duplicates(subset=['mls_number'], keep="last")
 # Iterate through the dataframe and drop rows with expired listings
 df_combined = remove_inactive_listings(df_combined)
+# Categorize the laundry features
+df_combined['laundry_category'] = df_combined['laundry'].apply(categorize_laundry_features)
 # Reset the index
 df_combined = df_combined.reset_index(drop=True)
 # Filter the dataframe for rows outside of California
@@ -237,16 +242,17 @@
     df_combined.at[row.Index, 'latitude'] = coordinates[0]
     df_combined.at[row.Index, 'longitude'] = coordinates[1]
 # Save the new combined dataframe
+# Convert the combined DataFrame to a GeoDataFrame
+gdf_combined = gpd.GeoDataFrame(
+    df_combined,
+    geometry=gpd.points_from_xy(df_combined.longitude, df_combined.latitude)
+)
+# Save the GeoDataFrame as a GeoJSON file
 try:
-    df_combined.to_parquet(path="assets/datasets/lease.parquet")
+    gdf_combined.to_file("assets/datasets/lease.geojson", driver="GeoJSON")
+    logger.info("Saved the combined GeoDataFrame to a GeoJSON file.")
 except Exception as e:
-    logger.warning(f"Error saving the combined dataframe as a parquet file: {e}. Falling back to CSV...")
-    # Save the new combined dataframe to a CSV file
-    try:
-        df_combined.to_csv(path_or_buf="assets/datasets/lease.csv", index=False)
-        logger.info("Saved the combined dataframe to a CSV file")
-    except Exception as e:
-        logger.error(f"Error saving the combined dataframe to a CSV file: {e}")
+    logger.error(f"Error saving the combined GeoDataFrame to a GeoJSON file: {e}")

 # Reclaim space in ImageKit
-reclaim_imagekit_space(df_path="assets/datasets/lease.parquet", imagekit_instance=imagekit)
+reclaim_imagekit_space(geojson_path="assets/datasets/lease.geojson", imagekit_instance=imagekit)
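Since app.py and the new clientside callbacks now ingest the GeoJSON instead of the parquet file, a minimal round-trip check (an editor's sketch, assuming the column names used above) can confirm the saved file is readable:

# Editor's sketch, not part of the diff:
import geopandas as gpd
gdf_check = gpd.read_file("assets/datasets/lease.geojson")
assert {'mls_number', 'laundry_category', 'geometry'}.issubset(gdf_check.columns)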