Skip to content

Commit

Permalink
Merge pull request #313 from perfectly-preserved-pie/clientside-filte…
Browse files Browse the repository at this point in the history
…ring-using-geojson

Resolves #300.

- Migrate from Parquet to GeoJSON.
- Convert all server-side callbacks into JavaScript clientside callbacks.
  • Loading branch information
perfectly-preserved-pie authored Dec 28, 2024
2 parents 898f80b + b288624 commit 78f2765
Show file tree
Hide file tree
Showing 9 changed files with 5,433 additions and 552 deletions.
4,601 changes: 4,601 additions & 0 deletions assets/datasets/lease.geojson

Large diffs are not rendered by default.

389 changes: 385 additions & 4 deletions assets/javascript/clientside_callbacks.js

Large diffs are not rendered by default.

465 changes: 214 additions & 251 deletions assets/javascript/popup.js

Large diffs are not rendered by default.

108 changes: 108 additions & 0 deletions functions/dataframe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,112 @@ def update_dataframe_with_listing_data(
df.at[row.Index, 'listing_url'] = webscrape[2]
except Exception as e:
logger.error(f"Error processing MLS {mls_number} at index {row.Index}: {e}")
return df

# Ordered (label, keywords) rules. The first rule with a keyword found inside
# any comma-separated token wins, so earlier entries take precedence.
_LAUNDRY_RULES = (
    ('In Unit', ('in closet', 'in kitchen', 'in garage', 'inside', 'individual room')),
    ('Shared', ('community laundry', 'common area', 'shared')),
    # 'hookup' is a substring of 'electric dryer hookup', 'gas dryer hookup'
    # and 'washer hookup', so listing those separately would be redundant.
    ('Hookups', ('hookup',)),
    # Likewise 'dryer' / 'washer' already cover 'dryer included' / 'washer included'.
    ('Included Appliances', ('dryer', 'washer')),
    ('Location Specific', ('outside', 'upper level', 'in carport')),
)


def categorize_laundry_features(feature) -> str:
    """
    Collapse a raw, comma-separated laundry description into one category.

    Matching is case-insensitive and substring-based: a rule applies when any of
    its keywords occurs inside any comma-separated token of the input.

    :param feature: The raw laundry value; anything ``pd.isna`` treats as missing is accepted.
    :return: One of 'Unknown', 'In Unit', 'Shared', 'Hookups', 'Included Appliances',
             'Location Specific', 'None' or 'Other'.
    """
    # Missing values are treated as unknown
    if pd.isna(feature):
        return 'Unknown'

    # Normalize: string, lowercase, no surrounding whitespace
    feature_str = str(feature).lower().strip()

    # Empty or literally 'unknown'
    if feature_str in ('', 'unknown'):
        return 'Unknown'

    tokens = [token.strip() for token in feature_str.split(',')]

    for label, keywords in _LAUNDRY_RULES:
        if any(keyword in token for token in tokens for keyword in keywords):
            return label

    # Only an exact 'none' (after normalization) counts as no laundry at all
    return 'None' if feature_str == 'none' else 'Other'

# Base subtype codes and the flattened label each one maps to.
_SUBTYPE_LABELS = {
    "SFR": "Single Family",
    "CONDO": "Condominium",
    "APT": "Apartment",
    "TWNHS": "Townhouse",
    "DPLX": "Duplex",
    "TPLX": "Triplex",
    "QUAD": "Quadplex",
    "LOFT": "Loft",
    "STUD": "Studio",
    "RMRT": "Room For Rent",
    "CABIN": "Cabin",
    "COMRES": "Commercial Residential",
}

# Expand every base code into its bare, attached ('/A') and detached ('/D')
# spellings so no family is missing a variant.
_SUBTYPE_MAP = {
    f"{code}{suffix}": label
    for code, label in _SUBTYPE_LABELS.items()
    for suffix in ("", "/A", "/D")
}
# Long-form spelling that does not follow the CODE[/A|/D] pattern.
_SUBTYPE_MAP["Combo - Res & Com"] = "Commercial Residential"


def flatten_subtype_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten the 'subtype' column in-place by mapping attached/detached abbreviations
    (e.g. 'SFR/A', 'SFR/D', 'CONDO/A', etc.) to a simplified label
    (e.g. 'Single Family', 'Condominium', etc.).

    Every known base code is recognized in its bare, '/A' and '/D' forms.
    Values that are not in the mapping (including missing values) are left as they are.

    :param df: A pandas DataFrame with a column named 'subtype'.
    :return: The same DataFrame (df) with its 'subtype' column flattened.
    """
    # map() yields NaN for unmapped values; fillna() restores the original value there
    df["subtype"] = df["subtype"].map(_SUBTYPE_MAP).fillna(df["subtype"])

    return df
13 changes: 7 additions & 6 deletions functions/mls_image_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from imagekitio.models.UploadFileRequestOptions import UploadFileRequestOptions
from loguru import logger
from typing import Optional, List, Generator, Set
import geopandas as gpd
import pandas as pd
import sys

Expand Down Expand Up @@ -75,26 +76,26 @@ def chunked_list(lst: List, chunk_size: int) -> Generator[List, None, None]:
for i in range(0, len(lst), chunk_size):
yield lst[i:i + chunk_size]

def reclaim_imagekit_space(df_path: str, imagekit_instance: ImageKit) -> None:
def reclaim_imagekit_space(geojson_path: str, imagekit_instance: ImageKit) -> None:
"""
This function reclaims space in ImageKit by deleting images in bulk that are not referenced in the dataframe.
This function reclaims space in ImageKit by deleting images in bulk that are not referenced in the GeoJSON.
Parameters:
df_path (str): The path to the dataframe stored in a parquet file.
geojson_path (str): The path to the GeoJSON file.
imagekit_instance (ImageKit): An instance of ImageKit initialized with the appropriate credentials.
Returns:
None
"""
# Load the dataframe
df = pd.read_parquet(df_path)
# Load the GeoJSON file
gdf = gpd.read_file(geojson_path)

# Get the list of files
list_files_response = imagekit_instance.list_files()
list_files: list = list_files_response.list if hasattr(list_files_response, 'list') else []

# Create a set of referenced mls numbers for faster searching
referenced_mls_numbers: Set[str] = set(df['mls_number'].astype(str))
referenced_mls_numbers: Set[str] = set(gdf['mls_number'].astype(str))

# Initialize a list for file IDs to delete
file_ids_for_deletion: List[str] = [
Expand Down
1 change: 1 addition & 0 deletions lease.json

Large diffs are not rendered by default.

30 changes: 18 additions & 12 deletions lease_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dotenv import load_dotenv, find_dotenv
from functions.dataframe_utils import remove_inactive_listings, update_dataframe_with_listing_data
from functions.dataframe_utils import remove_inactive_listings, update_dataframe_with_listing_data, categorize_laundry_features, flatten_subtype_column
from functions.geocoding_utils import *
from functions.mls_image_processing_utils import *
from functions.noise_level_utils import *
Expand All @@ -8,7 +8,7 @@
from geopy.geocoders import GoogleV3
from imagekitio import ImageKit
from loguru import logger
import asyncio
import geopandas as gpd
import glob
import os
import pandas as pd
Expand Down Expand Up @@ -196,6 +196,9 @@
regex=True
)

# Flatten the subtype column
df = flatten_subtype_column(df)

# Convert the listed date into DateTime and use the "mixed" format to handle the different date formats
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
df['listed_date'] = pd.to_datetime(df['listed_date'], errors='raise', format='mixed')
Expand All @@ -211,13 +214,15 @@

# Save the dataframe for later ingestion by app.py
# Read in the old dataframe
df_old = pd.read_parquet(path='https://github.com/perfectly-preserved-pie/larentals/raw/master/assets/datasets/lease.parquet')
# geopandas.read_file names its first parameter `filename` and has no `path` keyword,
# so the URL is passed positionally.
df_old = gpd.read_file('https://github.com/perfectly-preserved-pie/larentals/raw/master/assets/datasets/lease.geojson')
# Combine both old and new dataframes
df_combined = pd.concat([df, df_old], ignore_index=True)
# Drop any dupes again
df_combined = df_combined.drop_duplicates(subset=['mls_number'], keep="last")
# Iterate through the dataframe and drop rows with expired listings
df_combined = remove_inactive_listings(df_combined)
# Categorize the laundry features
df_combined['laundry_category'] = df_combined['laundry'].apply(categorize_laundry_features)
# Reset the index
df_combined = df_combined.reset_index(drop=True)
# Filter the dataframe for rows outside of California
Expand All @@ -237,16 +242,17 @@
df_combined.at[row.Index, 'latitude'] = coordinates[0]
df_combined.at[row.Index, 'longitude'] = coordinates[1]
# Save the new combined dataframe
# Convert the combined DataFrame to a GeoDataFrame
gdf_combined = gpd.GeoDataFrame(
df_combined,
geometry=gpd.points_from_xy(df_combined.longitude, df_combined.latitude)
)
# Save the GeoDataFrame as a GeoJSON file
try:
df_combined.to_parquet(path="assets/datasets/lease.parquet")
gdf_combined.to_file("assets/datasets/lease.geojson", driver="GeoJSON")
logger.info("Saved the combined GeoDataFrame to a GeoJSON file.")
except Exception as e:
logger.warning(f"Error saving the combined dataframe as a parquet file: {e}. Falling back to CSV...")
# Save the new combined dataframe to a CSV file
try:
df_combined.to_csv(path_or_buf="assets/datasets/lease.csv", index=False)
logger.info("Saved the combined dataframe to a CSV file")
except Exception as e:
logger.error(f"Error saving the combined dataframe to a CSV file: {e}")
logger.error(f"Error saving the combined GeoDataFrame to a GeoJSON file: {e}")

# Reclaim space in ImageKit
reclaim_imagekit_space(df_path="assets/datasets/lease.parquet", imagekit_instance=imagekit)
reclaim_imagekit_space(geojson_path="assets/datasets/lease.geojson", imagekit_instance=imagekit)
Loading

0 comments on commit 78f2765

Please sign in to comment.