diff --git a/.gitignore b/.gitignore
index 7b3d0e8c..40f52974 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ __pycache__/larentals.cpython-310.pyc
*.csv
*.pyc
*.xlsx
+.venv/
env
hdf
larentals-checkpoint.py
diff --git a/assets/datasets/lease.parquet b/assets/datasets/lease.parquet
index 662aecea..0ff9ecbc 100644
Binary files a/assets/datasets/lease.parquet and b/assets/datasets/lease.parquet differ
diff --git a/assets/datasets/lease.parquet.bak.newest.kindafuckedup b/assets/datasets/lease.parquet.bak.newest.kindafuckedup
new file mode 100644
index 00000000..4a3925a2
Binary files /dev/null and b/assets/datasets/lease.parquet.bak.newest.kindafuckedup differ
diff --git a/assets/javascript/popup.js b/assets/javascript/popup.js
index 4a132307..a44b2bd1 100644
--- a/assets/javascript/popup.js
+++ b/assets/javascript/popup.js
@@ -22,7 +22,7 @@ window.dash_props = Object.assign({}, window.dash_props, {
return `
Listing ID (MLS#) |
- Not Available |
+ ${data.mls_number} |
`;
}
@@ -47,9 +47,9 @@ window.dash_props = Object.assign({}, window.dash_props, {
const listingUrlBlock = getListingUrlBlock(data);
// Conditionally include the property image row if the image URL is available
- const imageRow = data.image_url ? `
+ const imageRow = data.mls_photo ? `
-
+
` : '';
@@ -64,7 +64,7 @@ window.dash_props = Object.assign({}, window.dash_props, {
${imageRow}
-
${data.address}
+ ${data.full_street_address}
@@ -106,11 +106,11 @@ window.dash_props = Object.assign({}, window.dash_props, {
Bedrooms/Bathrooms |
- ${data.bedrooms}/${data.bathrooms} |
+ ${data.bedrooms}/${data.total_bathrooms} |
- Garage Spaces |
- ${data.garage_spaces || "Unknown"} |
+ Parking Spaces |
+ ${data.parking_spaces || "Unknown"} |
Pets Allowed? |
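The popup template now reads renamed fields from each marker's data object. A minimal sketch (field names taken from the template above; values invented) of the per-listing dict the Dash side would need to supply:

```python
# Hypothetical per-marker payload; only the key names are taken from popup.js.
popup_data = {
    "mls_number": "AR12345678",            # previously rendered as "Not Available"
    "mls_photo": "https://ik.imagekit.io/example/AR12345678.jpg",   # was data.image_url
    "full_street_address": "118 S Cordova ST #B, ALHAMBRA 91801",  # was data.address
    "bedrooms": 2,
    "total_bathrooms": 1,                  # was data.bathrooms
    "parking_spaces": 1,                   # was data.garage_spaces
}
```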
diff --git a/functions/dataframe_utils.py b/functions/dataframe_utils.py
index 3cbd8010..beb8f6dc 100644
--- a/functions/dataframe_utils.py
+++ b/functions/dataframe_utils.py
@@ -1,5 +1,5 @@
-from aiolimiter import AsyncLimiter
-from functions.webscraping_utils import check_expired_listing
+from functions.mls_image_processing_utils import imagekit_transform
+from functions.webscraping_utils import check_expired_listing_bhhs, check_expired_listing_theagency, webscrape_bhhs, fetch_the_agency_data
from loguru import logger
import asyncio
import pandas as pd
@@ -8,40 +8,99 @@
# Initialize logging
logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="INFO")
-async def remove_expired_listings(df: pd.DataFrame, limiter: AsyncLimiter) -> pd.DataFrame:
+def remove_inactive_listings(df: pd.DataFrame) -> pd.DataFrame:
"""
- Asynchronously checks each listing URL in the DataFrame to determine if it has expired,
- and removes rows with expired listings, applying rate limiting. Also counts the number of expired listings removed.
+ Checks each listing to determine if it has expired or been sold, and removes inactive listings.
+ If 'bhhscalifornia.com' is in the 'listing_url', it checks for expired listings on BHHS.
+ If 'theagencyre.com' is in the 'listing_url', it checks for listings sold on The Agency.
Parameters:
df (pd.DataFrame): The DataFrame containing listing URLs and MLS numbers.
- limiter (AsyncLimiter): The rate limiter to control request frequency.
Returns:
- pd.DataFrame: The DataFrame with expired listings removed.
+ pd.DataFrame: The DataFrame with inactive listings removed.
"""
- async def check_and_mark_expired(row):
- async with limiter:
- expired = await check_expired_listing(row.listing_url, row.mls_number)
- return (row.Index, expired)
-
- # Gather tasks for all rows that need to be checked
- tasks = [check_and_mark_expired(row) for row in df[df.listing_url.notnull()].itertuples()]
- results = await asyncio.gather(*tasks)
-
- # Determine indexes of rows to drop (where listing has expired)
- indexes_to_drop = [index for index, expired in results if expired]
-
- # Counter for expired listings
- expired_count = len(indexes_to_drop)
-
- # Log success messages for dropped listings and the count of expired listings
- for index in indexes_to_drop:
- mls_number = df.loc[index, 'mls_number']
- logger.success(f"Removed {mls_number} (Index: {index}) from the dataframe because the listing has expired.")
-
- logger.info(f"Total expired listings removed: {expired_count}")
-
- # Drop the rows from the DataFrame and return the modified DataFrame
- df_dropped_expired = df.drop(indexes_to_drop)
- return df_dropped_expired
\ No newline at end of file
+ indexes_to_drop = []
+
+ for row in df.itertuples():
+ listing_url = str(getattr(row, 'listing_url', ''))
+ mls_number = str(getattr(row, 'mls_number', ''))
+
+ # Check if the listing is expired on BHHS
+ if 'bhhscalifornia.com' in listing_url:
+ is_expired = check_expired_listing_bhhs(listing_url, mls_number)
+ if is_expired:
+ indexes_to_drop.append(row.Index)
+ logger.success(f"Removed MLS {mls_number} (Index: {row.Index}) from the DataFrame because the listing has expired on BHHS.")
+ # Check if the listing has been sold on The Agency
+ elif 'theagencyre.com' in listing_url:
+ is_sold = check_expired_listing_theagency(listing_url, mls_number)
+ if is_sold:
+ indexes_to_drop.append(row.Index)
+ logger.success(f"Removed MLS {mls_number} (Index: {row.Index}) from the DataFrame because the listing has been sold on The Agency.")
+
+ inactive_count = len(indexes_to_drop)
+ logger.info(f"Total inactive listings removed: {inactive_count}")
+
+ df_active = df.drop(indexes_to_drop)
+ return df_active.reset_index(drop=True)
+
+def update_dataframe_with_listing_data(
+ df: pd.DataFrame, imagekit_instance
+) -> pd.DataFrame:
+ """
+ Updates the DataFrame with listing date, MLS photo, and listing URL by scraping BHHS and using The Agency's API.
+
+ Parameters:
+ df (pd.DataFrame): The DataFrame to update.
+ imagekit_instance: The ImageKit instance for image transformations.
+
+ Returns:
+ pd.DataFrame: The updated DataFrame.
+ """
+ for row in df.itertuples():
+ mls_number = row.mls_number
+ try:
+ webscrape = webscrape_bhhs(
+ url=f"https://www.bhhscalifornia.com/for-lease/{mls_number}-t_q;/",
+ row_index=row.Index,
+ mls_number=mls_number,
+ total_rows=len(df)
+ )
+
+ if not all(webscrape):
+ logger.warning(f"BHHS did not return complete data for MLS {mls_number}. Trying The Agency.")
+ agency_data = fetch_the_agency_data(
+ mls_number,
+ row_index=row.Index,
+ total_rows=len(df),
+ full_street_address=row.full_street_address
+ )
+
+ if agency_data and any(agency_data):
+ listed_date, listing_url, mls_photo = agency_data
+ if listed_date:
+ df.at[row.Index, 'listed_date'] = listed_date
+ if listing_url:
+ df.at[row.Index, 'listing_url'] = listing_url
+ if mls_photo:
+ df.at[row.Index, 'mls_photo'] = imagekit_transform(
+ mls_photo,
+ mls_number,
+ imagekit_instance=imagekit_instance
+ )
+ else:
+ logger.warning(f"No photo URL found for MLS {mls_number} from The Agency.")
+ else:
+ logger.warning(f"The Agency returned no usable data for MLS {mls_number}.")
+ else:
+ df.at[row.Index, 'listed_date'] = webscrape[0]
+ df.at[row.Index, 'mls_photo'] = imagekit_transform(
+ webscrape[1],
+ mls_number,
+ imagekit_instance=imagekit_instance
+ )
+ df.at[row.Index, 'listing_url'] = webscrape[2]
+ except Exception as e:
+ logger.error(f"Error processing MLS {mls_number} at index {row.Index}: {e}")
+ return df
\ No newline at end of file
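Taken together, these two synchronous helpers replace the old `asyncio`/`AsyncLimiter` pipeline. A minimal usage sketch, assuming an `imagekitio` client and the lease parquet from this repo (credentials are placeholders):

```python
from functions.dataframe_utils import (
    remove_inactive_listings,
    update_dataframe_with_listing_data,
)
from imagekitio import ImageKit
import pandas as pd

# Placeholder credentials; real values come from the environment.
imagekit = ImageKit(private_key="...", public_key="...", url_endpoint="...")

df = pd.read_parquet("assets/datasets/lease.parquet")
df = update_dataframe_with_listing_data(df, imagekit_instance=imagekit)
df = remove_inactive_listings(df)  # drops expired BHHS and sold Agency rows
```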
diff --git a/functions/geocoding_utils.py b/functions/geocoding_utils.py
index abd0b782..773d249c 100644
--- a/functions/geocoding_utils.py
+++ b/functions/geocoding_utils.py
@@ -66,39 +66,39 @@ def fetch_missing_city(address: str, geolocator: GoogleV3) -> Optional[str]:
return city
-def return_postalcode(address: str, geolocator: GoogleV3) -> Optional[Union[int, type(pd.NA)]]:
+def return_zip_code(address: str, geolocator: GoogleV3) -> Optional[str]:
"""
- Fetches the postal code for a given short address using forward and reverse geocoding.
-
+ Fetches the postal code for a given address using geocoding.
+
Parameters:
- address (str): The short address.
- geolocator (GoogleV3): An instance of a GoogleV3 geocoding class.
-
+ address (str): The full street address.
+ geolocator (GoogleV3): An instance of the GoogleV3 geocoding class.
+
Returns:
- Optional[Union[int, type(pd.NA)]]: The postal code as an integer, or pd.NA if unsuccessful.
+ Optional[str]: The postal code as a string, or None if unsuccessful.
"""
- # Initialize postalcode variable
postalcode = None
try:
- geocode_info = geolocator.geocode(address, components={'administrative_area': 'CA', 'country': 'US'})
- components = geolocator.geocode(f"{geocode_info.latitude}, {geocode_info.longitude}").raw['address_components']
-
- # Create a dataframe from the list of dictionaries
- components_df = pd.DataFrame(components)
-
- # Iterate through rows to find the postal code
- for row in components_df.itertuples():
- if row.types == ['postal_code']:
- postalcode = int(row.long_name)
-
- logger.info(f"Fetched postal code {postalcode} for {address}.")
- except AttributeError:
- logger.warning(f"Geocoding returned no results for {address}.")
- return pd.NA
+ geocode_info = geolocator.geocode(
+ address, components={'administrative_area': 'CA', 'country': 'US'}
+ )
+ if geocode_info:
+ raw = geocode_info.raw['address_components']
+ # Find the 'postal_code'
+ postalcode = next(
+ (addr['long_name'] for addr in raw if 'postal_code' in addr['types']),
+ None
+ )
+ if postalcode:
+ logger.info(f"Fetched zip code ({postalcode}) for {address}.")
+ else:
+ logger.warning(f"No postal code found in geocoding results for {address}.")
+ else:
+ logger.warning(f"Geocoding returned no results for {address}.")
except Exception as e:
- logger.warning(f"Couldn't fetch postal code for {address} because {e}.")
- return pd.NA
+ logger.warning(f"Couldn't fetch zip code for {address} because of {e}.")
+ postalcode = None
return postalcode
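The rename also changes the return contract: callers now get a string ZIP code or `None` rather than an `int` or `pd.NA`. A short sketch of the new calling convention, assuming a configured GoogleV3 geocoder:

```python
from functions.geocoding_utils import return_zip_code
from geopy.geocoders import GoogleV3

g = GoogleV3(api_key="...")  # placeholder key
zip_code = return_zip_code("118 S Cordova ST #B, ALHAMBRA 91801", geolocator=g)
if zip_code is None:
    # Handle the missing value; tests against pd.NA no longer apply.
    ...
```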
diff --git a/functions/webscraping_utils.py b/functions/webscraping_utils.py
index 6fd167de..aa689eeb 100644
--- a/functions/webscraping_utils.py
+++ b/functions/webscraping_utils.py
@@ -1,115 +1,327 @@
-from aiolimiter import AsyncLimiter
from bs4 import BeautifulSoup
+from datetime import date, datetime, timezone
from loguru import logger
from typing import Tuple, Optional
-import asyncio
-import httpx
import pandas as pd
import re
import requests
import sys
+import time
# Initialize logging
-logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="INFO")
+logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="DEBUG")
-# Limit to 1 request per second
-limiter = AsyncLimiter(1, 1)
-async def check_expired_listing(url: str, mls_number: str) -> bool:
+def check_expired_listing_bhhs(url: str, mls_number: str) -> bool:
"""
- Checks if a listing has expired based on the presence of a specific HTML element, asynchronously.
-
+ Checks if a BHHS listing has expired by looking for a specific message on the page.
+
Parameters:
url (str): The URL of the listing to check.
- mls_number (str): The MLS number of the listing.
-
+ mls_number (str): The MLS number of the listing.
+
Returns:
bool: True if the listing has expired, False otherwise.
"""
headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35"
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en-US,en;q=0.5',
+ 'Accept-Encoding': 'gzip, deflate, br, zstd',
+ 'Connection': 'keep-alive',
+ 'Upgrade-Insecure-Requests': '1',
+ 'Pragma': 'no-cache',
+ 'Cache-Control': 'no-cache',
}
try:
- async with limiter:
- async with httpx.AsyncClient(timeout=10) as client:
- response = await client.get(url, headers=headers)
- response.raise_for_status()
-
+ response = requests.get(url, headers=headers, timeout=10)
+ response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
-
- description = soup.find('div', class_='page-description').text
- cleaned_description = " ".join(description.split())
-
- return bool(cleaned_description)
-
- except httpx.TimeoutException:
- logger.warning(f"Timeout occurred while checking if the listing for {mls_number} has expired.")
- except httpx.HTTPStatusError as h:
- if h.response.status_code == 429:
- retry_after = int(h.response.headers.get("Retry-After", 60)) # Use a default retry after 60 seconds if header is missing
- logger.warning(f"Rate limit exceeded, retrying after {retry_after} seconds.")
- await asyncio.sleep(retry_after)
- return await check_expired_listing(url, mls_number) # Retry the request
- else:
- logger.warning(f"HTTP error {h.response.status_code} occurred while checking if the listing for {mls_number} has expired. {h.response.text}")
- except AttributeError:
- # This occurs if the 'page-description' div is not found, meaning the listing hasn't expired
+
+ # Look for the message indicating the listing is no longer active
+ description_div = soup.find('div', class_='page-description')
+ if description_div:
+ description_text = " ".join(description_div.text.split())
+ if "We're sorry, the listing you are looking for is no longer active." in description_text:
+ return True
return False
+
+ except requests.Timeout:
+ logger.warning(f"Timeout occurred while checking if the listing for {mls_number} has expired.")
+ except requests.HTTPError as e:
+ logger.error(f"HTTP error occurred for MLS {mls_number}: {e}")
except Exception as e:
- logger.warning(f"Couldn't detect if the listing for {mls_number} has expired because {e}.")
+ logger.error(f"An unexpected error occurred for MLS {mls_number}: {e}")
return False
-async def webscrape_bhhs(url: str, row_index: int, mls_number: str, total_rows: int) -> Tuple[Optional[pd.Timestamp], Optional[str], Optional[str]]:
+def check_expired_listing_theagency(listing_url: str, mls_number: str, board_code: str = 'clr') -> bool:
"""
- Asynchronously scrapes a BHHS page to fetch the listing URL, photo, and listed date.
+ Checks if a listing has been sold based on the 'IsSold' key from The Agency API.
+
+ Parameters:
+ listing_url (str): The URL of the listing to check.
+ mls_number (str): The MLS number of the listing.
+ board_code (str, optional): The board code extracted from the listing URL or a default value.
+
+ Returns:
+ bool: True if the listing has been sold, False otherwise.
"""
+ # Try to extract the board code from the listing_url if it varies
+ try:
+ pattern = r'https://.*?idcrealestate\.com/.*?/(?P<board_code>\w+)/'
+ match = re.search(pattern, listing_url)
+ if match:
+ board_code = match.group('board_code')
+ else:
+ # Use the default board_code provided in the function parameter
+ pass # board_code remains as provided
+ except Exception as e:
+ logger.warning(f"Could not extract board code from listing URL: {listing_url}. Error: {e}")
+
+ api_url = f'https://search-service.idcrealestate.com/api/property/en_US/d4/sold-detail/{board_code}/{mls_number}'
headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35"
+ "User-Agent": "Mozilla/5.0",
+ "Accept": "*/*",
+ "Accept-Language": "en-US,en;q=0.5",
+ "Content-Type": "application/json",
+ "Referer": "https://www.theagencyre.com/",
+ "X-Tenant": "QUdZfFBST0R8Q09NUEFOWXwx",
+ "Origin": "https://www.theagencyre.com",
+ "Connection": "keep-alive",
}
try:
- async with httpx.AsyncClient(timeout=5, follow_redirects=True) as client:
- response = await client.get(url, headers=headers)
- response.raise_for_status()
-
- # Check if a redirect has occurred
- #if response.history:
- # logger.info(f"Redirected from {url} to {response.url} for {mls_number}.")
-
- # Successful HTTP request
- soup = BeautifulSoup(response.text, 'html.parser')
- listed_date, photo, link = None, None, None
-
- # Example parsing (ensure to adjust based on actual HTML structure)
- link_tag = soup.find('a', class_='btn cab waves-effect waves-light btn-details show-listing-details')
- if link_tag and 'href' in link_tag.attrs:
- link = f"https://www.bhhscalifornia.com{link_tag['href']}"
-
- photo_tag = soup.find('a', class_='show-listing-details')
- if photo_tag and photo_tag.find('img'):
- photo = photo_tag.find('img')['src']
-
- date_tag = soup.find('p', class_='summary-mlsnumber')
- if date_tag:
- listed_date_text = date_tag.text.split()[-1]
- listed_date = pd.Timestamp(listed_date_text)
-
- return listed_date, photo, link
-
- except httpx.TimeoutException:
- logger.warning(f"Timeout occurred while scraping BHHS page for {mls_number}.")
- except httpx.HTTPStatusError as h:
- if h.response.status_code == 429:
- retry_after = int(h.response.headers.get("Retry-After", 60)) # Default to 60 seconds
- logger.warning(f"Rate limit exceeded for {mls_number}, retrying after {retry_after} seconds.")
- await asyncio.sleep(retry_after)
- return await webscrape_bhhs(url, row_index, mls_number, total_rows) # Retry the request
- else:
- logger.warning(f"HTTP error {h.response.status_code} occurred while scraping BHHS page for {mls_number}.")
+ response = requests.get(api_url, headers=headers)
+ response.raise_for_status()
+ data = response.json()
+ is_sold = data.get('IsSold', False)
+ if is_sold:
+ logger.debug(f"Listing {mls_number} has been sold.")
+ return is_sold
+ except requests.HTTPError as e:
+ logger.error(f"HTTP error occurred while checking if the listing for MLS {mls_number} has been sold: {e}")
+ except Exception as e:
+ logger.error(f"An error occurred while checking if the listing for MLS {mls_number} has been sold: {e}")
+
+ return False
+
+def webscrape_bhhs(url: str, row_index: int, mls_number: str, total_rows: int) -> Tuple[Optional[pd.Timestamp], Optional[str], Optional[str]]:
+ """
+ Scrapes the BHHS website for listing details.
+
+ Parameters:
+ url (str): The URL of the listing to scrape.
+ row_index (int): The current row index being processed.
+ mls_number (str): The MLS number of the listing.
+ total_rows (int): The total number of rows to process.
+
+ Returns:
+ Tuple[Optional[pd.Timestamp], Optional[str], Optional[str]]:
+ - listed_date (pd.Timestamp): The listing date if found.
+ - photo (str): The URL of the listing photo if found.
+ - link (str): The detailed listing URL if found.
+ Returns (None, None, None) if data is not found or an error occurs.
+ """
+ logger.info(f"Scraping BHHS page for {mls_number} (row {row_index + 1} of {total_rows}).")
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en-US,en;q=0.5',
+ 'Accept-Encoding': 'gzip, deflate, br, zstd',
+ 'Connection': 'keep-alive',
+ 'Upgrade-Insecure-Requests': '1',
+ 'Pragma': 'no-cache',
+ 'Cache-Control': 'no-cache',
+ }
+ try:
+ response = requests.get(url, headers=headers, timeout=10)
+ response.raise_for_status()
+ soup = BeautifulSoup(response.text, 'html.parser')
+
+ # Initialize variables
+ listed_date = None
+ photo = None
+ link = None
+
+ # Extract the detailed listing URL
+ link_tag = soup.find('a', class_='btn cab waves-effect waves-light btn-details show-listing-details')
+ if link_tag and 'href' in link_tag.attrs:
+ link = f"https://www.bhhscalifornia.com{link_tag['href']}"
+
+ # Extract the photo URL
+ photo_tag = soup.find('a', class_='show-listing-details')
+ if photo_tag and photo_tag.find('img'):
+ photo = photo_tag.find('img')['src']
+
+ # Extract the listed date
+ date_tag = soup.find('p', class_='summary-mlsnumber')
+ if date_tag:
+ listed_date_text = date_tag.text.split()[-1]
+ listed_date = pd.Timestamp(listed_date_text)
+
+ return listed_date, photo, link
+
+ except requests.HTTPError as e:
+ logger.warning(f"HTTP error occurred while scraping BHHS page for {mls_number}: {e}")
except Exception as e:
- logger.warning(f"Error scraping BHHS page for {mls_number}: {e}.")
+ logger.warning(f"Error scraping BHHS page for {mls_number}: {e}")
+
+ return None, None, None
+
+def extract_street_name(full_street_address: str) -> Optional[str]:
+ """
+ Extracts the street name from a full street address.
+
+ This function handles addresses with or without unit numbers and directional indicators.
+ It splits the address to isolate the street name component.
+
+ Args:
+ full_street_address (str): The full street address (e.g., "118 S Cordova ST #B, ALHAMBRA 91801")
+ Returns:
+ Optional[str]: The extracted street name in lowercase if successful; otherwise, None.
+ """
+ # Split the address at the comma
+ address_first_part = full_street_address.split(',')[0].strip()
+ # Remove unit numbers (e.g., #A, #1/2)
+ address_first_part = re.sub(r'#\S+', '', address_first_part)
+ # Split the first part by spaces
+ tokens = address_first_part.split()
+ # Check if tokens are sufficient
+ if len(tokens) >= 2:
+ possible_direction = tokens[1].upper()
+ if possible_direction in ['N', 'S', 'E', 'W', 'NE', 'NW', 'SE', 'SW']:
+ # Direction present
+ if len(tokens) >= 3:
+ street_name = tokens[2]
+ else:
+ return None
+ else:
+ # No direction
+ street_name = tokens[1]
+ return street_name.lower()
+ else:
+ # Can't extract street name
+ return None
+
+def extract_zip_code(full_street_address: str) -> Optional[str]:
+ """
+ Extracts the ZIP code from a full street address.
+
+ Uses regular expressions to find a 5-digit ZIP code, optionally handling ZIP+4 formats.
+
+ Args:
+ full_street_address (str): The full street address (e.g., "118 S Cordova ST #B, ALHAMBRA 91801")
+
+ Returns:
+ Optional[str]: The extracted ZIP code if successful; otherwise, None.
+ """
+ match = re.search(r'\b\d{5}(?:-\d{4})?\b', full_street_address)
+ if match:
+ return match.group()
+ else:
+ return None
+
+def fetch_the_agency_data(mls_number: str, row_index: int, total_rows: int, full_street_address: str) -> Tuple[Optional[date], Optional[str], Optional[str]]:
+ """
+ Fetches property data for a given MLS number from The Agency API and scrapes the detail page for the image source.
+
+ Parameters:
+ mls_number (str): The MLS number of the property to fetch.
+ row_index (int): The row index for logging or debugging purposes.
+ total_rows (int): Total rows being processed for progress indication.
+ full_street_address (str): The full street address of the property (e.g., "118 S Cordova ST #B, ALHAMBRA 91801").
+
+ Returns:
+ Tuple[Optional[date], Optional[str], Optional[str]]:
+ - The listing date (as a datetime.date object) if found; otherwise, None.
+ - The detail URL of the property if found; otherwise, None.
+ - The first property image URL if found; otherwise, None.
+ Returns (None, None, None) if no matching property is found or if an error occurs.
+ """
+ url = "https://search-service.idcrealestate.com/api/property"
+ headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
+ "Accept": "*/*",
+ "Accept-Language": "en-US,en;q=0.5",
+ "Accept-Encoding": "gzip, deflate, br, zstd",
+ "X-Tenant": "AGY",
+ "X-TenantMode": "Production",
+ "X-TenantHost": "theagencyre.com",
+ "Content-Type": "application/json",
+ "Origin": "https://www.theagencyre.com",
+ "Connection": "keep-alive",
+ "Referer": "https://www.theagencyre.com/",
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "cross-site",
+ "Priority": "u=4",
+ "Pragma": "no-cache",
+ "Cache-Control": "no-cache"
+ }
+ normalized_mls_number = mls_number.replace("-", "").replace("_", "")
+ payload = {
+ "urlquery": f"/rent/search-{normalized_mls_number}/rental-true",
+ "countrystate": "",
+ "zoom": 21
+ }
+ #logger.debug(payload)
+ try:
+ response = requests.post(url, headers=headers, json=payload)
+ response.raise_for_status()
+ #logger.debug(response.text)
+
+ # Parse JSON response
+ data = response.json()
+
+ # Extract the street name and zip code
+ street_name = extract_street_name(full_street_address)
+ if not street_name:
+ logger.warning(f"Could not extract street name from address: {full_street_address}")
+ return None, None, None
+ logger.debug(f"Extracted street name: {street_name}")
+
+ zip_code = extract_zip_code(full_street_address)
+ if not zip_code:
+ logger.warning(f"Could not extract zip code from address: {full_street_address}")
+ return None, None, None
+ logger.debug(f"Extracted zip code: {zip_code}")
+
+ # Filter items based on the street name and zip code
+ filtered_items = [
+ item for item in data.get("items", [])
+ if street_name in item.get("fullAddress", "").lower() and zip_code in item.get("fullAddress", "").lower()
+ ]
+
+ if filtered_items:
+ if len(filtered_items) > 1:
+ logger.warning(f"Multiple properties found for street name '{street_name}' and zip code '{zip_code}'. Using the first one.")
+ item = filtered_items[0]
+ list_date_timestamp = int(item.get("listDate", 0))
+ list_date = datetime.fromtimestamp(list_date_timestamp, tz=timezone.utc).date()
+ detail_url = f"https://www.theagencyre.com{item.get('detailUrl', '')}"
+ detail_response = requests.get(detail_url, headers=headers)
+ detail_response.raise_for_status()
+ detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
+ img_tag = detail_soup.find("img", {"data-src": lambda x: x and x.endswith("_1.jpg")})
+ img_src = img_tag["data-src"] if img_tag else None
+ logger.success(f"Successfully fetched {list_date} {detail_url} {img_src} for MLS {mls_number}")
+ return list_date, detail_url, img_src
+
+ logger.warning(f"No property found on The Agency with street name '{street_name}' and zip code '{zip_code}'.")
+ return None, None, None
+
+ except requests.HTTPError as e:
+ logger.error(f"HTTP error occurred: {e}")
+ logger.debug(f"Response content: {e.response.text}")
+ except requests.RequestException as e:
+ logger.error(f"Request error occurred: {e}")
+ except Exception as e:
+ logger.error(f"An unexpected error occurred: {e}")
return None, None, None
def update_hoa_fee(df: pd.DataFrame, mls_number: str) -> None:
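Worked examples for the two new address helpers, using the address format their docstrings cite (expected outputs follow from the parsing rules above):

```python
from functions.webscraping_utils import extract_street_name, extract_zip_code

addr = "118 S Cordova ST #B, ALHAMBRA 91801"
extract_street_name(addr)  # -> "cordova" (unit "#B" stripped, "S" treated as a directional)
extract_zip_code(addr)     # -> "91801"
extract_street_name("5 Main St, LOS ANGELES 90001")  # -> "main" (no directional token)
```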
diff --git a/lease_dataframe.py b/lease_dataframe.py
index 00101d78..a0f81f39 100644
--- a/lease_dataframe.py
+++ b/lease_dataframe.py
@@ -1,5 +1,5 @@
from dotenv import load_dotenv, find_dotenv
-from functions.dataframe_utils import remove_expired_listings
+from functions.dataframe_utils import remove_inactive_listings, update_dataframe_with_listing_data
from functions.geocoding_utils import *
from functions.mls_image_processing_utils import *
from functions.noise_level_utils import *
@@ -36,7 +36,7 @@
# Load all CSVs and concat into one dataframe
# https://stackoverflow.com/a/21232849
path = "."
-all_files = glob.glob(os.path.join(path, "*lacountyrentals*.csv"))
+all_files = glob.glob(os.path.join(path, "*Renter*.csv"))
df = pd.concat((pd.read_csv(f, float_precision="round_trip", skipinitialspace=True) for f in all_files), ignore_index=True)
pd.set_option("display.precision", 10)
@@ -45,136 +45,153 @@
# https://stackoverflow.com/a/36082588
df.columns = df.columns.str.strip()
-# Standardize the column names by renaminmg them
+# Convert all column names to lowercase
+df.columns = df.columns.str.lower()
+
+# Standardize the column names by renaming them
# https://stackoverflow.com/a/65332240
# Define a renaming dictionary based on patterns
rename_dict = {
- 'Garage Spaces': 'garage_spaces',
- 'List Office Phone': 'phone_number',
- 'Listing': 'mls_number',
- 'St Name': 'street_name',
- 'St#': 'street_number',
- 'Sub Type': 'subtype',
- 'Yr': 'YrBuilt',
+ 'agent': 'phone_number',
+ 'allowed': 'pet_policy',
+ 'baths': 'bathrooms',
+ 'bedrooms': 'bedrooms',
+ 'city': 'city',
+ 'furnished': 'furnished',
+ 'key': 'key_deposit',
+ 'laundry': 'laundry',
+ 'list': 'list_price',
+ 'lot': 'lot_size',
+ 'mls': 'mls_number',
+ 'name': 'street_name',
+ 'other': 'other_deposit',
+ 'pet deposit': 'pet_deposit',
+ 'prking': 'parking_spaces',
+ 'security': 'security_deposit',
+ 'sqft': 'sqft',
+ 'square': 'ppsqft',
+ 'st #': 'street_number',
+ 'sub': 'subtype',
+ 'terms': 'terms',
+ 'yr': 'year_built',
+ 'zip': 'zip_code',
}
-# Check if 'Price Per' column exists and add to renaming dictionary
-if any(col.startswith('Price Per') for col in df.columns):
- rename_dict['Price Per'] = 'ppsqft'
-
-# Rename columns
+# Rename columns based on substrings in the column names
df = df.rename(columns=lambda c: next((v for k, v in rename_dict.items() if k in c), c))
-# Special case for list price due to additional condition
-df = df.rename(columns=lambda c: 'list_price' if c.startswith('List') and c.endswith('Price') else c)
+# Strip the leading house number from the street_name column
+df['street_name'] = df['street_name'].str.replace(r'^\d+\s*', '', regex=True)
# Drop all rows with misc/irrelevant data
df.dropna(subset=['street_name'], inplace=True)
# Columns to clean
-cols = ['DepositKey', 'DepositOther', 'DepositPets', 'DepositSecurity', 'list_price', 'Sqft', 'YrBuilt']
-if 'ppsqft' in df.columns:
- cols.append('ppsqft')
+cols = ['key_deposit', 'other_deposit', 'security_deposit', 'list_price', 'pet_deposit']
+# Strip dollar signs and commas, convert to numeric, round, and cast to a nullable integer type (missing values become pd.NA)
+df[cols] = (
+ df[cols]
+ .replace({r'\$': '', ',': ''}, regex=True)
+ .apply(pd.to_numeric, errors='coerce')
+ .round(0) # Round to ensure values are integers
+ .astype(pd.UInt16Dtype())
+)
-# Remove all non-numeric characters, convert to numeric, and cast to Nullable Integer Type
-df[cols] = df[cols].replace(to_replace='[^\d]', value='', regex=True).apply(pd.to_numeric, errors='coerce').astype(pd.Int64Dtype())
+# Cast 'sqft' to UInt32
+df['sqft'] = df['sqft'].replace({',': ''}, regex=True).astype(pd.UInt32Dtype())
+
+# Convert other columns to appropriate data types
+df = df.astype({
+ 'year_built': 'UInt16',
+ 'parking_spaces': 'UInt8',
+ 'street_number': 'string'
+})
+
+# Handle lot_size column separately by removing commas, converting to numeric, and then to UInt32
+df['lot_size'] = (
+ df['lot_size']
+ .replace({',': ''}, regex=True)
+ .apply(pd.to_numeric, errors='coerce')
+ .astype(pd.UInt32Dtype())
+)
-# Check if 'ppsqft' column exists
-if 'ppsqft' not in df.columns:
- # If it has a different name, replace 'Sqft' below with the correct column name
- df['ppsqft'] = (df['list_price'] / df['Sqft']).round(2)
-
-# Fetch missing city names
-for row in df.loc[(df['City'].isnull()) & (df['PostalCode'].notnull())].itertuples():
- df.at[row.Index, 'City'] = fetch_missing_city(f"{row.street_number} {row.street_name} {str(row.PostalCode)}", geolocator=g)
+# Strip the leading $ sign and commas, then cast ppsqft as a nullable float
+df['ppsqft'] = df['ppsqft'].replace(to_replace=r'[^\d.]', value='', regex=True).astype(pd.Float32Dtype())
# Columns to be cast as strings
-cols = ['street_number', 'street_name', 'City', 'mls_number', 'SeniorCommunityYN', 'Furnished', 'LaundryFeatures', 'subtype']
-
-for col in cols:
- # If the column exists, replace empty strings with NaNs
- if col in df.columns:
- df[col] = df[col].replace(r'^\s*$', pd.NA, regex=True)
- # If the column does not exist, create it and fill it with NaNs
- else:
- df[col] = pd.NA
- # Cast the column as a string type (NA values will remain as NA)
- df[col] = df[col].astype(pd.StringDtype())
+cols = ['mls_number', 'phone_number', 'street_name', 'zip_code', 'city']
+df[cols] = df[cols].astype(pd.StringDtype())
+
+# Columns to be cast as categories
+cols = ['pet_policy', 'furnished', 'subtype', 'terms', 'laundry']
+df[cols] = df[cols].astype(pd.CategoricalDtype())
+
+# Extract total bathrooms and bathroom types (Full, Three-Quarter, Half, Quarter)
+df[['total_bathrooms', 'full_bathrooms', 'three_quarter_bathrooms', 'half_bathrooms', 'quarter_bathrooms']] = df['bathrooms'].str.extract(r'(\d+\.\d+)\s\((\d+)\s(\d+)\s(\d+)\s(\d+)\)').astype(float)
+
+# Convert bathroom columns to nullable integer type
+for col in ['total_bathrooms', 'full_bathrooms', 'three_quarter_bathrooms', 'half_bathrooms', 'quarter_bathrooms']:
+ df[col] = df[col].astype(pd.UInt8Dtype())
+
+# Drop the original bathrooms column (the renamed 'Baths(FTHQ)') since we've extracted the data we need
+df.drop(columns=['bathrooms'], inplace=True)
+
+# Convert bedrooms to nullable integer type
+df['bedrooms'] = df['bedrooms'].astype(pd.UInt8Dtype())
+
+# Fetch missing city names
+for row in df.loc[(df['city'].isnull()) & (df['zip_code'].notnull())].itertuples():
+ df.at[row.Index, 'city'] = fetch_missing_city(f"{row.street_number} {row.street_name} {str(row.zip_code)}", geolocator=g)
# Create a new column with the Street Number & Street Name
-df["short_address"] = df["street_number"] + ' ' + df["street_name"] + ',' + ' ' + df['City']
+df["short_address"] = (df["street_number"].astype(str) + ' ' + df["street_name"] + ', ' + df['city']).astype(pd.StringDtype())
# Filter the dataframe and return only rows with a NaN postal code
# For some reason some Postal Codes are "Assessor" :| so we need to include that string in an OR operation
# Then iterate through this filtered dataframe and input the right info we get using geocoding
-for row in df.loc[(df['PostalCode'].isnull()) | (df['PostalCode'] == 'Assessor')].itertuples():
+for row in df.loc[(df['zip_code'].isnull()) | (df['zip_code'] == 'Assessor')].itertuples():
short_address = df.at[row.Index, 'short_address']
- missing_postalcode = return_postalcode(short_address, geolocator=g)
- df.at[row.Index, 'PostalCode'] = missing_postalcode
+ missing_zip_code = return_zip_code(short_address, geolocator=g)
+ df.at[row.Index, 'zip_code'] = missing_zip_code
-df['PostalCode'] = df['PostalCode'].apply(pd.to_numeric, errors='coerce').astype(pd.Int64Dtype())
+df['zip_code'] = df['zip_code'].astype(pd.StringDtype())
# Tag each row with the date it was processed
for row in df.itertuples():
df.at[row.Index, 'date_processed'] = pd.Timestamp.today()
# Create a new column with the full street address
-# Also strip whitespace from the St Name column
-# Convert the postal code into a string so we can combine string and int
-# https://stackoverflow.com/a/11858532
-df["full_street_address"] = df["street_number"] + ' ' + df["street_name"].str.strip() + ',' + ' ' + df['City'] + ' ' + df["PostalCode"].astype(str)
+df["full_street_address"] = (
+ df["street_number"].astype(str) + ' ' +
+ df["street_name"].str.strip() + ', ' +
+ df['city'] + ' ' +
+ df["zip_code"].astype(str)
+).astype(pd.StringDtype())
-# Iterate through the dataframe and get the listed date and photo for rows
-for row in df.itertuples():
- mls_number = row[1]
- webscrape = asyncio.run(webscrape_bhhs(url=f"https://www.bhhscalifornia.com/for-lease/{mls_number}-t_q;/", row_index=row.Index, mls_number=mls_number, total_rows=len(df)))
- df.at[row.Index, 'listed_date'] = webscrape[0]
- df.at[row.Index, 'mls_photo'] = imagekit_transform(webscrape[1], row[1], imagekit_instance=imagekit)
- df.at[row.Index, 'listing_url'] = webscrape[2]
+# Iterate through the dataframe and get the listed date and photo for rows
+df = update_dataframe_with_listing_data(df, imagekit_instance=imagekit)
# Iterate through the dataframe and fetch coordinates for rows
for row in df.itertuples():
coordinates = return_coordinates(address=row.full_street_address, row_index=row.Index, geolocator=g, total_rows=len(df))
- df.at[row.Index, 'Latitude'] = coordinates[0]
- df.at[row.Index, 'Longitude'] = coordinates[1]
-
-#df = update_howloud_scores(df)
-
-# Split the Bedroom/Bathrooms column into separate columns based on delimiters
-# Based on the example given in the spreadsheet: 2 (beds) / 1 (total baths),1 (full baths) ,0 (half bath), 0 (three quarter bath)
-# Realtor logic based on https://www.realtor.com/advice/sell/if-i-take-out-the-tub-does-a-bathroom-still-count-as-a-full-bath/
-# TIL: A full bathroom is made up of four parts: a sink, a shower, a bathtub, and a toilet. Anything less than thpdat, and you can’t officially consider it a full bath.
-df['Bedrooms'] = df['Br/Ba'].str.split('/', expand=True)[0]
-df['Total Bathrooms'] = (df['Br/Ba'].str.split('/', expand=True)[1]).str.split(',', expand=True)[0]
-df['Full Bathrooms'] = (df['Br/Ba'].str.split('/', expand=True)[1]).str.split(',', expand=True)[1]
-df['Half Bathrooms'] = (df['Br/Ba'].str.split('/', expand=True)[1]).str.split(',', expand=True)[2]
-df['Three Quarter Bathrooms'] = (df['Br/Ba'].str.split('/', expand=True)[1]).str.split(',', expand=True)[3]
-
-# Convert a few columns into int64
-# pd.to_numeric will convert into int64 or float64 automatically, which is cool
-# These columns are assumed to have NO MISSING DATA, so we can cast them as int64 instead of floats (ints can't handle NaNs)
-df['Bedrooms'] = df['Bedrooms'].apply(pd.to_numeric, errors='coerce')
-df['Total Bathrooms'] = df['Total Bathrooms'].apply(pd.to_numeric)
-# These columns should stay floats
-df['Latitude'] = df['Latitude'].apply(pd.to_numeric, errors='coerce')
-df['Longitude'] = df['Longitude'].apply(pd.to_numeric, errors='coerce')
-df['garage_spaces'] = df['garage_spaces'].astype('Float64')
+ df.at[row.Index, 'latitude'] = coordinates[0]
+ df.at[row.Index, 'longitude'] = coordinates[1]
-# Replace all empty values in the following columns with NaN and cast the column as dtype string
-# https://stackoverflow.com/a/47810911
-df.Terms = df.Terms.astype("string").replace(r'^\s*$', pd.NA, regex=True)
+# These columns should stay floats
+df['latitude'] = df['latitude'].apply(pd.to_numeric, errors='raise', downcast='float')
+df['longitude'] = df['longitude'].apply(pd.to_numeric, errors='raise', downcast='float')
## Laundry Features ##
# Replace all empty values in the following column with "Unknown" and cast the column as dtype string
-df.LaundryFeatures = df.LaundryFeatures.astype("string").replace(r'^\s*$', "Unknown", regex=True)
+df.laundry = df.laundry.astype("string").replace(r'^\s*$', "Unknown", regex=True)
# Fill in any NaNs in the Laundry column with "Unknown"
-df.LaundryFeatures = df.LaundryFeatures.fillna(value="Unknown")
-# Any string containing "Community" in the Laundry column should be replaced with "Community Laundry"
-df['LaundryFeatures'] = df['LaundryFeatures'].str.replace("Community", "Community Laundry")
-# Any string containing "Common" in the Laundry column should be replaced with "Community Laundry"
-df['LaundryFeatures'] = df['LaundryFeatures'].str.replace("Common", "Community Laundry")
-# Replace "Community Laundry Area" with "Community Laundry"
-df['LaundryFeatures'] = df['LaundryFeatures'].str.replace("Community Laundry Area", "Community Laundry")
+df.laundry = df.laundry.fillna(value="Unknown")
+# Replace various patterns in the Laundry column with "Community Laundry"
+df.laundry = df.laundry.str.replace(
+ r'Community Laundry Area|Laundry Area|Community|Common',
+ 'Community Laundry',
+ regex=True
+)
# Convert the listed date into DateTime and use the "mixed" format to handle the different date formats
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
@@ -183,30 +200,6 @@
# Convert date_processed into DateTime
df['date_processed'] = pd.to_datetime(df['date_processed'], errors='coerce', format='%Y-%m-%d')
-# Per CA law, ANY type of deposit is capped at rent * 3 months
-# It doesn't matter the type of deposit, they all have the same cap
-# Despite that, some landlords/realtors will list the property with an absurd deposit (100k? wtf) so let's rewrite those
-# Use numpy .values to rewrite anything greater than $18000 ($6000 rent * 3 months) into $18000
-# https://stackoverflow.com/a/54426197
-df['DepositSecurity'].values[df['DepositSecurity'] > 18000] = 18000
-df['DepositPets'].values[df['DepositPets'] > 18000] = 18000
-df['DepositOther'].values[df['DepositOther'] > 18000] = 18000
-df['DepositKey'].values[df['DepositKey'] > 18000] = 18000
-
-# Rewrite anything greater than 5000 square feet as NaN
-# Because there's no fucking way there's a RENTAL PROPERTY that is 5000+ sqft in this city
-# It clearly must be some kind of clerical error so a NaN (unknown) is more appropriate
-# All that being said, I should peruse new spreadsheets to make sure there isn't actually a valid property exceeds 5000 sqft
-df['Sqft'].values[df['Sqft'] > 5000] = pd.NA
-
-# Rewrite anything with >5 garage spaces as None
-df['garage_spaces'].values[df['garage_spaces'] > 5] = None
-
-# Keep rows with less than 6 bedrooms
-# 6 bedrooms and above are probably multi family investments and not actual rentals
-# They also skew the outliers, causing the sliders to go way up
-df = df[df.Bedrooms < 6]
-
# Reindex the dataframe
df.reset_index(drop=True, inplace=True)
@@ -221,25 +214,25 @@
# Drop any dupes again
df_combined = df_combined.drop_duplicates(subset=['mls_number'], keep="last")
# Iterate through the dataframe and drop rows with expired listings
-df_combined = asyncio.run(remove_expired_listings(df_combined, limiter))
+df_combined = remove_inactive_listings(df_combined)
# Reset the index
df_combined = df_combined.reset_index(drop=True)
# Filter the dataframe for rows outside of California
outside_ca_rows = df_combined[
- (df_combined['Latitude'] < 32.5) |
- (df_combined['Latitude'] > 42) |
- (df_combined['Longitude'] < -124) |
- (df_combined['Longitude'] > -114)
+ (df_combined['latitude'] < 32.5) |
+ (df_combined['latitude'] > 42) |
+ (df_combined['longitude'] < -124) |
+ (df_combined['longitude'] > -114)
]
total_outside_ca = len(outside_ca_rows)
counter = 0
for row in outside_ca_rows.itertuples():
counter += 1
- logger.warning(f"Row {counter} out of {total_outside_ca}: {row.mls_number} has coordinates {row.Latitude}, {row.Longitude} which is outside California. Re-geocoding {row.mls_number}...")
+ logger.warning(f"Row {counter} out of {total_outside_ca}: {row.mls_number} has coordinates {row.latitude}, {row.longitude} which is outside California. Re-geocoding {row.mls_number}...")
# Re-geocode the row
coordinates = return_coordinates(address=row.full_street_address, row_index=row.Index, geolocator=g, total_rows=len(df))
- df_combined.at[row.Index, 'Latitude'] = coordinates[0]
- df_combined.at[row.Index, 'Longitude'] = coordinates[1]
+ df_combined.at[row.Index, 'latitude'] = coordinates[0]
+ df_combined.at[row.Index, 'longitude'] = coordinates[1]
# Save the new combined dataframe
try:
df_combined.to_parquet(path="assets/datasets/lease.parquet")
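The bathroom split above hinges on a single regex over the raw `Baths(FTHQ)` string. An illustrative check, assuming raw values follow the `total (full three-quarter half quarter)` pattern the regex encodes:

```python
import pandas as pd

# Hypothetical raw value; the capture groups become total_bathrooms,
# full_bathrooms, three_quarter_bathrooms, half_bathrooms, quarter_bathrooms.
s = pd.Series(["2.00 (1 1 0 0)"])
print(s.str.extract(r'(\d+\.\d+)\s\((\d+)\s(\d+)\s(\d+)\s(\d+)\)'))
#       0  1  2  3  4
# 0  2.00  1  1  0  0
```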
diff --git a/pages/components.py b/pages/components.py
index a5ea9053..1abb45ab 100644
--- a/pages/components.py
+++ b/pages/components.py
@@ -105,7 +105,7 @@ def __init__(self, df):
# Initalize these first because they are used in other components
self.df = df
- self.df['LaundryCategory'] = self.df['LaundryFeatures'].apply(self.categorize_laundry_features)
+ self.df['laundry'] = self.df['laundry'].apply(self.categorize_laundry_features)
self.bathrooms_slider = self.create_bathrooms_slider()
self.bedrooms_slider = self.create_bedrooms_slider()
@@ -134,7 +134,7 @@ def __init__(self, df):
self.user_options_card = self.create_user_options_card()
def categorize_laundry_features(self, feature):
- if feature is None or feature in [np.nan, 'Unknown', '']:
+ if pd.isna(feature) or feature in ['Unknown', '']:
return 'Unknown'
if any(keyword in feature for keyword in ['In Closet', 'In Kitchen', 'In Garage', 'Inside', 'Individual Room']):
return 'In Unit'
@@ -197,9 +197,9 @@ def create_bedrooms_slider(self):
html.Div([
dcc.RangeSlider(
min=0,
- max=self.df['Bedrooms'].max(),
+ max=self.df['bedrooms'].max(),
step=1,
- value=[0, self.df['Bedrooms'].max()],
+ value=[0, self.df['bedrooms'].max()],
id='bedrooms_slider',
updatemode='mouseup',
tooltip={
@@ -225,9 +225,9 @@ def create_bathrooms_slider(self):
html.Div([
dcc.RangeSlider(
min=0,
- max=self.df['Total Bathrooms'].max(),
+ max=self.df['total_bathrooms'].max(),
step=1,
- value=[0, self.df['Total Bathrooms'].max()],
+ value=[0, self.df['total_bathrooms'].max()],
id='bathrooms_slider',
updatemode='mouseup',
tooltip={
@@ -252,9 +252,9 @@ def create_sqft_components(self):
]),
html.Div([
dcc.RangeSlider(
- min=self.df['Sqft'].min(),
- max=self.df['Sqft'].max(),
- value=[self.df['Sqft'].min(), self.df['Sqft'].max()],
+ min=self.df['sqft'].min(),
+ max=self.df['sqft'].max(),
+ value=[self.df['sqft'].min(), self.df['sqft'].max()],
id='sqft_slider',
updatemode='mouseup',
tooltip={
@@ -380,11 +380,23 @@ def create_pets_radio_button(self):
return pets_radio
def create_rental_terms_checklist(self):
- # Logic to calculate unique_terms
- unique_terms = pd.Series([term for sublist in self.df['Terms'].fillna('Unknown').str.split(',') for term in sublist]).unique()
+ # Add 'Unknown' to categories if necessary
+ if isinstance(self.df['terms'].dtype, pd.CategoricalDtype):
+ if 'Unknown' not in self.df['terms'].cat.categories:
+ self.df['terms'] = self.df['terms'].cat.add_categories('Unknown')
+
+ # Fill NaN values with 'Unknown'
+ terms_series = self.df['terms'].fillna('Unknown')
+
+ # Split terms and flatten the list
+ unique_terms = pd.Series([
+ term.strip() for sublist in terms_series.str.split(',')
+ if sublist for term in sublist
+ ]).unique()
+
unique_terms = sorted(unique_terms)
- # Define term_abbreviations and terms
+ # Define term abbreviations and labels
term_abbreviations = {
'12M': '12 Months',
'24M': '24 Months',
@@ -398,7 +410,8 @@ def create_rental_terms_checklist(self):
'VR': 'Vacation Rental',
'WK': 'Week-to-Week',
}
- terms = {k: term_abbreviations[k] for k in sorted(term_abbreviations)}
+
+ terms = {k: term_abbreviations.get(k, k) for k in unique_terms}
# Create the Dash component
rental_terms_checklist = html.Div([
@@ -410,33 +423,29 @@ def create_rental_terms_checklist(self):
dcc.Checklist(
id='terms_checklist',
options=[{'label': f"{terms[term]} ({term})", 'value': term} for term in terms],
- value=[term['value'] for term in [{'label': "Unknown" if pd.isnull(term) else term, 'value': "Unknown" if pd.isnull(term) else term} for term in unique_terms]],
- inputStyle={
- "margin-right": "5px",
- "margin-left": "5px"
- },
+ value=unique_terms, # Select all terms by default
+ inputStyle={"margin-right": "5px", "margin-left": "5px"},
inline=False
),
],
- id={'type': 'dynamic_output_div_lease', 'index': 'rental_terms'},
+ id={'type': 'dynamic_output_div_lease', 'index': 'rental_terms'},
),
],
- id='rental_terms_div'
+ id='rental_terms_div'
)
-
return rental_terms_checklist
def create_garage_spaces_components(self):
garage_spaces_components = html.Div([
html.Div([
- html.H5("Garage Spaces", style={'display': 'inline-block', 'margin-right': '10px'}),
+ html.H5("Parking Spaces", style={'display': 'inline-block', 'margin-right': '10px'}),
create_toggle_button(index='garage_spaces', initial_label="Hide", page_type='lease')
]),
html.Div([
dcc.RangeSlider(
min=0,
- max=self.df['garage_spaces'].max(),
- value=[0, self.df['garage_spaces'].max()],
+ max=self.df['parking_spaces'].max(),
+ value=[0, self.df['parking_spaces'].max()],
id='garage_spaces_slider',
updatemode='mouseup',
tooltip={
@@ -516,9 +525,9 @@ def create_year_built_components(self):
], style={'display': 'inline-block'}),
html.Div([
dcc.RangeSlider(
- min=self.df['YrBuilt'].min(),
- max=self.df['YrBuilt'].max(),
- value=[0, self.df['YrBuilt'].max()],
+ min=self.df['year_built'].min(),
+ max=self.df['year_built'].max(),
+ value=[0, self.df['year_built'].max()],
id='yrbuilt_slider',
updatemode='mouseup',
tooltip={
@@ -526,7 +535,7 @@ def create_year_built_components(self):
"always_visible": True
},
marks={
- float(self.df['YrBuilt'].min() + i*20): str(self.df['YrBuilt'].min() + i*20) for i in range(8)
+ float(self.df['year_built'].min() + i*20): str(self.df['year_built'].min() + i*20) for i in range(8)
}
),
dbc.Alert(
@@ -610,9 +619,9 @@ def create_security_deposit_components(self):
]),
html.Div([
dcc.RangeSlider(
- min=self.df['DepositSecurity'].min(),
- max=self.df['DepositSecurity'].max(),
- value=[self.df['DepositSecurity'].min(), self.df['DepositSecurity'].max()],
+ min=self.df['security_deposit'].min(),
+ max=self.df['security_deposit'].max(),
+ value=[self.df['security_deposit'].min(), self.df['security_deposit'].max()],
id='security_deposit_slider',
updatemode='mouseup',
tooltip={
@@ -662,9 +671,9 @@ def create_other_deposit_components(self):
]),
html.Div([
dcc.RangeSlider(
- min=self.df['DepositOther'].min(),
- max=self.df['DepositOther'].max(),
- value=[self.df['DepositOther'].min(), self.df['DepositOther'].max()],
+ min=self.df['other_deposit'].min(),
+ max=self.df['other_deposit'].max(),
+ value=[self.df['other_deposit'].min(), self.df['other_deposit'].max()],
id='other_deposit_slider',
updatemode='mouseup',
tooltip={
@@ -714,9 +723,9 @@ def create_pet_deposit_components(self):
]),
html.Div([
dcc.RangeSlider(
- min=self.df['DepositPets'].min(),
- max=self.df['DepositPets'].max(),
- value=[self.df['DepositPets'].min(), self.df['DepositPets'].max()],
+ min=self.df['pet_deposit'].min(),
+ max=self.df['pet_deposit'].max(),
+ value=[self.df['pet_deposit'].min(), self.df['pet_deposit'].max()],
id='pet_deposit_slider',
updatemode='mouseup',
tooltip={
@@ -766,9 +775,9 @@ def create_key_deposit_components(self):
]),
html.Div([
dcc.RangeSlider(
- min=self.df['DepositKey'].min(),
- max=self.df['DepositKey'].max(),
- value=[self.df['DepositKey'].min(), self.df['DepositKey'].max()],
+ min=self.df['key_deposit'].min(),
+ max=self.df['key_deposit'].max(),
+ value=[self.df['key_deposit'].min(), self.df['key_deposit'].max()],
id='key_deposit_slider',
updatemode='mouseup',
tooltip={
@@ -818,9 +827,9 @@ def create_key_deposit_components(self):
]),
html.Div([
dcc.RangeSlider(
- min=self.df['DepositKey'].min(),
- max=self.df['DepositKey'].max(),
- value=[self.df['DepositKey'].min(), self.df['DepositKey'].max()],
+ min=self.df['key_deposit'].min(),
+ max=self.df['key_deposit'].max(),
+ value=[self.df['key_deposit'].min(), self.df['key_deposit'].max()],
id='key_deposit_slider',
updatemode='mouseup',
tooltip={
@@ -863,8 +872,10 @@ def create_key_deposit_components(self):
return key_deposit_components
def create_laundry_checklist(self):
+ # Replace NaN values with 'Unknown' before sorting
+ laundry_series = self.df['laundry'].fillna('Unknown')
# Get unique laundry categories sorted alphabetically
- unique_categories = sorted(self.df['LaundryCategory'].unique())
+ unique_categories = sorted(laundry_series.unique())
# Create options for the checklist
laundry_options = [
@@ -962,7 +973,7 @@ def create_map(self):
id='map',
zoom=9,
minZoom=9,
- center=(self.df['Latitude'].mean(), self.df['Longitude'].mean()),
+ center=(self.df['latitude'].mean(), self.df['longitude'].mean()),
preferCanvas=True,
closePopupOnClick=True,
style={'width': '100%', 'height': '90vh', 'margin': "auto", "display": "inline-block"}
@@ -1138,7 +1149,7 @@ def create_bedrooms_slider(self):
# Title and toggle button
html.Div([
- html.H5("Bedrooms", style={'display': 'inline-block', 'margin-right': '10px'}),
+ html.H5("bedrooms", style={'display': 'inline-block', 'margin-right': '10px'}),
create_toggle_button(index='bedrooms', initial_label="Hide", page_type='buy')
]),
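The rental-terms checklist now derives its options from the categorical `terms` column. A small sketch of that derivation, with invented values:

```python
import pandas as pd

terms = pd.Series(["12M,MO", "NEG", None], dtype="category")
terms = terms.cat.add_categories("Unknown").fillna("Unknown")
unique_terms = sorted({t.strip() for sub in terms.str.split(",") for t in sub})
# -> ['12M', 'MO', 'NEG', 'Unknown']
```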
diff --git a/pages/filters.py b/pages/filters.py
index 52e23620..2fe1a0cf 100644
--- a/pages/filters.py
+++ b/pages/filters.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, List
import pandas as pd
import re
@@ -7,24 +7,24 @@ class LeaseFilters:
def __init__(self, df):
self.df = df
- def sqft_radio_button(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series:
+ def sqft_radio_button(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
"""
Filter the dataframe based on whether properties with missing square footage should be included.
Args:
- include_missing (bool): Whether properties with missing square footage should be included.
- - slider_begin (float): Start value of the square footage slider.
- - slider_end (float): End value of the square footage slider.
+ - slider_begin (int): Start value of the square footage slider.
+ - slider_end (int): End value of the square footage slider.
Returns:
- pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions.
"""
if include_missing:
# Include properties with missing square footage
- sqft_choice = self.df['Sqft'].isnull() | self.df['Sqft'].between(slider_begin, slider_end)
+ sqft_choice = self.df['sqft'].isnull() | self.df['sqft'].between(slider_begin, slider_end)
else:
# Exclude properties with missing square footage
- sqft_choice = self.df['Sqft'].between(slider_begin, slider_end)
+ sqft_choice = self.df['sqft'].between(slider_begin, slider_end)
return sqft_choice
def yrbuilt_radio_button(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
@@ -41,10 +41,10 @@ def yrbuilt_radio_button(self, include_missing: bool, slider_begin: int, slider_
"""
if include_missing:
# Include properties with missing year built
- yrbuilt_choice = self.df['YrBuilt'].isnull() | self.df['YrBuilt'].between(slider_begin, slider_end)
+ yrbuilt_choice = self.df['year_built'].isnull() | self.df['year_built'].between(slider_begin, slider_end)
else:
# Exclude properties with missing year built
- yrbuilt_choice = self.df['YrBuilt'].between(slider_begin, slider_end)
+ yrbuilt_choice = self.df['year_built'].between(slider_begin, slider_end)
return yrbuilt_choice
def garage_radio_button(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
@@ -61,20 +61,20 @@ def garage_radio_button(self, include_missing: bool, slider_begin: int, slider_e
"""
if include_missing:
# Include properties with missing garage spaces
- garage_choice = self.df['garage_spaces'].isnull() | self.df['garage_spaces'].between(slider_begin, slider_end)
+ garage_choice = self.df['parking_spaces'].isnull() | self.df['parking_spaces'].between(slider_begin, slider_end)
else:
# Exclude properties with missing garage spaces
- garage_choice = self.df['garage_spaces'].between(slider_begin, slider_end)
+ garage_choice = self.df['parking_spaces'].between(slider_begin, slider_end)
return garage_choice
- def ppsqft_radio_button(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series:
+ def ppsqft_radio_button(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
"""
Filter the dataframe based on whether properties with missing price per square foot should be included.
Args:
- include_missing (bool): Whether properties with missing price per square foot should be included.
- - slider_begin (float): Start value of the price per square foot slider.
- - slider_end (float): End value of the price per square foot slider.
+ - slider_begin (int): Start value of the price per square foot slider.
+ - slider_end (int): End value of the price per square foot slider.
Returns:
- pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions.
@@ -100,18 +100,18 @@ def pets_radio_button(self, choice: str) -> pd.Series:
Returns:
- pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions.
"""
- if choice == True:
+ if choice == 'Yes':
# Filter for rows where the pet policy allows pets (not 'No' or 'No, Size Limit')
- pets_radio_choice = ~self.df['PetsAllowed'].isin(['No', 'No, Size Limit'])
- elif choice == False:
+ pets_radio_choice = ~self.df['pet_policy'].isin(['No', 'No, Size Limit'])
+ elif choice == 'No':
# Filter for rows where the pet policy does not allow pets
- pets_radio_choice = self.df['PetsAllowed'].isin(['No', 'No, Size Limit'])
- else: # Assuming 'Both' includes all rows
- # Create a boolean Series of True for all rows to include everything
+ pets_radio_choice = self.df['pet_policy'].isin(['No', 'No, Size Limit'])
+ else: # 'Both'
+ # Include all properties regardless of pet policy
pets_radio_choice = pd.Series([True] * len(self.df), index=self.df.index)
return pets_radio_choice
- def furnished_checklist_function(self, choice: list[str]) -> pd.Series:
+ def furnished_checklist_function(self, choice: List[str]) -> pd.Series:
"""
Filters the DataFrame for furnished dwellings based on the user's choice.
@@ -120,244 +120,249 @@ def furnished_checklist_function(self, choice: list[str]) -> pd.Series:
might not specify their furnished state.
Args:
- - choice (list[str]): A list of user-selected options regarding the furnished status.
- Options include 'Furnished', 'Unfurnished', and 'Unknown'.
+ - choice (List[str]): A list of user-selected options regarding the furnished status.
+ Options include 'Furnished', 'Unfurnished', and 'Unknown'.
Returns:
- pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions.
"""
- # Presort the list first for potentially faster performance
- choice.sort()
+ if not choice:
+ # If no choices are selected, return False for all entries
+ return pd.Series([False] * len(self.df), index=self.df.index)
+
+ filters = []
if 'Unknown' in choice:
- # Include rows where Furnished status is NaN OR matches one of the selected choices
- furnished_checklist_filter = self.df['Furnished'].isnull() | self.df['Furnished'].isin(choice)
- else:
- # If Unknown is NOT selected, return rows that match the selected choices (implies .notnull() by default)
- furnished_checklist_filter = self.df['Furnished'].isin(choice)
+ # Include entries where 'furnished' is NaN
+ filters.append(self.df['furnished'].isna())
+ # Remove 'Unknown' from choices to avoid filtering by it in 'isin'
+ choice = [c for c in choice if c != 'Unknown']
+
+ if choice:
+ # For remaining choices, filter where 'furnished' matches the choices
+ filters.append(self.df['furnished'].isin(choice))
+
+ # Combine filters using logical OR
+ furnished_checklist_filter = pd.Series(False, index=self.df.index)
+ for f in filters:
+ furnished_checklist_filter |= f
+
return furnished_checklist_filter
- def security_deposit_function(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series:
+ def security_deposit_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
"""
- Filters the DataFrame for properties based on security deposit criteria, allowing
- for the inclusion of properties without a security deposit listed.
+ Filter the dataframe based on whether properties with missing security deposit should be included.
Args:
- - include_missing (bool): Whether to include properties with no security deposit listed.
- - slider_begin (float): The starting value of the range for the security deposit.
- - slider_end (float): The ending value of the range for the security deposit.
+ - include_missing (bool): Whether properties with missing security deposit should be included.
+ - slider_begin (int): Start value of the security deposit slider.
+ - slider_end (int): End value of the security deposit slider.
Returns:
- - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the
- filter conditions based on the security deposit.
+ - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions.
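+ Example (illustrative sketch; the pet, key, and other deposit filters below
+ follow the same pattern):
+ mask = lease_filters.security_deposit_function(True, 0, 2000)
+ affordable_deposit = lease_filters.df[mask]  # deposit in [0, 2000] or unlisted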
"""
if include_missing:
- # Include properties with no security deposit listed or within the specified range
- security_deposit_filter = self.df['DepositSecurity'].isnull() | self.df['DepositSecurity'].between(slider_begin, slider_end)
+ # Include properties with missing security deposit
+ security_deposit_filter = self.df['security_deposit'].isnull() | self.df['security_deposit'].between(slider_begin, slider_end)
else:
- # Include properties within the specified range, implicitly excludes nulls
- security_deposit_filter = self.df['DepositSecurity'].between(slider_begin, slider_end)
+ # Exclude properties with missing security deposit
+ security_deposit_filter = self.df['security_deposit'].between(slider_begin, slider_end)
return security_deposit_filter
- def pet_deposit_function(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series:
+ def pet_deposit_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
"""
- Filters the DataFrame for properties based on pet deposit criteria, allowing
- for the inclusion of properties without a pet deposit listed.
+ Filter the dataframe based on whether properties with missing pet deposit should be included.
Args:
- - include_missing (bool): Whether to include properties with no pet deposit listed.
- - slider_begin (float): The starting value of the range for the pet deposit.
- - slider_end (float): The ending value of the range for the pet deposit.
+ - include_missing (bool): Whether properties with missing pet deposit should be included.
+ - slider_begin (int): Start value of the pet deposit slider.
+ - slider_end (int): End value of the pet deposit slider.
Returns:
- - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the
- filter conditions based on the pet deposit.
+ - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions.
"""
if include_missing:
- # Include properties with no pet deposit listed or within the specified range
- pet_deposit_filter = self.df['DepositPets'].isnull() | self.df['DepositPets'].between(slider_begin, slider_end)
+ # Include properties with missing pet deposit
+ pet_deposit_filter = self.df['pet_deposit'].isnull() | self.df['pet_deposit'].between(slider_begin, slider_end)
else:
- # Include properties within the specified range, implicitly excludes nulls
- pet_deposit_filter = self.df['DepositPets'].between(slider_begin, slider_end)
+ # Exclude properties with missing pet deposit
+ pet_deposit_filter = self.df['pet_deposit'].between(slider_begin, slider_end)
return pet_deposit_filter
- def key_deposit_function(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series:
+ def key_deposit_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
"""
- Filters the DataFrame for properties based on key deposit criteria, allowing
- for the inclusion of properties without a key deposit listed.
-
- This function is designed to filter properties based on the presence or absence
- of a key deposit and whether the key deposit amount falls within a specified range.
+ Filter the dataframe based on whether properties with missing key deposit should be included.
Args:
- - include_missing (bool): Whether to include properties with no key deposit listed.
- - slider_begin (float): The starting value of the range for the key deposit.
- - slider_end (float): The ending value of the range for the key deposit.
+ - include_missing (bool): Whether properties with missing key deposit should be included.
+ - slider_begin (int): Start value of the key deposit slider.
+ - slider_end (int): End value of the key deposit slider.
Returns:
- - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the
- filter conditions based on the key deposit.
+ - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions.
"""
if include_missing:
- # Include properties with no key deposit listed or within the specified range
- key_deposit_filter = self.df['DepositKey'].isnull() | self.df['DepositKey'].between(slider_begin, slider_end)
+ # Include properties with missing key deposit
+ key_deposit_filter = self.df['key_deposit'].isnull() | self.df['key_deposit'].between(slider_begin, slider_end)
else:
- # Include properties within the specified range, implicitly excludes nulls
- key_deposit_filter = self.df['DepositKey'].between(slider_begin, slider_end)
+ # Exclude properties with missing key deposit
+ key_deposit_filter = self.df['key_deposit'].between(slider_begin, slider_end)
return key_deposit_filter
- def other_deposit_function(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series:
+ def other_deposit_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
"""
- Filters the DataFrame for properties based on 'other' deposit criteria, allowing
- for the inclusion of properties without an 'other' deposit listed.
+ Filter the dataframe based on whether properties with missing other deposit should be included.
Args:
- - include_missing (bool): Whether to include properties with no 'other' deposit listed.
- - slider_begin (float): The starting value of the range for the 'other' deposit.
- - slider_end (float): The ending value of the range for the 'other' deposit.
+ - include_missing (bool): Whether properties with missing other deposit should be included.
+ - slider_begin (int): Start value of the other deposit slider.
+ - slider_end (int): End value of the other deposit slider.
Returns:
- - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the
- filter conditions based on the 'other' deposit.
+ - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions.
"""
if include_missing:
- # Include properties with no 'other' deposit listed or within the specified range
- other_deposit_filter = self.df['DepositOther'].isnull() | self.df['DepositOther'].between(slider_begin, slider_end)
+ # Include properties with missing other deposit
+ other_deposit_filter = self.df['other_deposit'].isnull() | self.df['other_deposit'].between(slider_begin, slider_end)
else:
- # Include properties within the specified range, implicitly excludes nulls
- other_deposit_filter = self.df['DepositOther'].between(slider_begin, slider_end)
+ # Exclude properties with missing other deposit
+ other_deposit_filter = self.df['other_deposit'].between(slider_begin, slider_end)
return other_deposit_filter
- def listed_date_function(self, include_missing: bool, start_date: str, end_date: str) -> pd.Series:
+ def listed_date_function(self, include_missing: bool, start_date: Union[str, pd.Timestamp], end_date: Union[str, pd.Timestamp]) -> pd.Series:
"""
- Filters the DataFrame for properties based on the listing date criteria, allowing
- for the inclusion of properties without a listed date.
-
- This function allows filtering properties based on whether there is a listing date
- specified and whether this date falls within a given range.
+ Filter the dataframe based on whether properties with missing listed date should be included.
Args:
- - include_missing (bool): Whether to include properties with no listed date.
- - start_date (str): The starting date of the range for the listing date, formatted as 'YYYY-MM-DD'.
- - end_date (str): The ending date of the range for the listing date, formatted as 'YYYY-MM-DD'.
+ - include_missing (bool): Whether properties with missing listed date should be included.
+ - start_date (Union[str, pd.Timestamp]): Start date of the listed date range.
+ - end_date (Union[str, pd.Timestamp]): End date of the listed date range.
Returns:
- - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the
- filter conditions based on the listing date.
+ - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions.
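+ Example (illustrative sketch; strings and Timestamps are both accepted and
+ normalized with pd.to_datetime):
+ mask = lease_filters.listed_date_function(False, '2024-01-01', pd.Timestamp('2024-06-30'))
+ recent = lease_filters.df[mask]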
"""
+ # Convert start_date and end_date to datetime if they are strings
+ start_date = pd.to_datetime(start_date)
+ end_date = pd.to_datetime(end_date)
+
if include_missing:
- # Include properties with no listed date or within the specified date range
+ # Include properties with missing listed date
listed_date_filter = self.df['listed_date'].isnull() | self.df['listed_date'].between(start_date, end_date)
else:
- # Include properties within the specified date range, implicitly excludes nulls
+ # Exclude properties with missing listed date
listed_date_filter = self.df['listed_date'].between(start_date, end_date)
return listed_date_filter
- def terms_function(self, choice: list[str]) -> pd.Series:
+ def terms_function(self, choice: List[str]) -> pd.Series:
"""
- Filters the DataFrame based on specified terms in the 'Terms' column. Supports
- inclusion of rows with missing values ('NaN') if 'Unknown' is part of the choices.
-
+ Filters the DataFrame for rental lease terms based on the user's choice.
+
Args:
- - choice (list[str]): A list of terms to filter the 'Terms' column by. Includes
- special handling for 'Unknown' to include or exclude NaN values.
-
+ - choice (List[str]): A list of user-selected terms. Options could include various terms like 'Lease', 'Month-to-Month', etc.
+
Returns:
- - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the
- filter conditions. If no choices are made, it defaults to False for all rows.
+ - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions.
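+ Example (illustrative sketch; matching is a case-insensitive substring
+ search, so 'Month-to-Month' also hits combined values such as
+ 'Lease, Month-to-Month'):
+ mask = lease_filters.terms_function(['Month-to-Month', 'Unknown'])
+ flexible_terms = lease_filters.df[mask]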
"""
- # Ensure choice list is not empty
if not choice:
+ # If no choices are selected, return False for all entries
return pd.Series([False] * len(self.df), index=self.df.index)
-
- # Presort the list for potentially faster performance
- choice.sort()
- # Corrected: Use re.escape for escaping regex special characters
- choice_regex = '|'.join([re.escape(term) for term in choice if term != 'Unknown'])
- # Handle 'Unknown' choice
- if 'Unknown' in choice:
- terms_filter = self.df['Terms'].isnull() | self.df['Terms'].str.contains(choice_regex, na=False)
- else:
- terms_filter = self.df['Terms'].str.contains(choice_regex, na=False)
-
- return terms_filter
+ # Handle 'Unknown' option
+ if 'Unknown' in choice:
+ unknown_filter = self.df['terms'].isnull()
+ # Remove 'Unknown' from choices to avoid filtering by it in 'str.contains'
+ choice = [c for c in choice if c != 'Unknown']
+ else:
+ unknown_filter = pd.Series([False] * len(self.df), index=self.df.index)
+
+ if choice:
+ # Create a regex pattern from the choice list, escaping any special characters
+ pattern = '|'.join([re.escape(term) for term in choice])
+ # Use vectorized string matching for efficient filtering
+ terms_filter = self.df['terms'].str.contains(pattern, na=False, case=False)
+ else:
+ terms_filter = pd.Series([False] * len(self.df), index=self.df.index)
+
+ # Combine filters
+ combined_filter = terms_filter | unknown_filter
+ return combined_filter
- def laundry_checklist_function(self, choice: list[str]) -> pd.Series:
+ def laundry_checklist_function(self, choice: List[str]) -> pd.Series:
"""
- Filters the DataFrame for properties based on selected laundry features.
-
- Special handling for 'Other' to include properties that do not match any of the
- predefined categories. 'Unknown' and 'None' are treated according to their selection.
+ Filters the DataFrame for laundry features based on the user's choice.
Args:
- - choice (list[str]): A list of user-selected laundry features.
-
+ - choice (List[str]): A list of user-selected options regarding laundry features.
+ Options include types like 'In Unit', 'Shared', 'Hookups',
+ 'Included Appliances', 'Location Specific', 'Unknown', and 'Other'.
+
Returns:
- - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy
- the filter conditions based on laundry features.
+ - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions.
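+ Example (illustrative sketch; 'Other' captures values outside the known
+ categories, while 'Unknown' captures NaN):
+ mask = lease_filters.laundry_checklist_function(['In Unit', 'Other'])
+ has_laundry = lease_filters.df[mask]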
"""
- # Return False for all rows if the choice list is empty
if not choice:
+ # If no choices are selected, return False for all entries
return pd.Series([False] * len(self.df), index=self.df.index)
- # Special case for 'Other'
- if 'Other' in choice:
- other_filter = ~self.df['LaundryCategory'].isin([
- 'In Unit', 'Shared', 'Hookups', 'Included Appliances', 'Location Specific', 'Unknown'
- ])
- choice.remove('Other')
- else:
- other_filter = pd.Series([False] * len(self.df), index=self.df.index)
-
- # Handle 'Unknown' choice
+ filters = []
if 'Unknown' in choice:
- unknown_filter = self.df['LaundryCategory'] == 'Unknown'
- choice.remove('Unknown')
- else:
- unknown_filter = pd.Series([False] * len(self.df), index=self.df.index)
+ # Include entries where 'laundry' is NaN
+ filters.append(self.df['laundry'].isna())
+ # Remove 'Unknown' from choices to avoid filtering by it in 'isin'
+ choice = [c for c in choice if c != 'Unknown']
+
+ if 'Other' in choice:
+ # Include entries where 'laundry' is not in known categories
+ known_categories = ['In Unit', 'Shared', 'Hookups', 'Included Appliances', 'Location Specific']
+ other_filter = ~self.df['laundry'].isin(known_categories)
+ filters.append(other_filter)
+ # Remove 'Other' from choices
+ choice = [c for c in choice if c != 'Other']
- # Filter based on the remaining choices
if choice:
- choice_filter = self.df['LaundryCategory'].isin(choice)
+ # Filter where 'laundry' matches the choices
+ filters.append(self.df['laundry'].isin(choice))
+
+ # Combine filters using logical OR
+ if filters:
+ laundry_checklist_filter = pd.Series([False] * len(self.df), index=self.df.index)
+ for f in filters:
+ laundry_checklist_filter |= f
else:
- choice_filter = pd.Series([False] * len(self.df), index=self.df.index)
-
- # Combine all filters
- combined_filter = choice_filter | other_filter | unknown_filter
+ # If no valid choices left, return False for all entries
+ laundry_checklist_filter = pd.Series([False] * len(self.df), index=self.df.index)
- return combined_filter
+ return laundry_checklist_filter
- def subtype_checklist_function(self, choice: list[str]) -> pd.Series:
+ def subtype_checklist_function(self, choice: List[str]) -> pd.Series:
"""
- Filters the DataFrame for properties based on selected property subtypes.
-
- Special handling is provided for 'Unknown' to include properties without a specified subtype,
- as well as subtypes '/A' and '/D'.
-
+ Filters the DataFrame for property subtypes based on the user's choice.
+
Args:
- - choice (list[str]): A list of user-selected property subtypes, including a special 'Unknown'
- option to include properties without a specified subtype.
-
+ - choice (List[str]): A list of user-selected subtypes. Options include various property types.
+
Returns:
- - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy
- the filter conditions based on property subtypes.
+ - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions.
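+ Example (illustrative sketch; the available subtype codes are
+ data-dependent):
+ mask = lease_filters.subtype_checklist_function(['Condominium', 'Unknown'])
+ condos_or_unknown = lease_filters.df[mask]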
"""
- # Ensure the choice list is not empty
if not choice:
+ # If no choices are selected, return False for all entries
return pd.Series([False] * len(self.df), index=self.df.index)
-
- # Map '/A' and '/D' subtypes to 'Unknown'
- self.df['subtype'] = self.df['subtype'].replace({'/A': None, '/D': None})
-
- # Handle 'Unknown' selection
+
+ # Handle 'Unknown' option
if 'Unknown' in choice:
- # Include rows where subtype is NaN OR matches one of the selected choices
- subtype_filter = self.df['subtype'].isnull() | self.df['subtype'].isin(choice)
+ unknown_filter = self.df['subtype'].isnull()
+ # Remove 'Unknown' from choices to avoid filtering by it in 'isin'
+ choice = [c for c in choice if c != 'Unknown']
else:
- # If 'Unknown' is NOT selected, filter by the selected choices
+ unknown_filter = pd.Series([False] * len(self.df), index=self.df.index)
+
+ if choice:
+ # Filter where 'subtype' matches the choices
subtype_filter = self.df['subtype'].isin(choice)
-
- return subtype_filter
+ else:
+ subtype_filter = pd.Series([False] * len(self.df), index=self.df.index)
+
+ # Combine filters
+ combined_filter = subtype_filter | unknown_filter
+ return combined_filter
# Create a class to hold all of the filters for the sale page
class BuyFilters:
@@ -406,10 +411,10 @@ def sqft_function(self, include_missing: bool, slider_begin: float, slider_end:
"""
if include_missing:
# Include properties with missing square footage
- sqft_choice = self.df['Sqft'].isnull() | self.df['Sqft'].between(slider_begin, slider_end)
+ sqft_choice = self.df['sqft'].isnull() | self.df['sqft'].between(slider_begin, slider_end)
else:
# Exclude properties with missing square footage
- sqft_choice = self.df['Sqft'].between(slider_begin, slider_end)
+ sqft_choice = self.df['sqft'].between(slider_begin, slider_end)
return sqft_choice
def year_built_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
diff --git a/pages/lease_page.py b/pages/lease_page.py
index c46649c8..bcab8918 100644
--- a/pages/lease_page.py
+++ b/pages/lease_page.py
@@ -101,18 +101,18 @@ def update_map(subtypes_chosen, pets_chosen, terms_chosen, garage_spaces, rental
subtypes_chosen.sort()
# Sort the DataFrame once at the beginning
- df_sorted = df.sort_values(by=['garage_spaces', 'list_price', 'Bedrooms', 'Total Bathrooms', 'Sqft', 'YrBuilt', 'ppsqft'])
+ df_sorted = df.sort_values(by=['parking_spaces', 'list_price', 'bedrooms', 'total_bathrooms', 'sqft', 'year_built', 'ppsqft'])
filters = [
lease_filters.subtype_checklist_function(subtypes_chosen),
lease_filters.pets_radio_button(pets_chosen),
lease_filters.terms_function(terms_chosen),
- ((df_sorted['garage_spaces'].between(garage_spaces[0], garage_spaces[1])) | lease_filters.garage_radio_button(garage_missing_radio_choice, garage_spaces[0], garage_spaces[1])),
+ ((df_sorted['parking_spaces'].between(garage_spaces[0], garage_spaces[1])) | lease_filters.garage_radio_button(garage_missing_radio_choice, garage_spaces[0], garage_spaces[1])),
(df_sorted['list_price'].between(rental_price[0], rental_price[1])),
- (df_sorted['Bedrooms'].between(bedrooms_chosen[0], bedrooms_chosen[1])),
- (df_sorted['Total Bathrooms'].between(bathrooms_chosen[0], bathrooms_chosen[1])),
- ((df_sorted['Sqft'].between(sqft_chosen[0], sqft_chosen[1])) | lease_filters.sqft_radio_button(sqft_missing_radio_choice, sqft_chosen[0], sqft_chosen[1])),
- ((df_sorted['YrBuilt'].between(years_chosen[0], years_chosen[1])) | lease_filters.yrbuilt_radio_button(yrbuilt_missing_radio_choice, years_chosen[0], years_chosen[1])),
+ (df_sorted['bedrooms'].between(bedrooms_chosen[0], bedrooms_chosen[1])),
+ (df_sorted['total_bathrooms'].between(bathrooms_chosen[0], bathrooms_chosen[1])),
+ ((df_sorted['sqft'].between(sqft_chosen[0], sqft_chosen[1])) | lease_filters.sqft_radio_button(sqft_missing_radio_choice, sqft_chosen[0], sqft_chosen[1])),
+ ((df_sorted['year_built'].between(years_chosen[0], years_chosen[1])) | lease_filters.yrbuilt_radio_button(yrbuilt_missing_radio_choice, years_chosen[0], years_chosen[1])),
((df_sorted['ppsqft'].between(ppsqft_chosen[0], ppsqft_chosen[1])) | lease_filters.ppsqft_radio_button(ppsqft_missing_radio_choice, ppsqft_chosen[0], ppsqft_chosen[1])),
lease_filters.furnished_checklist_function(furnished_choice),
lease_filters.security_deposit_function(security_deposit_radio_choice, security_deposit_chosen[0], security_deposit_chosen[1]),
@@ -152,33 +152,42 @@ def update_map(subtypes_chosen, pets_chosen, terms_chosen, garage_spaces, rental
for row in df_filtered.itertuples():
markers.append(
dict(
- lat=row.Latitude,
- lon=row.Longitude,
+ lat=row.latitude,
+ lon=row.longitude,
data=dict(
- address=row.full_street_address,
- bathrooms=row.Bedrooms,
- bedrooms=row.Bedrooms,
- furnished=row.Furnished,
- garage_spaces=row.garage_spaces,
- image_url=row.mls_photo,
- key_deposit=row.DepositKey,
- laundry=row.LaundryFeatures,
+ bedrooms=row.bedrooms,
+ city=row.city,
+ date_processed=row.date_processed,
+ full_bathrooms=row.full_bathrooms,
+ full_street_address=row.full_street_address,
+ furnished=row.furnished,
+ half_bathrooms=row.half_bathrooms,
+ key_deposit=row.key_deposit,
+ laundry=row.laundry,
list_price=row.list_price,
listed_date=row.listed_date,
listing_url=row.listing_url,
mls_number=row.mls_number,
mls_photo=row.mls_photo,
- other_deposit=row.DepositOther,
- pet_deposit=row.DepositPets,
- pet_policy=row.PetsAllowed,
+ other_deposit=row.other_deposit,
+ parking_spaces=row.parking_spaces,
+ pet_deposit=row.pet_deposit,
+ pet_policy=row.pet_policy,
phone_number=row.phone_number,
ppsqft=row.ppsqft,
- security_deposit=row.DepositSecurity,
- senior_community=row.SeniorCommunityYN,
- sqft=row.Sqft,
+ security_deposit=row.security_deposit,
+ senior_community=row.senior_community,
+ short_address=row.short_address,
+ sqft=row.sqft,
+ street_name=row.street_name,
+ street_number=row.street_number,
subtype=row.subtype,
- terms=row.Terms,
- year_built=row.YrBuilt,
+ terms=row.terms,
+ three_quarter_bathrooms=row.three_quarter_bathrooms,
+ total_bathrooms=row.total_bathrooms,
+ year_built=row.year_built,
+ zip_code=row.zip_code,
),
)
)
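+ # Note: each key in `data` must match a field read by the popup template in
+ # assets/javascript/popup.js (e.g. data.mls_photo, data.full_street_address);
+ # any key the template references but this dict omits renders as "undefined".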
diff --git a/requirements.txt b/requirements.txt
index 4ea6802b..697e63b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,22 +1,21 @@
aiolimiter==1.1.0
beautifulsoup4==4.12.3
dash_bootstrap_components==1.6.0
-dash_extensions==1.0.16
+dash_extensions==1.0.18
dash-leaflet==1.0.15
-dash==2.17.1
+dash==2.18.1
geopy==2.4.1
-gevent==24.2.1
-gunicorn==22.0.0
-imagekitio==4.0.1
+gevent==24.10.2
+gunicorn==23.0.0
+imagekitio==4.1.0
loguru==0.7.2
numpy==1.26.4
-orjson==3.10.5
-pandas==2.2.2
-protobuf==5.27.1
-pyarrow==16.1.0
+orjson==3.10.7
+pandas==2.2.3
+protobuf==5.28.2
+pyarrow==17.0.0
python-dotenv==1.0.1
pyyaml
requests==2.32.3
-sodapy==2.2.0
tables
user_agents==2.2.0
\ No newline at end of file