diff --git a/.gitignore b/.gitignore index 7b3d0e8c..40f52974 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/larentals.cpython-310.pyc *.csv *.pyc *.xlsx +.venv/ env hdf larentals-checkpoint.py diff --git a/assets/datasets/lease.parquet b/assets/datasets/lease.parquet index 662aecea..0ff9ecbc 100644 Binary files a/assets/datasets/lease.parquet and b/assets/datasets/lease.parquet differ diff --git a/assets/datasets/lease.parquet.bak.newest.kindafuckedup b/assets/datasets/lease.parquet.bak.newest.kindafuckedup new file mode 100644 index 00000000..4a3925a2 Binary files /dev/null and b/assets/datasets/lease.parquet.bak.newest.kindafuckedup differ diff --git a/assets/javascript/popup.js b/assets/javascript/popup.js index 4a132307..a44b2bd1 100644 --- a/assets/javascript/popup.js +++ b/assets/javascript/popup.js @@ -22,7 +22,7 @@ window.dash_props = Object.assign({}, window.dash_props, { return ` Listing ID (MLS#) - Not Available + ${data.mls_number} `; } @@ -47,9 +47,9 @@ window.dash_props = Object.assign({}, window.dash_props, { const listingUrlBlock = getListingUrlBlock(data); // Conditionally include the property image row if the image URL is available - const imageRow = data.image_url ? ` + const imageRow = data.mls_photo ? ` - Property Image + Property Image ` : ''; @@ -64,7 +64,7 @@ window.dash_props = Object.assign({}, window.dash_props, {
${imageRow}
-                ${data.address}
+                ${data.full_street_address}
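Note on the template's data contract: after these hunks, popup.js dereferences data.mls_number, data.mls_photo, and data.full_street_address instead of the old image_url/address fields. A minimal sketch of the Python side that would satisfy that contract — the build_popup_properties helper below is hypothetical, not part of this PR:

import pandas as pd

def build_popup_properties(row: pd.Series) -> dict:
    # Hypothetical helper: keys must match what popup.js reads off `data`.
    return {
        "mls_number": row["mls_number"],                    # shown in the Listing ID (MLS#) row
        "mls_photo": row.get("mls_photo"),                  # popup omits the image row when this is missing
        "full_street_address": row["full_street_address"],  # replaces the old `address` field
        "listing_url": row.get("listing_url"),              # assumed to feed getListingUrlBlock()
    }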
@@ -106,11 +106,11 @@ window.dash_props = Object.assign({}, window.dash_props, {
-
+
-
-
+
+
diff --git a/functions/dataframe_utils.py b/functions/dataframe_utils.py
index 3cbd8010..beb8f6dc 100644
--- a/functions/dataframe_utils.py
+++ b/functions/dataframe_utils.py
@@ -1,5 +1,5 @@
-from aiolimiter import AsyncLimiter
-from functions.webscraping_utils import check_expired_listing
+from functions.mls_image_processing_utils import imagekit_transform
+from functions.webscraping_utils import check_expired_listing_bhhs, check_expired_listing_theagency, webscrape_bhhs, fetch_the_agency_data
 from loguru import logger
 import asyncio
 import pandas as pd
@@ -8,40 +8,99 @@
 # Initialize logging
 logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="INFO")
 
-async def remove_expired_listings(df: pd.DataFrame, limiter: AsyncLimiter) -> pd.DataFrame:
+def remove_inactive_listings(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Asynchronously checks each listing URL in the DataFrame to determine if it has expired,
-    and removes rows with expired listings, applying rate limiting. Also counts the number of expired listings removed.
+    Checks each listing to determine if it has expired or been sold, and removes inactive listings.
+    If 'bhhscalifornia.com' is in the 'listing_url', it checks whether the listing has expired on BHHS.
+    If 'theagencyre.com' is in the 'listing_url', it checks whether the listing has been sold on The Agency.
 
     Parameters:
     df (pd.DataFrame): The DataFrame containing listing URLs and MLS numbers.
-    limiter (AsyncLimiter): The rate limiter to control request frequency.
 
     Returns:
-    pd.DataFrame: The DataFrame with expired listings removed.
+    pd.DataFrame: The DataFrame with inactive listings removed.
     """
-    async def check_and_mark_expired(row):
-        async with limiter:
-            expired = await check_expired_listing(row.listing_url, row.mls_number)
-            return (row.Index, expired)
-
-    # Gather tasks for all rows that need to be checked
-    tasks = [check_and_mark_expired(row) for row in df[df.listing_url.notnull()].itertuples()]
-    results = await asyncio.gather(*tasks)
-
-    # Determine indexes of rows to drop (where listing has expired)
-    indexes_to_drop = [index for index, expired in results if expired]
-
-    # Counter for expired listings
-    expired_count = len(indexes_to_drop)
-
-    # Log success messages for dropped listings and the count of expired listings
-    for index in indexes_to_drop:
-        mls_number = df.loc[index, 'mls_number']
-        logger.success(f"Removed {mls_number} (Index: {index}) from the dataframe because the listing has expired.")
-
-    logger.info(f"Total expired listings removed: {expired_count}")
-
-    # Drop the rows from the DataFrame and return the modified DataFrame
-    df_dropped_expired = df.drop(indexes_to_drop)
-    return df_dropped_expired
\ No newline at end of file
+    indexes_to_drop = []
+
+    for row in df.itertuples():
+        listing_url = str(getattr(row, 'listing_url', ''))
+        mls_number = str(getattr(row, 'mls_number', ''))
+
+        # Check if the listing is expired on BHHS
+        if 'bhhscalifornia.com' in listing_url:
+            is_expired = check_expired_listing_bhhs(listing_url, mls_number)
+            if is_expired:
+                indexes_to_drop.append(row.Index)
+                logger.success(f"Removed MLS {mls_number} (Index: {row.Index}) from the DataFrame because the listing has expired on BHHS.")
+        # Check if the listing has been sold on The Agency
+        elif 'theagencyre.com' in listing_url:
+            is_sold = check_expired_listing_theagency(listing_url, mls_number)
+            if is_sold:
+                indexes_to_drop.append(row.Index)
+                logger.success(f"Removed MLS {mls_number} (Index: {row.Index}) from the DataFrame because the listing has been sold on The Agency.")
+
+    inactive_count = len(indexes_to_drop)
+    logger.info(f"Total inactive listings removed: {inactive_count}")
+
+    df_active = df.drop(indexes_to_drop)
+    return df_active.reset_index(drop=True)
+
+def update_dataframe_with_listing_data(
+    df: pd.DataFrame, imagekit_instance
+) -> pd.DataFrame:
+    """
+    Updates the DataFrame with listing date, MLS photo, and listing URL by scraping BHHS and using The Agency's API.
+
+    Parameters:
+    df (pd.DataFrame): The DataFrame to update.
+    imagekit_instance: The ImageKit instance for image transformations.
+
+    Returns:
+    pd.DataFrame: The updated DataFrame.
+    """
+    for row in df.itertuples():
+        mls_number = row.mls_number
+        try:
+            webscrape = webscrape_bhhs(
+                url=f"https://www.bhhscalifornia.com/for-lease/{mls_number}-t_q;/",
+                row_index=row.Index,
+                mls_number=mls_number,
+                total_rows=len(df)
+            )
+
+            if not all(webscrape):
+                logger.warning(f"BHHS did not return complete data for MLS {mls_number}. Trying The Agency.")
+                agency_data = fetch_the_agency_data(
+                    mls_number,
+                    row_index=row.Index,
+                    total_rows=len(df),
+                    full_street_address=row.full_street_address
+                )
+
+                if agency_data and any(agency_data):
+                    listed_date, listing_url, mls_photo = agency_data
+                    if listed_date:
+                        df.at[row.Index, 'listed_date'] = listed_date
+                    if listing_url:
+                        df.at[row.Index, 'listing_url'] = listing_url
+                    if mls_photo:
+                        df.at[row.Index, 'mls_photo'] = imagekit_transform(
+                            mls_photo,
+                            mls_number,
+                            imagekit_instance=imagekit_instance
+                        )
+                    else:
+                        logger.warning(f"No photo URL found for MLS {mls_number} from The Agency.")
+            else:
+                df.at[row.Index, 'listed_date'] = webscrape[0]
+                df.at[row.Index, 'mls_photo'] = imagekit_transform(
+                    webscrape[1],
+                    mls_number,
+                    imagekit_instance=imagekit_instance
+                )
+                df.at[row.Index, 'listing_url'] = webscrape[2]
+        except Exception as e:
+            logger.error(f"Error processing MLS {mls_number} at index {row.Index}: {e}")
+    return df
\ No newline at end of file
diff --git a/functions/geocoding_utils.py b/functions/geocoding_utils.py
index abd0b782..773d249c 100644
--- a/functions/geocoding_utils.py
+++ b/functions/geocoding_utils.py
@@ -66,39 +66,39 @@ def fetch_missing_city(address: str, geolocator: GoogleV3) -> Optional[str]:
     return city
 
-def return_postalcode(address: str, geolocator: GoogleV3) -> Optional[Union[int, type(pd.NA)]]:
+def return_zip_code(address: str, geolocator: GoogleV3) -> Optional[str]:
     """
-    Fetches the postal code for a given short address using forward and reverse geocoding.
-
+    Fetches the postal code for a given address using geocoding.
+
     Parameters:
-    address (str): The short address.
-    geolocator (GoogleV3): An instance of a GoogleV3 geocoding class.
-
+    address (str): The full street address.
+    geolocator (GoogleV3): An instance of the GoogleV3 geocoding class.
+
     Returns:
-    Optional[Union[int, type(pd.NA)]]: The postal code as an integer, or pd.NA if unsuccessful.
+    Optional[str]: The postal code as a string, or None if unsuccessful.
""" - # Initialize postalcode variable postalcode = None try: - geocode_info = geolocator.geocode(address, components={'administrative_area': 'CA', 'country': 'US'}) - components = geolocator.geocode(f"{geocode_info.latitude}, {geocode_info.longitude}").raw['address_components'] - - # Create a dataframe from the list of dictionaries - components_df = pd.DataFrame(components) - - # Iterate through rows to find the postal code - for row in components_df.itertuples(): - if row.types == ['postal_code']: - postalcode = int(row.long_name) - - logger.info(f"Fetched postal code {postalcode} for {address}.") - except AttributeError: - logger.warning(f"Geocoding returned no results for {address}.") - return pd.NA + geocode_info = geolocator.geocode( + address, components={'administrative_area': 'CA', 'country': 'US'} + ) + if geocode_info: + raw = geocode_info.raw['address_components'] + # Find the 'postal_code' + postalcode = next( + (addr['long_name'] for addr in raw if 'postal_code' in addr['types']), + None + ) + if postalcode: + logger.info(f"Fetched zip code ({postalcode}) for {address}.") + else: + logger.warning(f"No postal code found in geocoding results for {address}.") + else: + logger.warning(f"Geocoding returned no results for {address}.") except Exception as e: - logger.warning(f"Couldn't fetch postal code for {address} because {e}.") - return pd.NA + logger.warning(f"Couldn't fetch zip code for {address} because of {e}.") + postalcode = None return postalcode diff --git a/functions/webscraping_utils.py b/functions/webscraping_utils.py index 6fd167de..aa689eeb 100644 --- a/functions/webscraping_utils.py +++ b/functions/webscraping_utils.py @@ -1,115 +1,327 @@ -from aiolimiter import AsyncLimiter from bs4 import BeautifulSoup +from datetime import datetime, timezone from loguru import logger from typing import Tuple, Optional -import asyncio -import httpx import pandas as pd import re import requests import sys +import time # Initialize logging -logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="INFO") +logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="DEBUG") -# Limit to 1 request per second -limiter = AsyncLimiter(1, 1) +import requests +from bs4 import BeautifulSoup +from loguru import logger -async def check_expired_listing(url: str, mls_number: str) -> bool: +def check_expired_listing_bhhs(url: str, mls_number: str) -> bool: """ - Checks if a listing has expired based on the presence of a specific HTML element, asynchronously. - + Checks if a BHHS listing has expired by looking for a specific message on the page. + Parameters: url (str): The URL of the listing to check. - mls_number (str): The MLS number of the listing. - + mls_number: The MLS number of the listing. + Returns: bool: True if the listing has expired, False otherwise. 
""" headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35" + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate, br, zstd', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Pragma': 'no-cache', + 'Cache-Control': 'no-cache', } try: - async with limiter: - async with httpx.AsyncClient(timeout=10) as client: - response = await client.get(url, headers=headers) - response.raise_for_status() - + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() soup = BeautifulSoup(response.text, 'html.parser') - - description = soup.find('div', class_='page-description').text - cleaned_description = " ".join(description.split()) - - return bool(cleaned_description) - - except httpx.TimeoutException: - logger.warning(f"Timeout occurred while checking if the listing for {mls_number} has expired.") - except httpx.HTTPStatusError as h: - if h.response.status_code == 429: - retry_after = int(h.response.headers.get("Retry-After", 60)) # Use a default retry after 60 seconds if header is missing - logger.warning(f"Rate limit exceeded, retrying after {retry_after} seconds.") - await asyncio.sleep(retry_after) - return await check_expired_listing(url, mls_number) # Retry the request - else: - logger.warning(f"HTTP error {h.response.status_code} occurred while checking if the listing for {mls_number} has expired. {h.response.text}") - except AttributeError: - # This occurs if the 'page-description' div is not found, meaning the listing hasn't expired + + # Look for the message indicating the listing is no longer active + description_div = soup.find('div', class_='page-description') + if description_div: + description_text = " ".join(description_div.text.split()) + if "We're sorry, the listing you are looking for is no longer active." in description_text: + return True return False + + except requests.Timeout: + logger.warning(f"Timeout occurred while checking if the listing for {mls_number} has expired.") + except requests.HTTPError as e: + logger.error(f"HTTP error occurred for MLS {mls_number}: {e}") except Exception as e: - logger.warning(f"Couldn't detect if the listing for {mls_number} has expired because {e}.") + logger.error(f"An unexpected error occurred for MLS {mls_number}: {e}") return False -async def webscrape_bhhs(url: str, row_index: int, mls_number: str, total_rows: int) -> Tuple[Optional[pd.Timestamp], Optional[str], Optional[str]]: +def check_expired_listing_theagency(listing_url: str, mls_number: str, board_code: str = 'clr') -> bool: """ - Asynchronously scrapes a BHHS page to fetch the listing URL, photo, and listed date. + Checks if a listing has been sold based on the 'IsSold' key from The Agency API. + + Parameters: + listing_url (str): The URL of the listing to check. + mls_number (str): The MLS number of the listing. + board_code (str, optional): The board code extracted from the listing URL or a default value. + + Returns: + bool: True if the listing has been sold, False otherwise. 
""" + # Try to extract the board code from the listing_url if it varies + try: + pattern = r'https://.*?idcrealestate\.com/.*?/(?P\w+)/' + match = re.search(pattern, listing_url) + if match: + board_code = match.group('board_code') + else: + # Use the default board_code provided in the function parameter + pass # board_code remains as provided + except Exception as e: + logger.warning(f"Could not extract board code from listing URL: {listing_url}. Error: {e}") + + api_url = f'https://search-service.idcrealestate.com/api/property/en_US/d4/sold-detail/{board_code}/{mls_number}' headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35" + "User-Agent": "Mozilla/5.0", + "Accept": "*/*", + "Accept-Language": "en-US,en;q=0.5", + "Content-Type": "application/json", + "Referer": "https://www.theagencyre.com/", + "X-Tenant": "QUdZfFBST0R8Q09NUEFOWXwx", + "Origin": "https://www.theagencyre.com", + "Connection": "keep-alive", } try: - async with httpx.AsyncClient(timeout=5, follow_redirects=True) as client: - response = await client.get(url, headers=headers) - response.raise_for_status() - - # Check if a redirect has occurred - #if response.history: - # logger.info(f"Redirected from {url} to {response.url} for {mls_number}.") - - # Successful HTTP request - soup = BeautifulSoup(response.text, 'html.parser') - listed_date, photo, link = None, None, None - - # Example parsing (ensure to adjust based on actual HTML structure) - link_tag = soup.find('a', class_='btn cab waves-effect waves-light btn-details show-listing-details') - if link_tag and 'href' in link_tag.attrs: - link = f"https://www.bhhscalifornia.com{link_tag['href']}" - - photo_tag = soup.find('a', class_='show-listing-details') - if photo_tag and photo_tag.find('img'): - photo = photo_tag.find('img')['src'] - - date_tag = soup.find('p', class_='summary-mlsnumber') - if date_tag: - listed_date_text = date_tag.text.split()[-1] - listed_date = pd.Timestamp(listed_date_text) - - return listed_date, photo, link - - except httpx.TimeoutException: - logger.warning(f"Timeout occurred while scraping BHHS page for {mls_number}.") - except httpx.HTTPStatusError as h: - if h.response.status_code == 429: - retry_after = int(h.response.headers.get("Retry-After", 60)) # Default to 60 seconds - logger.warning(f"Rate limit exceeded for {mls_number}, retrying after {retry_after} seconds.") - await asyncio.sleep(retry_after) - return await webscrape_bhhs(url, row_index, mls_number, total_rows) # Retry the request - else: - logger.warning(f"HTTP error {h.response.status_code} occurred while scraping BHHS page for {mls_number}.") + response = requests.get(api_url, headers=headers) + response.raise_for_status() + data = response.json() + is_sold = data.get('IsSold', False) + if is_sold: + logger.debug(f"Listing {mls_number} has been sold.") + return is_sold + except requests.HTTPError as e: + logger.error(f"HTTP error occurred while checking if the listing for MLS {mls_number} has been sold: {e}") + except Exception as e: + logger.error(f"An error occurred while checking if the listing for MLS {mls_number} has been sold: {e}") + + return False + +def webscrape_bhhs(url: str, row_index: int, mls_number: str, total_rows: int) -> Tuple[Optional[pd.Timestamp], Optional[str], Optional[str]]: + """ + Scrapes the BHHS website for listing details. + + Parameters: + url (str): The URL of the listing to scrape. + row_index (int): The current row index being processed. 
+ mls_number (str): The MLS number of the listing. + total_rows (int): The total number of rows to process. + + Returns: + Tuple[Optional[pd.Timestamp], Optional[str], Optional[str]]: + - listed_date (pd.Timestamp): The listing date if found. + - photo (str): The URL of the listing photo if found. + - link (str): The detailed listing URL if found. + Returns (None, None, None) if data is not found or an error occurs. + """ + logger.info(f"Scraping BHHS page for {mls_number} (row {row_index + 1} of {total_rows}).") + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate, br, zstd', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Pragma': 'no-cache', + 'Cache-Control': 'no-cache', + } + try: + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + soup = BeautifulSoup(response.text, 'html.parser') + + # Initialize variables + listed_date = None + photo = None + link = None + + # Extract the detailed listing URL + link_tag = soup.find('a', class_='btn cab waves-effect waves-light btn-details show-listing-details') + if link_tag and 'href' in link_tag.attrs: + link = f"https://www.bhhscalifornia.com{link_tag['href']}" + + # Extract the photo URL + photo_tag = soup.find('a', class_='show-listing-details') + if photo_tag and photo_tag.find('img'): + photo = photo_tag.find('img')['src'] + + # Extract the listed date + date_tag = soup.find('p', class_='summary-mlsnumber') + if date_tag: + listed_date_text = date_tag.text.split()[-1] + listed_date = pd.Timestamp(listed_date_text) + + return listed_date, photo, link + + except requests.HTTPError as e: + logger.warning(f"HTTP error occurred while scraping BHHS page for {mls_number}: {e}") except Exception as e: - logger.warning(f"Error scraping BHHS page for {mls_number}: {e}.") + logger.warning(f"Error scraping BHHS page for {mls_number}: {e}") + + return None, None, None + +def extract_street_name(full_street_address: str) -> Optional[str]: + """ + Extracts the street name from a full street address. + + This function handles addresses with or without unit numbers and directional indicators. + It splits the address to isolate the street name component. + + Args: + full_street_address (str): The full street address (e.g., "118 S Cordova ST #B, ALHAMBRA 91801") + Returns: + Optional[str]: The extracted street name in lowercase if successful; otherwise, None. + """ + # Split the address at the comma + address_first_part = full_street_address.split(',')[0].strip() + # Remove unit numbers (e.g., #A, #1/2) + address_first_part = re.sub(r'#\S+', '', address_first_part) + # Split the first part by spaces + tokens = address_first_part.split() + # Check if tokens are sufficient + if len(tokens) >= 2: + possible_direction = tokens[1].upper() + if possible_direction in ['N', 'S', 'E', 'W', 'NE', 'NW', 'SE', 'SW']: + # Direction present + if len(tokens) >= 3: + street_name = tokens[2] + else: + return None + else: + # No direction + street_name = tokens[1] + return street_name.lower() + else: + # Can't extract street name + return None + +def extract_zip_code(full_street_address: str) -> Optional[str]: + """ + Extracts the ZIP code from a full street address. + + Uses regular expressions to find a 5-digit ZIP code, optionally handling ZIP+4 formats. 
+
+    Args:
+        full_street_address (str): The full street address (e.g., "118 S Cordova ST #B, ALHAMBRA 91801")
+
+    Returns:
+        Optional[str]: The extracted ZIP code if successful; otherwise, None.
+    """
+    match = re.search(r'\b\d{5}(?:-\d{4})?\b', full_street_address)
+    if match:
+        return match.group()
+    else:
+        return None
+
+def fetch_the_agency_data(mls_number: str, row_index: int, total_rows: int, full_street_address: str) -> Tuple[Optional[date], Optional[str], Optional[str]]:
+    """
+    Fetches property data for a given MLS number from The Agency API and scrapes the detail page for the image source.
+
+    Parameters:
+    mls_number (str): The MLS number of the property to fetch.
+    row_index (int): The row index for logging or debugging purposes.
+    total_rows (int): Total rows being processed for progress indication.
+    full_street_address (str): The full street address of the property (e.g., "118 S Cordova ST #B, ALHAMBRA 91801").
+
+    Returns:
+    Tuple[Optional[date], Optional[str], Optional[str]]:
+    - The listing date (as a datetime.date object) if found; otherwise, None.
+    - The detail URL of the property if found; otherwise, None.
+    - The first property image URL if found; otherwise, None.
+    Returns (None, None, None) if no matching property is found or if an error occurs.
+    """
+    url = "https://search-service.idcrealestate.com/api/property"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
+        "Accept": "*/*",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate, br, zstd",
+        "X-Tenant": "AGY",
+        "X-TenantMode": "Production",
+        "X-TenantHost": "theagencyre.com",
+        "Content-Type": "application/json",
+        "Origin": "https://www.theagencyre.com",
+        "Connection": "keep-alive",
+        "Referer": "https://www.theagencyre.com/",
+        "Sec-Fetch-Dest": "empty",
+        "Sec-Fetch-Mode": "cors",
+        "Sec-Fetch-Site": "cross-site",
+        "Priority": "u=4",
+        "Pragma": "no-cache",
+        "Cache-Control": "no-cache"
+    }
+    normalized_mls_number = mls_number.replace("-", "").replace("_", "")
+    payload = {
+        "urlquery": f"/rent/search-{normalized_mls_number}/rental-true",
+        "countrystate": "",
+        "zoom": 21
+    }
+    #logger.debug(payload)
+    try:
+        response = requests.post(url, headers=headers, json=payload)
+        response.raise_for_status()
+        #logger.debug(response.text)
+
+        # Parse JSON response
+        data = response.json()
+
+        # Extract the street name and zip code
+        street_name = extract_street_name(full_street_address)
+        if not street_name:
+            logger.warning(f"Could not extract street name from address: {full_street_address}")
+            return None, None, None
+        logger.debug(f"Extracted street name: {street_name}")
+
+        zip_code = extract_zip_code(full_street_address)
+        if not zip_code:
+            logger.warning(f"Could not extract zip code from address: {full_street_address}")
+            return None, None, None
+        logger.debug(f"Extracted zip code: {zip_code}")
+
+        # Filter items based on the street name and zip code
+        filtered_items = [
+            item for item in data.get("items", [])
+            if street_name in item.get("fullAddress", "").lower() and zip_code in item.get("fullAddress", "").lower()
+        ]
+
+        if filtered_items:
+            if len(filtered_items) > 1:
+                logger.warning(f"Multiple properties found for street name '{street_name}' and zip code '{zip_code}'. Using the first one.")
+            item = filtered_items[0]
+            list_date_timestamp = int(item.get("listDate", 0))
+            list_date = datetime.fromtimestamp(list_date_timestamp, tz=timezone.utc).date()
+            detail_url = f"https://www.theagencyre.com{item.get('detailUrl', '')}"
+            detail_response = requests.get(detail_url, headers=headers)
+            detail_response.raise_for_status()
+            detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
+            img_tag = detail_soup.find("img", {"data-src": lambda x: x and x.endswith("_1.jpg")})
+            img_src = img_tag["data-src"] if img_tag else None
+            logger.success(f"Successfully fetched {list_date} {detail_url} {img_src} for MLS {mls_number}")
+            return list_date, detail_url, img_src
+
+        logger.warning(f"No property found on The Agency with street name '{street_name}' and zip code '{zip_code}'.")
+        return None, None, None
+
+    except requests.HTTPError as e:
+        logger.error(f"HTTP error occurred: {e}")
+        logger.debug(f"Response content: {e.response.text}")
+    except requests.RequestException as e:
+        logger.error(f"Request error occurred: {e}")
+    except Exception as e:
+        logger.error(f"An unexpected error occurred: {e}")
     return None, None, None
 
 def update_hoa_fee(df: pd.DataFrame, mls_number: str) -> None:
diff --git a/lease_dataframe.py b/lease_dataframe.py
index 00101d78..a0f81f39 100644
--- a/lease_dataframe.py
+++ b/lease_dataframe.py
@@ -1,5 +1,5 @@
 from dotenv import load_dotenv, find_dotenv
-from functions.dataframe_utils import remove_expired_listings
+from functions.dataframe_utils import remove_inactive_listings, update_dataframe_with_listing_data
 from functions.geocoding_utils import *
 from functions.mls_image_processing_utils import *
 from functions.noise_level_utils import *
@@ -36,7 +36,7 @@
 # Load all CSVs and concat into one dataframe
 # https://stackoverflow.com/a/21232849
 path = "."
-all_files = glob.glob(os.path.join(path, "*lacountyrentals*.csv")) +all_files = glob.glob(os.path.join(path, "*Renter*.csv")) df = pd.concat((pd.read_csv(f, float_precision="round_trip", skipinitialspace=True) for f in all_files), ignore_index=True) pd.set_option("display.precision", 10) @@ -45,136 +45,153 @@ # https://stackoverflow.com/a/36082588 df.columns = df.columns.str.strip() -# Standardize the column names by renaminmg them +# Convert all column names to lowercase +df.columns = df.columns.str.lower() + +# Standardize the column names by renaming them # https://stackoverflow.com/a/65332240 # Define a renaming dictionary based on patterns rename_dict = { - 'Garage Spaces': 'garage_spaces', - 'List Office Phone': 'phone_number', - 'Listing': 'mls_number', - 'St Name': 'street_name', - 'St#': 'street_number', - 'Sub Type': 'subtype', - 'Yr': 'YrBuilt', + 'agent': 'phone_number', + 'allowed': 'pet_policy', + 'baths': 'bathrooms', + 'bedrooms': 'bedrooms', + 'city': 'city', + 'furnished': 'furnished', + 'key': 'key_deposit', + 'laundry': 'laundry', + 'list': 'list_price', + 'lot': 'lot_size', + 'mls': 'mls_number', + 'name': 'street_name', + 'other': 'other_deposit', + 'pet deposit': 'pet_deposit', + 'prking': 'parking_spaces', + 'security': 'security_deposit', + 'sqft': 'sqft', + 'square': 'ppsqft', + 'st #': 'street_number', + 'sub': 'subtype', + 'terms': 'terms', + 'yr': 'year_built', + 'zip': 'zip_code', } -# Check if 'Price Per' column exists and add to renaming dictionary -if any(col.startswith('Price Per') for col in df.columns): - rename_dict['Price Per'] = 'ppsqft' - -# Rename columns +# Rename columns based on substrings in the column names df = df.rename(columns=lambda c: next((v for k, v in rename_dict.items() if k in c), c)) -# Special case for list price due to additional condition -df = df.rename(columns=lambda c: 'list_price' if c.startswith('List') and c.endswith('Price') else c) +# Drop the numbers in the first group of characters in the street_name column +df['street_name'] = df['street_name'].str.replace(r'^\d+\s*', '', regex=True) # Drop all rows with misc/irrelevant data df.dropna(subset=['street_name'], inplace=True) # Columns to clean -cols = ['DepositKey', 'DepositOther', 'DepositPets', 'DepositSecurity', 'list_price', 'Sqft', 'YrBuilt'] -if 'ppsqft' in df.columns: - cols.append('ppsqft') +cols = ['key_deposit', 'other_deposit', 'security_deposit', 'list_price', 'pet_deposit'] +# Remove all non-numeric characters, convert to numeric, round to integers, fill NaNs with pd.NA, and cast to Nullable Integer Type +df[cols] = ( + df[cols] + .replace({r'\$': '', ',': ''}, regex=True) + .apply(pd.to_numeric, errors='coerce') + .round(0) # Round to ensure values are integers + .astype(pd.UInt16Dtype()) +) -# Remove all non-numeric characters, convert to numeric, and cast to Nullable Integer Type -df[cols] = df[cols].replace(to_replace='[^\d]', value='', regex=True).apply(pd.to_numeric, errors='coerce').astype(pd.Int64Dtype()) +# Cast 'sqft' to UInt32 +df['sqft'] = df['sqft'].replace({',': ''}, regex=True).astype(pd.UInt32Dtype()) + +# Convert other columns to appropriate data types +df = df.astype({ + 'year_built': 'UInt16', + 'parking_spaces': 'UInt8', + 'street_number': 'string' +}) + +# Handle lot_size column separately by removing commas, converting to numeric, and then to UInt32 +df['lot_size'] = ( + df['lot_size'] + .replace({',': ''}, regex=True) + .apply(pd.to_numeric, errors='coerce') + .astype(pd.UInt32Dtype()) +) -# Check if 'ppsqft' column exists -if 'ppsqft' 
not in df.columns:
-    # If it has a different name, replace 'Sqft' below with the correct column name
-    df['ppsqft'] = (df['list_price'] / df['Sqft']).round(2)
-
-# Fetch missing city names
-for row in df.loc[(df['City'].isnull()) & (df['PostalCode'].notnull())].itertuples():
-    df.at[row.Index, 'City'] = fetch_missing_city(f"{row.street_number} {row.street_name} {str(row.PostalCode)}", geolocator=g)
+# Cast ppsqft as a float, stripping the leading $ sign and commas but keeping the decimal point
+df['ppsqft'] = df['ppsqft'].replace(to_replace=r'[^\d.]', value='', regex=True).astype(pd.Float32Dtype())
 
 # Columns to be cast as strings
-cols = ['street_number', 'street_name', 'City', 'mls_number', 'SeniorCommunityYN', 'Furnished', 'LaundryFeatures', 'subtype']
-
-for col in cols:
-    # If the column exists, replace empty strings with NaNs
-    if col in df.columns:
-        df[col] = df[col].replace(r'^\s*$', pd.NA, regex=True)
-    # If the column does not exist, create it and fill it with NaNs
-    else:
-        df[col] = pd.NA
-    # Cast the column as a string type (NA values will remain as NA)
-    df[col] = df[col].astype(pd.StringDtype())
+cols = ['mls_number', 'phone_number', 'street_name', 'zip_code', 'city']
+df[cols] = df[cols].astype(pd.StringDtype())
+
+# Columns to be cast as categories
+cols = ['pet_policy', 'furnished', 'subtype', 'terms', 'laundry']
+df[cols] = df[cols].astype(pd.CategoricalDtype())
+
+# Extract total bathrooms and bathroom types (Full, Three-Quarter, Half, Quarter)
+df[['total_bathrooms', 'full_bathrooms', 'three_quarter_bathrooms', 'half_bathrooms', 'quarter_bathrooms']] = df['bathrooms'].str.extract(r'(\d+\.\d+)\s\((\d+)\s(\d+)\s(\d+)\s(\d+)\)').astype(float)
+
+# Convert bathroom columns to nullable integer type
+for col in ['total_bathrooms', 'full_bathrooms', 'three_quarter_bathrooms', 'half_bathrooms', 'quarter_bathrooms']:
+    df[col] = df[col].astype(pd.UInt8Dtype())
+
+# Drop the original 'Baths(FTHQ)' column since we've extracted the data we need
+df.drop(columns=['bathrooms'], inplace=True)
+
+# Convert bedrooms to nullable integer type
+df['bedrooms'] = df['bedrooms'].astype(pd.UInt8Dtype())
+
+# Fetch missing city names
+for row in df.loc[(df['city'].isnull()) & (df['zip_code'].notnull())].itertuples():
+    df.at[row.Index, 'city'] = fetch_missing_city(f"{row.street_number} {row.street_name} {str(row.zip_code)}", geolocator=g)
 
 # Create a new column with the Street Number & Street Name
-df["short_address"] = df["street_number"] + ' ' + df["street_name"] + ',' + ' ' + df['City']
+df["short_address"] = (df["street_number"].astype(str) + ' ' + df["street_name"] + ', ' + df['city']).astype(pd.StringDtype())
 
 # Filter the dataframe and return only rows with a NaN postal code
 # For some reason some Postal Codes are "Assessor" :| so we need to include that string in an OR operation
 # Then iterate through this filtered dataframe and input the right info we get using geocoding
-for row in df.loc[(df['PostalCode'].isnull()) | (df['PostalCode'] == 'Assessor')].itertuples():
+for row in df.loc[(df['zip_code'].isnull()) | (df['zip_code'] == 'Assessor')].itertuples():
     short_address = df.at[row.Index, 'short_address']
-    missing_postalcode = return_postalcode(short_address, geolocator=g)
-    df.at[row.Index, 'PostalCode'] = missing_postalcode
+    missing_zip_code = return_zip_code(short_address, geolocator=g)
+    df.at[row.Index, 'zip_code'] = missing_zip_code
 
-df['PostalCode'] = df['PostalCode'].apply(pd.to_numeric, errors='coerce').astype(pd.Int64Dtype())
+df['zip_code'] = df['zip_code'].astype(pd.StringDtype())
 
 # Tag each row with the date it was processed
 for row in df.itertuples():
     df.at[row.Index, 'date_processed'] = pd.Timestamp.today()
 
 # Create a new column with the full street address
-# Also strip whitespace from the St Name column
-# Convert the postal code into a string so we can combine string and int
-# https://stackoverflow.com/a/11858532
-df["full_street_address"] = df["street_number"] + ' ' + df["street_name"].str.strip() + ',' + ' ' + df['City'] + ' ' + df["PostalCode"].astype(str)
+df["full_street_address"] = (
+    df["street_number"].astype(str) + ' ' +
+    df["street_name"].str.strip() + ', ' +
+    df['city'] + ' ' +
+    df["zip_code"].astype(str)
+).astype(pd.StringDtype())
 
-# Iterate through the dataframe and get the listed date and photo for rows
-for row in df.itertuples():
-    mls_number = row[1]
-    webscrape = asyncio.run(webscrape_bhhs(url=f"https://www.bhhscalifornia.com/for-lease/{mls_number}-t_q;/", row_index=row.Index, mls_number=mls_number, total_rows=len(df)))
-    df.at[row.Index, 'listed_date'] = webscrape[0]
-    df.at[row.Index, 'mls_photo'] = imagekit_transform(webscrape[1], row[1], imagekit_instance=imagekit)
-    df.at[row.Index, 'listing_url'] = webscrape[2]
+# Iterate through the dataframe and get the listed date and photo for rows
+df = update_dataframe_with_listing_data(df, imagekit_instance=imagekit)
 
 # Iterate through the dataframe and fetch coordinates for rows
 for row in df.itertuples():
     coordinates = return_coordinates(address=row.full_street_address, row_index=row.Index, geolocator=g, total_rows=len(df))
-    df.at[row.Index, 'Latitude'] = coordinates[0]
-    df.at[row.Index, 'Longitude'] = coordinates[1]
-
-#df = update_howloud_scores(df)
-
-# Split the Bedroom/Bathrooms column into separate columns based on delimiters
-# Based on the example given in the spreadsheet: 2 (beds) / 1 (total baths),1 (full baths) ,0 (half bath), 0 (three quarter bath)
-# Realtor logic based on https://www.realtor.com/advice/sell/if-i-take-out-the-tub-does-a-bathroom-still-count-as-a-full-bath/
-# TIL: A full bathroom is made up of four parts: a sink, a shower, a bathtub, and a toilet. Anything less than thpdat, and you can’t officially consider it a full bath.
-df['Bedrooms'] = df['Br/Ba'].str.split('/', expand=True)[0] -df['Total Bathrooms'] = (df['Br/Ba'].str.split('/', expand=True)[1]).str.split(',', expand=True)[0] -df['Full Bathrooms'] = (df['Br/Ba'].str.split('/', expand=True)[1]).str.split(',', expand=True)[1] -df['Half Bathrooms'] = (df['Br/Ba'].str.split('/', expand=True)[1]).str.split(',', expand=True)[2] -df['Three Quarter Bathrooms'] = (df['Br/Ba'].str.split('/', expand=True)[1]).str.split(',', expand=True)[3] - -# Convert a few columns into int64 -# pd.to_numeric will convert into int64 or float64 automatically, which is cool -# These columns are assumed to have NO MISSING DATA, so we can cast them as int64 instead of floats (ints can't handle NaNs) -df['Bedrooms'] = df['Bedrooms'].apply(pd.to_numeric, errors='coerce') -df['Total Bathrooms'] = df['Total Bathrooms'].apply(pd.to_numeric) -# These columns should stay floats -df['Latitude'] = df['Latitude'].apply(pd.to_numeric, errors='coerce') -df['Longitude'] = df['Longitude'].apply(pd.to_numeric, errors='coerce') -df['garage_spaces'] = df['garage_spaces'].astype('Float64') + df.at[row.Index, 'latitude'] = coordinates[0] + df.at[row.Index, 'longitude'] = coordinates[1] -# Replace all empty values in the following columns with NaN and cast the column as dtype string -# https://stackoverflow.com/a/47810911 -df.Terms = df.Terms.astype("string").replace(r'^\s*$', pd.NA, regex=True) +# These columns should stay floats +df['latitude'] = df['latitude'].apply(pd.to_numeric, errors='raise', downcast='float') +df['longitude'] = df['longitude'].apply(pd.to_numeric, errors='raise', downcast='float') ## Laundry Features ## # Replace all empty values in the following column with "Unknown" and cast the column as dtype string -df.LaundryFeatures = df.LaundryFeatures.astype("string").replace(r'^\s*$', "Unknown", regex=True) +df.laundry = df.laundry.astype("string").replace(r'^\s*$', "Unknown", regex=True) # Fill in any NaNs in the Laundry column with "Unknown" -df.LaundryFeatures = df.LaundryFeatures.fillna(value="Unknown") -# Any string containing "Community" in the Laundry column should be replaced with "Community Laundry" -df['LaundryFeatures'] = df['LaundryFeatures'].str.replace("Community", "Community Laundry") -# Any string containing "Common" in the Laundry column should be replaced with "Community Laundry" -df['LaundryFeatures'] = df['LaundryFeatures'].str.replace("Common", "Community Laundry") -# Replace "Community Laundry Area" with "Community Laundry" -df['LaundryFeatures'] = df['LaundryFeatures'].str.replace("Community Laundry Area", "Community Laundry") +df.laundry = df.laundry.fillna(value="Unknown") +# Replace various patterns in the Laundry column with "Community Laundry" +df.laundry = df.laundry.str.replace( + r'Community Laundry Area|Laundry Area|Community|Common', + 'Community Laundry', + regex=True +) # Convert the listed date into DateTime and use the "mixed" format to handle the different date formats # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html @@ -183,30 +200,6 @@ # Convert date_processed into DateTime df['date_processed'] = pd.to_datetime(df['date_processed'], errors='coerce', format='%Y-%m-%d') -# Per CA law, ANY type of deposit is capped at rent * 3 months -# It doesn't matter the type of deposit, they all have the same cap -# Despite that, some landlords/realtors will list the property with an absurd deposit (100k? 
wtf) so let's rewrite those -# Use numpy .values to rewrite anything greater than $18000 ($6000 rent * 3 months) into $18000 -# https://stackoverflow.com/a/54426197 -df['DepositSecurity'].values[df['DepositSecurity'] > 18000] = 18000 -df['DepositPets'].values[df['DepositPets'] > 18000] = 18000 -df['DepositOther'].values[df['DepositOther'] > 18000] = 18000 -df['DepositKey'].values[df['DepositKey'] > 18000] = 18000 - -# Rewrite anything greater than 5000 square feet as NaN -# Because there's no fucking way there's a RENTAL PROPERTY that is 5000+ sqft in this city -# It clearly must be some kind of clerical error so a NaN (unknown) is more appropriate -# All that being said, I should peruse new spreadsheets to make sure there isn't actually a valid property exceeds 5000 sqft -df['Sqft'].values[df['Sqft'] > 5000] = pd.NA - -# Rewrite anything with >5 garage spaces as None -df['garage_spaces'].values[df['garage_spaces'] > 5] = None - -# Keep rows with less than 6 bedrooms -# 6 bedrooms and above are probably multi family investments and not actual rentals -# They also skew the outliers, causing the sliders to go way up -df = df[df.Bedrooms < 6] - # Reindex the dataframe df.reset_index(drop=True, inplace=True) @@ -221,25 +214,25 @@ # Drop any dupes again df_combined = df_combined.drop_duplicates(subset=['mls_number'], keep="last") # Iterate through the dataframe and drop rows with expired listings -df_combined = asyncio.run(remove_expired_listings(df_combined, limiter)) +df_combined = remove_inactive_listings(df_combined) # Reset the index df_combined = df_combined.reset_index(drop=True) # Filter the dataframe for rows outside of California outside_ca_rows = df_combined[ - (df_combined['Latitude'] < 32.5) | - (df_combined['Latitude'] > 42) | - (df_combined['Longitude'] < -124) | - (df_combined['Longitude'] > -114) + (df_combined['latitude'] < 32.5) | + (df_combined['latitude'] > 42) | + (df_combined['longitude'] < -124) | + (df_combined['longitude'] > -114) ] total_outside_ca = len(outside_ca_rows) counter = 0 for row in outside_ca_rows.itertuples(): counter += 1 - logger.warning(f"Row {counter} out of {total_outside_ca}: {row.mls_number} has coordinates {row.Latitude}, {row.Longitude} which is outside California. Re-geocoding {row.mls_number}...") + logger.warning(f"Row {counter} out of {total_outside_ca}: {row.mls_number} has coordinates {row.latitude}, {row.longitude} which is outside California. 
Re-geocoding {row.mls_number}...") # Re-geocode the row coordinates = return_coordinates(address=row.full_street_address, row_index=row.Index, geolocator=g, total_rows=len(df)) - df_combined.at[row.Index, 'Latitude'] = coordinates[0] - df_combined.at[row.Index, 'Longitude'] = coordinates[1] + df_combined.at[row.Index, 'latitude'] = coordinates[0] + df_combined.at[row.Index, 'longitude'] = coordinates[1] # Save the new combined dataframe try: df_combined.to_parquet(path="assets/datasets/lease.parquet") diff --git a/pages/components.py b/pages/components.py index a5ea9053..1abb45ab 100644 --- a/pages/components.py +++ b/pages/components.py @@ -105,7 +105,7 @@ def __init__(self, df): # Initalize these first because they are used in other components self.df = df - self.df['LaundryCategory'] = self.df['LaundryFeatures'].apply(self.categorize_laundry_features) + self.df['laundry'] = self.df['laundry'].apply(self.categorize_laundry_features) self.bathrooms_slider = self.create_bathrooms_slider() self.bedrooms_slider = self.create_bedrooms_slider() @@ -134,7 +134,7 @@ def __init__(self, df): self.user_options_card = self.create_user_options_card() def categorize_laundry_features(self, feature): - if feature is None or feature in [np.nan, 'Unknown', '']: + if pd.isna(feature) or feature in ['Unknown', '']: return 'Unknown' if any(keyword in feature for keyword in ['In Closet', 'In Kitchen', 'In Garage', 'Inside', 'Individual Room']): return 'In Unit' @@ -197,9 +197,9 @@ def create_bedrooms_slider(self): html.Div([ dcc.RangeSlider( min=0, - max=self.df['Bedrooms'].max(), + max=self.df['bedrooms'].max(), step=1, - value=[0, self.df['Bedrooms'].max()], + value=[0, self.df['bedrooms'].max()], id='bedrooms_slider', updatemode='mouseup', tooltip={ @@ -225,9 +225,9 @@ def create_bathrooms_slider(self): html.Div([ dcc.RangeSlider( min=0, - max=self.df['Total Bathrooms'].max(), + max=self.df['total_bathrooms'].max(), step=1, - value=[0, self.df['Total Bathrooms'].max()], + value=[0, self.df['total_bathrooms'].max()], id='bathrooms_slider', updatemode='mouseup', tooltip={ @@ -252,9 +252,9 @@ def create_sqft_components(self): ]), html.Div([ dcc.RangeSlider( - min=self.df['Sqft'].min(), - max=self.df['Sqft'].max(), - value=[self.df['Sqft'].min(), self.df['Sqft'].max()], + min=self.df['sqft'].min(), + max=self.df['sqft'].max(), + value=[self.df['sqft'].min(), self.df['sqft'].max()], id='sqft_slider', updatemode='mouseup', tooltip={ @@ -380,11 +380,23 @@ def create_pets_radio_button(self): return pets_radio def create_rental_terms_checklist(self): - # Logic to calculate unique_terms - unique_terms = pd.Series([term for sublist in self.df['Terms'].fillna('Unknown').str.split(',') for term in sublist]).unique() + # Add 'Unknown' to categories if necessary + if pd.api.types.is_categorical_dtype(self.df['terms']): + if 'Unknown' not in self.df['terms'].cat.categories: + self.df['terms'] = self.df['terms'].cat.add_categories('Unknown') + + # Fill NaN values with 'Unknown' + terms_series = self.df['terms'].fillna('Unknown') + + # Split terms and flatten the list + unique_terms = pd.Series([ + term.strip() for sublist in terms_series.str.split(',') + if sublist for term in sublist + ]).unique() + unique_terms = sorted(unique_terms) - # Define term_abbreviations and terms + # Define term abbreviations and labels term_abbreviations = { '12M': '12 Months', '24M': '24 Months', @@ -398,7 +410,8 @@ def create_rental_terms_checklist(self): 'VR': 'Vacation Rental', 'WK': 'Week-to-Week', } - terms = {k: term_abbreviations[k] 
for k in sorted(term_abbreviations)} + + terms = {k: term_abbreviations.get(k, k) for k in unique_terms} # Create the Dash component rental_terms_checklist = html.Div([ @@ -410,33 +423,29 @@ def create_rental_terms_checklist(self): dcc.Checklist( id='terms_checklist', options=[{'label': f"{terms[term]} ({term})", 'value': term} for term in terms], - value=[term['value'] for term in [{'label': "Unknown" if pd.isnull(term) else term, 'value': "Unknown" if pd.isnull(term) else term} for term in unique_terms]], - inputStyle={ - "margin-right": "5px", - "margin-left": "5px" - }, + value=unique_terms, # Select all terms by default + inputStyle={"margin-right": "5px", "margin-left": "5px"}, inline=False ), ], - id={'type': 'dynamic_output_div_lease', 'index': 'rental_terms'}, + id={'type': 'dynamic_output_div_lease', 'index': 'rental_terms'}, ), ], - id='rental_terms_div' + id='rental_terms_div' ) - return rental_terms_checklist def create_garage_spaces_components(self): garage_spaces_components = html.Div([ html.Div([ - html.H5("Garage Spaces", style={'display': 'inline-block', 'margin-right': '10px'}), + html.H5("Parking Spaces", style={'display': 'inline-block', 'margin-right': '10px'}), create_toggle_button(index='garage_spaces', initial_label="Hide", page_type='lease') ]), html.Div([ dcc.RangeSlider( min=0, - max=self.df['garage_spaces'].max(), - value=[0, self.df['garage_spaces'].max()], + max=self.df['parking_spaces'].max(), + value=[0, self.df['parking_spaces'].max()], id='garage_spaces_slider', updatemode='mouseup', tooltip={ @@ -516,9 +525,9 @@ def create_year_built_components(self): ], style={'display': 'inline-block'}), html.Div([ dcc.RangeSlider( - min=self.df['YrBuilt'].min(), - max=self.df['YrBuilt'].max(), - value=[0, self.df['YrBuilt'].max()], + min=self.df['year_built'].min(), + max=self.df['year_built'].max(), + value=[0, self.df['year_built'].max()], id='yrbuilt_slider', updatemode='mouseup', tooltip={ @@ -526,7 +535,7 @@ def create_year_built_components(self): "always_visible": True }, marks={ - float(self.df['YrBuilt'].min() + i*20): str(self.df['YrBuilt'].min() + i*20) for i in range(8) + float(self.df['year_built'].min() + i*20): str(self.df['year_built'].min() + i*20) for i in range(8) } ), dbc.Alert( @@ -610,9 +619,9 @@ def create_security_deposit_components(self): ]), html.Div([ dcc.RangeSlider( - min=self.df['DepositSecurity'].min(), - max=self.df['DepositSecurity'].max(), - value=[self.df['DepositSecurity'].min(), self.df['DepositSecurity'].max()], + min=self.df['security_deposit'].min(), + max=self.df['security_deposit'].max(), + value=[self.df['security_deposit'].min(), self.df['security_deposit'].max()], id='security_deposit_slider', updatemode='mouseup', tooltip={ @@ -662,9 +671,9 @@ def create_other_deposit_components(self): ]), html.Div([ dcc.RangeSlider( - min=self.df['DepositOther'].min(), - max=self.df['DepositOther'].max(), - value=[self.df['DepositOther'].min(), self.df['DepositOther'].max()], + min=self.df['other_deposit'].min(), + max=self.df['other_deposit'].max(), + value=[self.df['other_deposit'].min(), self.df['other_deposit'].max()], id='other_deposit_slider', updatemode='mouseup', tooltip={ @@ -714,9 +723,9 @@ def create_pet_deposit_components(self): ]), html.Div([ dcc.RangeSlider( - min=self.df['DepositPets'].min(), - max=self.df['DepositPets'].max(), - value=[self.df['DepositPets'].min(), self.df['DepositPets'].max()], + min=self.df['pet_deposit'].min(), + max=self.df['pet_deposit'].max(), + value=[self.df['pet_deposit'].min(), 
self.df['pet_deposit'].max()],
                     id='pet_deposit_slider',
                     updatemode='mouseup',
                     tooltip={
@@ -766,9 +775,9 @@ def create_key_deposit_components(self):
             ]),
             html.Div([
                 dcc.RangeSlider(
-                    min=self.df['DepositKey'].min(),
-                    max=self.df['DepositKey'].max(),
-                    value=[self.df['DepositKey'].min(), self.df['DepositKey'].max()],
+                    min=self.df['key_deposit'].min(),
+                    max=self.df['key_deposit'].max(),
+                    value=[self.df['key_deposit'].min(), self.df['key_deposit'].max()],
                     id='key_deposit_slider',
                     updatemode='mouseup',
                     tooltip={
@@ -818,9 +827,9 @@ def create_key_deposit_components(self):
             ]),
             html.Div([
                 dcc.RangeSlider(
-                    min=self.df['DepositKey'].min(),
-                    max=self.df['DepositKey'].max(),
-                    value=[self.df['DepositKey'].min(), self.df['DepositKey'].max()],
+                    min=self.df['key_deposit'].min(),
+                    max=self.df['key_deposit'].max(),
+                    value=[self.df['key_deposit'].min(), self.df['key_deposit'].max()],
                     id='key_deposit_slider',
                     updatemode='mouseup',
                     tooltip={
@@ -863,8 +872,10 @@ def create_key_deposit_components(self):
         return key_deposit_components
 
     def create_laundry_checklist(self):
+        # Replace NaN values with 'Unknown' before sorting
+        laundry_series = self.df['laundry'].fillna('Unknown')
         # Get unique laundry categories sorted alphabetically
-        unique_categories = sorted(self.df['LaundryCategory'].unique())
+        unique_categories = sorted(laundry_series.unique())
 
         # Create options for the checklist
         laundry_options = [
@@ -962,7 +973,7 @@ def create_map(self):
             id='map',
             zoom=9,
             minZoom=9,
-            center=(self.df['Latitude'].mean(), self.df['Longitude'].mean()),
+            center=(self.df['latitude'].mean(), self.df['longitude'].mean()),
             preferCanvas=True,
             closePopupOnClick=True,
             style={'width': '100%', 'height': '90vh', 'margin': "auto", "display": "inline-block"}
@@ -1138,7 +1149,7 @@ def create_bedrooms_slider(self):
 
             # Title and toggle button
             html.Div([
-                html.H5("Bedrooms", style={'display': 'inline-block', 'margin-right': '10px'}),
+                html.H5("Bedrooms", style={'display': 'inline-block', 'margin-right': '10px'}),
                 create_toggle_button(index='bedrooms', initial_label="Hide", page_type='buy')
             ]),
 
diff --git a/pages/filters.py b/pages/filters.py
index 52e23620..2fe1a0cf 100644
--- a/pages/filters.py
+++ b/pages/filters.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, List
 import pandas as pd
 import re
 
@@ -7,24 +7,24 @@ class LeaseFilters:
     def __init__(self, df):
         self.df = df
 
-    def sqft_radio_button(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series:
+    def sqft_radio_button(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series:
         """
         Filter the dataframe based on whether properties with missing square footage should be included.
 
         Args:
         - include_missing (bool): Whether properties with missing square footage should be included.
-        - slider_begin (float): Start value of the square footage slider.
-        - slider_end (float): End value of the square footage slider.
+        - slider_begin (int): Start value of the square footage slider.
+        - slider_end (int): End value of the square footage slider.
 
         Returns:
         - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions.
""" if include_missing: # Include properties with missing square footage - sqft_choice = self.df['Sqft'].isnull() | self.df['Sqft'].between(slider_begin, slider_end) + sqft_choice = self.df['sqft'].isnull() | self.df['sqft'].between(slider_begin, slider_end) else: # Exclude properties with missing square footage - sqft_choice = self.df['Sqft'].between(slider_begin, slider_end) + sqft_choice = self.df['sqft'].between(slider_begin, slider_end) return sqft_choice def yrbuilt_radio_button(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series: @@ -41,10 +41,10 @@ def yrbuilt_radio_button(self, include_missing: bool, slider_begin: int, slider_ """ if include_missing: # Include properties with missing year built - yrbuilt_choice = self.df['YrBuilt'].isnull() | self.df['YrBuilt'].between(slider_begin, slider_end) + yrbuilt_choice = self.df['year_built'].isnull() | self.df['year_built'].between(slider_begin, slider_end) else: # Exclude properties with missing year built - yrbuilt_choice = self.df['YrBuilt'].between(slider_begin, slider_end) + yrbuilt_choice = self.df['year_built'].between(slider_begin, slider_end) return yrbuilt_choice def garage_radio_button(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series: @@ -61,20 +61,20 @@ def garage_radio_button(self, include_missing: bool, slider_begin: int, slider_e """ if include_missing: # Include properties with missing garage spaces - garage_choice = self.df['garage_spaces'].isnull() | self.df['garage_spaces'].between(slider_begin, slider_end) + garage_choice = self.df['parking_spaces'].isnull() | self.df['parking_spaces'].between(slider_begin, slider_end) else: # Exclude properties with missing garage spaces - garage_choice = self.df['garage_spaces'].between(slider_begin, slider_end) + garage_choice = self.df['parking_spaces'].between(slider_begin, slider_end) return garage_choice - def ppsqft_radio_button(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series: + def ppsqft_radio_button(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series: """ Filter the dataframe based on whether properties with missing price per square foot should be included. Args: - include_missing (bool): Whether properties with missing price per square foot should be included. - - slider_begin (float): Start value of the price per square foot slider. - - slider_end (float): End value of the price per square foot slider. + - slider_begin (int): Start value of the price per square foot slider. + - slider_end (int): End value of the price per square foot slider. Returns: - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions. @@ -100,18 +100,18 @@ def pets_radio_button(self, choice: str) -> pd.Series: Returns: - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions. 
""" - if choice == True: + if choice == 'Yes': # Filter for rows where the pet policy allows pets (not 'No' or 'No, Size Limit') - pets_radio_choice = ~self.df['PetsAllowed'].isin(['No', 'No, Size Limit']) - elif choice == False: + pets_radio_choice = ~self.df['pet_policy'].isin(['No', 'No, Size Limit']) + elif choice == 'No': # Filter for rows where the pet policy does not allow pets - pets_radio_choice = self.df['PetsAllowed'].isin(['No', 'No, Size Limit']) - else: # Assuming 'Both' includes all rows - # Create a boolean Series of True for all rows to include everything + pets_radio_choice = self.df['pet_policy'].isin(['No', 'No, Size Limit']) + else: # 'Both' + # Include all properties regardless of pet policy pets_radio_choice = pd.Series([True] * len(self.df), index=self.df.index) return pets_radio_choice - def furnished_checklist_function(self, choice: list[str]) -> pd.Series: + def furnished_checklist_function(self, choice: List[str]) -> pd.Series: """ Filters the DataFrame for furnished dwellings based on the user's choice. @@ -120,244 +120,249 @@ def furnished_checklist_function(self, choice: list[str]) -> pd.Series: might not specify their furnished state. Args: - - choice (list[str]): A list of user-selected options regarding the furnished status. - Options include 'Furnished', 'Unfurnished', and 'Unknown'. + - choice (List[str]): A list of user-selected options regarding the furnished status. + Options include 'Furnished', 'Unfurnished', and 'Unknown'. Returns: - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions. """ - # Presort the list first for potentially faster performance - choice.sort() + if not choice: + # If no choices are selected, return False for all entries + return pd.Series([False] * len(self.df), index=self.df.index) + + filters = [] if 'Unknown' in choice: - # Include rows where Furnished status is NaN OR matches one of the selected choices - furnished_checklist_filter = self.df['Furnished'].isnull() | self.df['Furnished'].isin(choice) - else: - # If Unknown is NOT selected, return rows that match the selected choices (implies .notnull() by default) - furnished_checklist_filter = self.df['Furnished'].isin(choice) + # Include entries where 'furnished' is NaN + filters.append(self.df['furnished'].isna()) + # Remove 'Unknown' from choices to avoid filtering by it in 'isin' + choice = [c for c in choice if c != 'Unknown'] + + if choice: + # For remaining choices, filter where 'furnished' matches the choices + filters.append(self.df['furnished'].isin(choice)) + + # Combine filters using logical OR + furnished_checklist_filter = pd.Series(False, index=self.df.index) + for f in filters: + furnished_checklist_filter |= f + return furnished_checklist_filter - def security_deposit_function(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series: + def security_deposit_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series: """ - Filters the DataFrame for properties based on security deposit criteria, allowing - for the inclusion of properties without a security deposit listed. + Filter the dataframe based on whether properties with missing security deposit should be included. Args: - - include_missing (bool): Whether to include properties with no security deposit listed. - - slider_begin (float): The starting value of the range for the security deposit. - - slider_end (float): The ending value of the range for the security deposit. 
+ - include_missing (bool): Whether properties with missing security deposit should be included. + - slider_begin (int): Start value of the security deposit slider. + - slider_end (int): End value of the security deposit slider. Returns: - - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the - filter conditions based on the security deposit. + - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions. """ if include_missing: - # Include properties with no security deposit listed or within the specified range - security_deposit_filter = self.df['DepositSecurity'].isnull() | self.df['DepositSecurity'].between(slider_begin, slider_end) + # Include properties with missing security deposit + security_deposit_filter = self.df['security_deposit'].isnull() | self.df['security_deposit'].between(slider_begin, slider_end) else: - # Include properties within the specified range, implicitly excludes nulls - security_deposit_filter = self.df['DepositSecurity'].between(slider_begin, slider_end) + # Exclude properties with missing security deposit + security_deposit_filter = self.df['security_deposit'].between(slider_begin, slider_end) return security_deposit_filter - def pet_deposit_function(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series: + def pet_deposit_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series: """ - Filters the DataFrame for properties based on pet deposit criteria, allowing - for the inclusion of properties without a pet deposit listed. + Filter the dataframe based on whether properties with missing pet deposit should be included. Args: - - include_missing (bool): Whether to include properties with no pet deposit listed. - - slider_begin (float): The starting value of the range for the pet deposit. - - slider_end (float): The ending value of the range for the pet deposit. + - include_missing (bool): Whether properties with missing pet deposit should be included. + - slider_begin (int): Start value of the pet deposit slider. + - slider_end (int): End value of the pet deposit slider. Returns: - - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the - filter conditions based on the pet deposit. + - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions. """ if include_missing: - # Include properties with no pet deposit listed or within the specified range - pet_deposit_filter = self.df['DepositPets'].isnull() | self.df['DepositPets'].between(slider_begin, slider_end) + # Include properties with missing pet deposit + pet_deposit_filter = self.df['pet_deposit'].isnull() | self.df['pet_deposit'].between(slider_begin, slider_end) else: - # Include properties within the specified range, implicitly excludes nulls - pet_deposit_filter = self.df['DepositPets'].between(slider_begin, slider_end) + # Exclude properties with missing pet deposit + pet_deposit_filter = self.df['pet_deposit'].between(slider_begin, slider_end) return pet_deposit_filter - def key_deposit_function(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series: + def key_deposit_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series: """ - Filters the DataFrame for properties based on key deposit criteria, allowing - for the inclusion of properties without a key deposit listed. 
- - This function is designed to filter properties based on the presence or absence - of a key deposit and whether the key deposit amount falls within a specified range. + Filter the dataframe based on whether properties with missing key deposit should be included. Args: - - include_missing (bool): Whether to include properties with no key deposit listed. - - slider_begin (float): The starting value of the range for the key deposit. - - slider_end (float): The ending value of the range for the key deposit. + - include_missing (bool): Whether properties with missing key deposit should be included. + - slider_begin (int): Start value of the key deposit slider. + - slider_end (int): End value of the key deposit slider. Returns: - - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the - filter conditions based on the key deposit. + - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions. """ if include_missing: - # Include properties with no key deposit listed or within the specified range - key_deposit_filter = self.df['DepositKey'].isnull() | self.df['DepositKey'].between(slider_begin, slider_end) + # Include properties with missing key deposit + key_deposit_filter = self.df['key_deposit'].isnull() | self.df['key_deposit'].between(slider_begin, slider_end) else: - # Include properties within the specified range, implicitly excludes nulls - key_deposit_filter = self.df['DepositKey'].between(slider_begin, slider_end) + # Exclude properties with missing key deposit + key_deposit_filter = self.df['key_deposit'].between(slider_begin, slider_end) return key_deposit_filter - def other_deposit_function(self, include_missing: bool, slider_begin: float, slider_end: float) -> pd.Series: + def other_deposit_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series: """ - Filters the DataFrame for properties based on 'other' deposit criteria, allowing - for the inclusion of properties without an 'other' deposit listed. + Filter the dataframe based on whether properties with missing other deposit should be included. Args: - - include_missing (bool): Whether to include properties with no 'other' deposit listed. - - slider_begin (float): The starting value of the range for the 'other' deposit. - - slider_end (float): The ending value of the range for the 'other' deposit. + - include_missing (bool): Whether properties with missing other deposit should be included. + - slider_begin (int): Start value of the other deposit slider. + - slider_end (int): End value of the other deposit slider. Returns: - - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the - filter conditions based on the 'other' deposit. + - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions. 
""" if include_missing: - # Include properties with no 'other' deposit listed or within the specified range - other_deposit_filter = self.df['DepositOther'].isnull() | self.df['DepositOther'].between(slider_begin, slider_end) + # Include properties with missing other deposit + other_deposit_filter = self.df['other_deposit'].isnull() | self.df['other_deposit'].between(slider_begin, slider_end) else: - # Include properties within the specified range, implicitly excludes nulls - other_deposit_filter = self.df['DepositOther'].between(slider_begin, slider_end) + # Exclude properties with missing other deposit + other_deposit_filter = self.df['other_deposit'].between(slider_begin, slider_end) return other_deposit_filter - def listed_date_function(self, include_missing: bool, start_date: str, end_date: str) -> pd.Series: + def listed_date_function(self, include_missing: bool, start_date: Union[str, pd.Timestamp], end_date: Union[str, pd.Timestamp]) -> pd.Series: """ - Filters the DataFrame for properties based on the listing date criteria, allowing - for the inclusion of properties without a listed date. - - This function allows filtering properties based on whether there is a listing date - specified and whether this date falls within a given range. + Filter the dataframe based on whether properties with missing listed date should be included. Args: - - include_missing (bool): Whether to include properties with no listed date. - - start_date (str): The starting date of the range for the listing date, formatted as 'YYYY-MM-DD'. - - end_date (str): The ending date of the range for the listing date, formatted as 'YYYY-MM-DD'. + - include_missing (bool): Whether properties with missing listed date should be included. + - start_date (Union[str, pd.Timestamp]): Start date of the listed date range. + - end_date (Union[str, pd.Timestamp]): End date of the listed date range. Returns: - - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the - filter conditions based on the listing date. + - pd.Series: Boolean mask indicating which rows of the dataframe satisfy the filter conditions. """ + # Convert start_date and end_date to datetime if they are strings + start_date = pd.to_datetime(start_date) + end_date = pd.to_datetime(end_date) + if include_missing: - # Include properties with no listed date or within the specified date range + # Include properties with missing listed date listed_date_filter = self.df['listed_date'].isnull() | self.df['listed_date'].between(start_date, end_date) else: - # Include properties within the specified date range, implicitly excludes nulls + # Exclude properties with missing listed date listed_date_filter = self.df['listed_date'].between(start_date, end_date) return listed_date_filter - def terms_function(self, choice: list[str]) -> pd.Series: + def terms_function(self, choice: List[str]) -> pd.Series: """ - Filters the DataFrame based on specified terms in the 'Terms' column. Supports - inclusion of rows with missing values ('NaN') if 'Unknown' is part of the choices. - + Filters the DataFrame based on the rental lease terms according to the user's choice. + Args: - - choice (list[str]): A list of terms to filter the 'Terms' column by. Includes - special handling for 'Unknown' to include or exclude NaN values. - + - choice (List[str]): A list of user-selected terms. Options could include various terms like 'Lease', 'Month-to-Month', etc. 
+ Returns: - - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the - filter conditions. If no choices are made, it defaults to False for all rows. + - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions. """ - # Ensure choice list is not empty if not choice: + # If no choices are selected, return False for all entries return pd.Series([False] * len(self.df), index=self.df.index) - - # Presort the list for potentially faster performance - choice.sort() - # Corrected: Use re.escape for escaping regex special characters - choice_regex = '|'.join([re.escape(term) for term in choice if term != 'Unknown']) - # Handle 'Unknown' choice - if 'Unknown' in choice: - terms_filter = self.df['Terms'].isnull() | self.df['Terms'].str.contains(choice_regex, na=False) - else: - terms_filter = self.df['Terms'].str.contains(choice_regex, na=False) - - return terms_filter + # Handle 'Unknown' option + if 'Unknown' in choice: + unknown_filter = self.df['terms'].isnull() + # Remove 'Unknown' from choices to avoid filtering by it in 'str.contains' + choice = [c for c in choice if c != 'Unknown'] + else: + unknown_filter = pd.Series([False] * len(self.df), index=self.df.index) + + if choice: + # Create a regex pattern from the choice list, escaping any special characters + pattern = '|'.join([re.escape(term) for term in choice]) + # Use vectorized string matching for efficient filtering + terms_filter = self.df['terms'].str.contains(pattern, na=False, case=False) + else: + terms_filter = pd.Series([False] * len(self.df), index=self.df.index) + + # Combine filters + combined_filter = terms_filter | unknown_filter + return combined_filter - def laundry_checklist_function(self, choice: list[str]) -> pd.Series: + def laundry_checklist_function(self, choice: List[str]) -> pd.Series: """ - Filters the DataFrame for properties based on selected laundry features. - - Special handling for 'Other' to include properties that do not match any of the - predefined categories. 'Unknown' and 'None' are treated according to their selection. + Filters the DataFrame for laundry features based on the user's choice. Args: - - choice (list[str]): A list of user-selected laundry features. - + - choice (List[str]): A list of user-selected options regarding laundry features. + Options include types like 'In Unit', 'Shared', 'Hookups', + 'Included Appliances', 'Location Specific', 'Unknown', and 'Other'. + Returns: - - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy - the filter conditions based on laundry features. + - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions. 
""" - # Return False for all rows if the choice list is empty if not choice: + # If no choices are selected, return False for all entries return pd.Series([False] * len(self.df), index=self.df.index) - # Special case for 'Other' - if 'Other' in choice: - other_filter = ~self.df['LaundryCategory'].isin([ - 'In Unit', 'Shared', 'Hookups', 'Included Appliances', 'Location Specific', 'Unknown' - ]) - choice.remove('Other') - else: - other_filter = pd.Series([False] * len(self.df), index=self.df.index) - - # Handle 'Unknown' choice + filters = [] if 'Unknown' in choice: - unknown_filter = self.df['LaundryCategory'] == 'Unknown' - choice.remove('Unknown') - else: - unknown_filter = pd.Series([False] * len(self.df), index=self.df.index) + # Include entries where 'laundry' is NaN + filters.append(self.df['laundry'].isna()) + # Remove 'Unknown' from choices to avoid filtering by it in 'isin' + choice = [c for c in choice if c != 'Unknown'] + + if 'Other' in choice: + # Include entries where 'laundry' is not in known categories + known_categories = ['In Unit', 'Shared', 'Hookups', 'Included Appliances', 'Location Specific'] + other_filter = ~self.df['laundry'].isin(known_categories) + filters.append(other_filter) + # Remove 'Other' from choices + choice = [c for c in choice if c != 'Other'] - # Filter based on the remaining choices if choice: - choice_filter = self.df['LaundryCategory'].isin(choice) + # Filter where 'laundry' matches the choices + filters.append(self.df['laundry'].isin(choice)) + + # Combine filters using logical OR + if filters: + laundry_checklist_filter = pd.Series([False] * len(self.df), index=self.df.index) + for f in filters: + laundry_checklist_filter |= f else: - choice_filter = pd.Series([False] * len(self.df), index=self.df.index) - - # Combine all filters - combined_filter = choice_filter | other_filter | unknown_filter + # If no valid choices left, return False for all entries + laundry_checklist_filter = pd.Series([False] * len(self.df), index=self.df.index) - return combined_filter + return laundry_checklist_filter - def subtype_checklist_function(self, choice: list[str]) -> pd.Series: + def subtype_checklist_function(self, choice: List[str]) -> pd.Series: """ - Filters the DataFrame for properties based on selected property subtypes. - - Special handling is provided for 'Unknown' to include properties without a specified subtype, - as well as subtypes '/A' and '/D'. - + Filters the DataFrame for property subtypes based on the user's choice. + Args: - - choice (list[str]): A list of user-selected property subtypes, including a special 'Unknown' - option to include properties without a specified subtype. - + - choice (List[str]): A list of user-selected subtypes. Options include various property types. + Returns: - - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy - the filter conditions based on property subtypes. + - pd.Series: A boolean Series indicating which rows of the DataFrame satisfy the filter conditions. 
""" - # Ensure the choice list is not empty if not choice: + # If no choices are selected, return False for all entries return pd.Series([False] * len(self.df), index=self.df.index) - - # Map '/A' and '/D' subtypes to 'Unknown' - self.df['subtype'] = self.df['subtype'].replace({'/A': None, '/D': None}) - - # Handle 'Unknown' selection + + # Handle 'Unknown' option if 'Unknown' in choice: - # Include rows where subtype is NaN OR matches one of the selected choices - subtype_filter = self.df['subtype'].isnull() | self.df['subtype'].isin(choice) + unknown_filter = self.df['subtype'].isnull() + # Remove 'Unknown' from choices to avoid filtering by it in 'isin' + choice = [c for c in choice if c != 'Unknown'] else: - # If 'Unknown' is NOT selected, filter by the selected choices + unknown_filter = pd.Series([False] * len(self.df), index=self.df.index) + + if choice: + # Filter where 'subtype' matches the choices subtype_filter = self.df['subtype'].isin(choice) - - return subtype_filter + else: + subtype_filter = pd.Series([False] * len(self.df), index=self.df.index) + + # Combine filters + combined_filter = subtype_filter | unknown_filter + return combined_filter # Create a class to hold all of the filters for the sale page class BuyFilters: @@ -406,10 +411,10 @@ def sqft_function(self, include_missing: bool, slider_begin: float, slider_end: """ if include_missing: # Include properties with missing square footage - sqft_choice = self.df['Sqft'].isnull() | self.df['Sqft'].between(slider_begin, slider_end) + sqft_choice = self.df['sqft'].isnull() | self.df['sqft'].between(slider_begin, slider_end) else: # Exclude properties with missing square footage - sqft_choice = self.df['Sqft'].between(slider_begin, slider_end) + sqft_choice = self.df['sqft'].between(slider_begin, slider_end) return sqft_choice def year_built_function(self, include_missing: bool, slider_begin: int, slider_end: int) -> pd.Series: diff --git a/pages/lease_page.py b/pages/lease_page.py index c46649c8..bcab8918 100644 --- a/pages/lease_page.py +++ b/pages/lease_page.py @@ -101,18 +101,18 @@ def update_map(subtypes_chosen, pets_chosen, terms_chosen, garage_spaces, rental subtypes_chosen.sort() # Sort the DataFrame once at the beginning - df_sorted = df.sort_values(by=['garage_spaces', 'list_price', 'Bedrooms', 'Total Bathrooms', 'Sqft', 'YrBuilt', 'ppsqft']) + df_sorted = df.sort_values(by=['parking_spaces', 'list_price', 'bedrooms', 'total_bathrooms', 'sqft', 'year_built', 'ppsqft']) filters = [ lease_filters.subtype_checklist_function(subtypes_chosen), lease_filters.pets_radio_button(pets_chosen), lease_filters.terms_function(terms_chosen), - ((df_sorted['garage_spaces'].between(garage_spaces[0], garage_spaces[1])) | lease_filters.garage_radio_button(garage_missing_radio_choice, garage_spaces[0], garage_spaces[1])), + ((df_sorted['parking_spaces'].between(garage_spaces[0], garage_spaces[1])) | lease_filters.garage_radio_button(garage_missing_radio_choice, garage_spaces[0], garage_spaces[1])), (df_sorted['list_price'].between(rental_price[0], rental_price[1])), - (df_sorted['Bedrooms'].between(bedrooms_chosen[0], bedrooms_chosen[1])), - (df_sorted['Total Bathrooms'].between(bathrooms_chosen[0], bathrooms_chosen[1])), - ((df_sorted['Sqft'].between(sqft_chosen[0], sqft_chosen[1])) | lease_filters.sqft_radio_button(sqft_missing_radio_choice, sqft_chosen[0], sqft_chosen[1])), - ((df_sorted['YrBuilt'].between(years_chosen[0], years_chosen[1])) | lease_filters.yrbuilt_radio_button(yrbuilt_missing_radio_choice, years_chosen[0], 
years_chosen[1])), + (df_sorted['bedrooms'].between(bedrooms_chosen[0], bedrooms_chosen[1])), + (df_sorted['total_bathrooms'].between(bathrooms_chosen[0], bathrooms_chosen[1])), + ((df_sorted['sqft'].between(sqft_chosen[0], sqft_chosen[1])) | lease_filters.sqft_radio_button(sqft_missing_radio_choice, sqft_chosen[0], sqft_chosen[1])), + ((df_sorted['year_built'].between(years_chosen[0], years_chosen[1])) | lease_filters.yrbuilt_radio_button(yrbuilt_missing_radio_choice, years_chosen[0], years_chosen[1])), ((df_sorted['ppsqft'].between(ppsqft_chosen[0], ppsqft_chosen[1])) | lease_filters.ppsqft_radio_button(ppsqft_missing_radio_choice, ppsqft_chosen[0], ppsqft_chosen[1])), lease_filters.furnished_checklist_function(furnished_choice), lease_filters.security_deposit_function(security_deposit_radio_choice, security_deposit_chosen[0], security_deposit_chosen[1]), @@ -152,33 +152,42 @@ def update_map(subtypes_chosen, pets_chosen, terms_chosen, garage_spaces, rental for row in df_filtered.itertuples(): markers.append( dict( - lat=row.Latitude, - lon=row.Longitude, + lat=row.latitude, + lon=row.longitude, data=dict( - address=row.full_street_address, - bathrooms=row.Bedrooms, - bedrooms=row.Bedrooms, - furnished=row.Furnished, - garage_spaces=row.garage_spaces, - image_url=row.mls_photo, - key_deposit=row.DepositKey, - laundry=row.LaundryFeatures, + #bedrooms_bathrooms=row.total_bathrooms, + bedrooms=row.bedrooms, + city=row.city, + date_processed=row.date_processed, + full_bathrooms=row.full_bathrooms, + full_street_address=row.full_street_address, + furnished=row.furnished, + half_bathrooms=row.half_bathrooms, + key_deposit=row.key_deposit, + laundry=row.laundry, list_price=row.list_price, listed_date=row.listed_date, listing_url=row.listing_url, mls_number=row.mls_number, mls_photo=row.mls_photo, - other_deposit=row.DepositOther, - pet_deposit=row.DepositPets, - pet_policy=row.PetsAllowed, + other_deposit=row.other_deposit, + parking_spaces=row.parking_spaces, + pet_deposit=row.pet_deposit, + pet_policy=row.pet_policy, phone_number=row.phone_number, ppsqft=row.ppsqft, - security_deposit=row.DepositSecurity, - senior_community=row.SeniorCommunityYN, - sqft=row.Sqft, + security_deposit=row.security_deposit, + senior_community=row.senior_community, + short_address=row.short_address, + sqft=row.sqft, + street_name=row.street_name, + street_number=row.street_number, subtype=row.subtype, - terms=row.Terms, - year_built=row.YrBuilt, + terms=row.terms, + three_quarter_bathrooms=row.three_quarter_bathrooms, + total_bathrooms=row.total_bathrooms, + year_built=row.year_built, + zip_code=row.zip_code, ), ) ) diff --git a/requirements.txt b/requirements.txt index 4ea6802b..697e63b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,21 @@ aiolimiter==1.1.0 beautifulsoup4==4.12.3 dash_bootstrap_components==1.6.0 -dash_extensions==1.0.16 +dash_extensions==1.0.18 dash-leaflet==1.0.15 -dash==2.17.1 +dash==2.18.1 geopy==2.4.1 -gevent==24.2.1 -gunicorn==22.0.0 -imagekitio==4.0.1 +gevent==24.10.2 +gunicorn==23.0.0 +imagekitio==4.1.0 loguru==0.7.2 numpy==1.26.4 -orjson==3.10.5 -pandas==2.2.2 -protobuf==5.27.1 -pyarrow==16.1.0 +orjson==3.10.7 +pandas==2.2.3 +protobuf==5.28.2 +pyarrow==17.0.0 python-dotenv==1.0.1 pyyaml requests==2.32.3 -sodapy==2.2.0 tables user_agents==2.2.0 \ No newline at end of file
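For reference, a minimal usage sketch of how the renamed filter masks are meant to compose after this refactor. It assumes the LeaseFilters class whose methods are shown above, a constructor that takes the DataFrame, and the new snake_case columns; the module path and sample data are illustrative, not taken from the repo.

# Minimal sketch: AND-combining the boolean masks returned by the renamed
# filter methods, mirroring what update_map() does in pages/lease_page.py.
# Assumptions: LeaseFilters accepts the DataFrame in its constructor and
# lives at functions.filters (path illustrative); the data is made up.
import numpy as np
import pandas as pd

from functions.filters import LeaseFilters  # import path assumed

df = pd.DataFrame({
    'sqft': [750, None, 1200],
    'year_built': [1985, 2001, None],
    'parking_spaces': [1, None, 2],
})
lease_filters = LeaseFilters(df)

filters = [
    lease_filters.sqft_radio_button(True, 500, 1500),       # include missing sqft
    lease_filters.yrbuilt_radio_button(False, 1950, 2024),  # exclude missing year built
    lease_filters.garage_radio_button(True, 0, 2),          # include missing parking
]

# Each method returns a boolean Series aligned to df.index, so the
# masks reduce cleanly into a single row selection.
df_filtered = df[np.logical_and.reduce(filters)]  # keeps rows 0 and 1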
          <tr>
            <td>Bedrooms/Bathrooms</td>
-           <td>${data.bedrooms}/${data.bathrooms}</td>
+           <td>${data.bedrooms}/${data.total_bathrooms}</td>
          </tr>
          <tr>
-           <td>Garage Spaces</td>
-           <td>${data.garage_spaces || "Unknown"}</td>
+           <td>Parking Spaces</td>
+           <td>${data.parking_spaces || "Unknown"}</td>
          </tr>
          <tr>
            <td>Pets Allowed?</td>
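A second sketch, of the shared 'Unknown' handling pattern the refactored checklist filters (furnished, terms, laundry, subtype) now follow: NaN rows get their own mask, the remaining choices are matched with a vectorized string operation, and the two masks are OR-ed. This mirrors terms_function(); the sample data is illustrative.

# Minimal sketch of the 'Unknown' + regex pattern used by terms_function().
# Sample data is made up; only pandas/re behavior is relied on here.
import re
import pandas as pd

df = pd.DataFrame({'terms': ['12 Month Lease', 'Month-To-Month', None]})
choice = ['Month-To-Month', 'Unknown']

# 'Unknown' maps to NaN rows and is handled by its own mask.
unknown_filter = df['terms'].isnull() if 'Unknown' in choice else pd.Series(False, index=df.index)
choice = [c for c in choice if c != 'Unknown']

if choice:
    # Escape each term so regex metacharacters are matched literally.
    pattern = '|'.join(re.escape(term) for term in choice)
    terms_filter = df['terms'].str.contains(pattern, na=False, case=False)
else:
    terms_filter = pd.Series(False, index=df.index)

mask = terms_filter | unknown_filter  # True for index 1 and 2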