[feat] scrape projects from ORES website and insert/update Supabase
deenasun committed Nov 10, 2024
1 parent cd907be commit 8eacf16
Showing 10 changed files with 358 additions and 4 deletions.
Binary file modified api/webscraper/__pycache__/database_constants.cpython-312.pyc
Binary file modified api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc
Binary file modified api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc
160 changes: 160 additions & 0 deletions api/webscraper/database.py
@@ -12,6 +12,7 @@
    filter_nyiso_cluster_sheet,
    filter_nyiso_in_service_sheet,
)
from ores_scraper import query_ores_noi, query_ores_under_review, query_ores_permitted
from utils.scraper_utils import (
    create_update_object,
    update_kdm,
@@ -413,10 +414,169 @@ def nyiso_in_service_to_database():
        print(exception)


def ores_noi_to_database():
    database = []
    database.extend(query_ores_noi())
    for project in database:
        existing_data = (
            supabase.table("Projects_duplicate")
            .select("*")
            .eq("project_name", project["project_name"])
            .execute()
        )
        if len(existing_data.data) > 0:
            existing_project = existing_data.data[0]
            update_object = create_update_object(existing_project, project)
            try:
                response = (
                    supabase.table("Projects_duplicate")
                    .update(update_object)
                    .eq(
                        "project_name",
                        project["project_name"],
                    )
                    .execute()
                )
                print("UPDATE", response, "\n")
            except Exception as exception:
                print(exception)
        else:
            try:
                response = (
                    supabase.table("Projects_duplicate").insert(project).execute()
                )
                print("INSERT", response, "\n")
            except Exception as exception:
                print(exception)


def ores_under_review_to_database():
    database = []
    database.extend(query_ores_under_review())
    for project in database:
        existing_data = (
            supabase.table("Projects_duplicate")
            .select("*")
            .eq("project_name", project["project_name"])
            .execute()
        )
        if len(existing_data.data) > 0:
            existing_project = existing_data.data[0]
            update_object = create_update_object(existing_project, project)
            # if the existing project has no kdms, add the dict first
            if (
                existing_project["key_development_milestones"] is None
                or len(existing_project["key_development_milestones"]) == 0
            ):
                update_object["key_development_milestones"] = initial_kdm_dict
            else:
                update_object["key_development_milestones"] = existing_project[
                    "key_development_milestones"
                ]

            # update kdm for ores projects under review
            update_object["key_development_milestones"] = update_kdm(
                milestoneTitle="Application for permit to ORES",
                completed=True,
                date=None,
                kdm=update_object["key_development_milestones"],
            )
            try:
                response = (
                    supabase.table("Projects_duplicate")
                    .update(update_object)
                    .eq(
                        "project_name",
                        project["project_name"],
                    )
                    .execute()
                )
                print("UPDATE", response, "\n")
            except Exception as exception:
                print(exception)
        else:
            project["key_development_milestones"] = update_kdm(
                milestoneTitle="Application for permit to ORES",
                completed=True,
                date=None,
                kdm=project["key_development_milestones"],
            )
            try:
                response = (
                    supabase.table("Projects_duplicate").insert(project).execute()
                )
                print("INSERT", response, "\n")
            except Exception as exception:
                print(exception)


def ores_permitted_to_database():
    database = []
    database.extend(query_ores_permitted())
    for project in database:
        existing_data = (
            supabase.table("Projects_duplicate")
            .select("*")
            .eq("project_name", project["project_name"])
            .execute()
        )
        if len(existing_data.data) > 0:
            existing_project = existing_data.data[0]
            update_object = create_update_object(existing_project, project)
            # if the existing project has no kdms, add the dict first
            if (
                existing_project["key_development_milestones"] is None
                or len(existing_project["key_development_milestones"]) == 0
            ):
                update_object["key_development_milestones"] = initial_kdm_dict
            else:
                update_object["key_development_milestones"] = existing_project[
                    "key_development_milestones"
                ]

            # update kdm for permitted ores projects
            update_object["key_development_milestones"] = update_kdm(
                milestoneTitle="Issuance of permit from ORES",
                completed=True,
                date=None,
                kdm=update_object["key_development_milestones"],
            )
            try:
                response = (
                    supabase.table("Projects_duplicate")
                    .update(update_object)
                    .eq(
                        "project_name",
                        project["project_name"],
                    )
                    .execute()
                )
                print("UPDATE", response, "\n")
            except Exception as exception:
                print(exception)
        else:
            project["key_development_milestones"] = update_kdm(
                milestoneTitle="Issuance of permit from ORES",
                completed=True,
                date=None,
                kdm=project["key_development_milestones"],
            )
            try:
                response = (
                    supabase.table("Projects_duplicate").insert(project).execute()
                )
                print("INSERT", response, "\n")
            except Exception as exception:
                print(exception)
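
# The three ores_*_to_database functions above share the same
# select-then-update-or-insert flow against the "Projects_duplicate" table.
# A condensed sketch of that shared pattern (hypothetical helper; the name
# upsert_project is assumed and not defined in this module):
#
# def upsert_project(project: dict) -> None:
#     existing = (
#         supabase.table("Projects_duplicate")
#         .select("*")
#         .eq("project_name", project["project_name"])
#         .execute()
#     )
#     if existing.data:
#         update_object = create_update_object(existing.data[0], project)
#         supabase.table("Projects_duplicate").update(update_object).eq(
#             "project_name", project["project_name"]
#         ).execute()
#     else:
#         supabase.table("Projects_duplicate").insert(project).execute()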


"""
For testing
"""
# nyserda_large_to_database()
# nyserda_solar_to_database()
# nyiso_to_database()
# nyiso_in_service_to_database()
# ores_noi_to_database()
ores_under_review_to_database()
# ores_permitted_to_database()
10 changes: 10 additions & 0 deletions api/webscraper/database_constants.py
@@ -40,4 +40,14 @@
"date": None,
},
{"milestoneTitle": "Start of operations", "completed": False, "date": None},
{
"milestoneTitle": "Application for permit to ORES",
"completed": False,
"date": None,
},
{
"milestoneTitle": "Issuance of permit from ORES",
"completed": False,
"date": None,
},
]
2 changes: 1 addition & 1 deletion api/webscraper/nyserda_scraper.py
@@ -76,7 +76,7 @@ def write_large_to_json():
file.write("\n")


write_large_to_json()
# write_large_to_json()

"""
This scrapes data from the NYSERDA Statewide Distributed Solar Projects database.
180 changes: 180 additions & 0 deletions api/webscraper/ores_scraper.py
@@ -0,0 +1,180 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from utils.scraper_utils import geocode_lat_long, update_kdm
from database_constants import initial_kdm_dict

# url = "https://dps.ny.gov/ores-permit-applications"
# page = requests.get(url)

# soup = BeautifulSoup(page.content, "html.parser")
# tables = soup.find_all("table")

# notices_of_intent = pd.read_html(StringIO(tables[0].prettify()))[0]
# noi_dict = notices_of_intent.to_dict(orient="records")

# # Complete Applications Under Review
# under_review = pd.read_html(StringIO(tables[3].prettify()))[0]
# under_review_dict = under_review.to_dict(orient="records")

# # Permitted Applications
# permitted = pd.read_html(StringIO(tables[4].prettify()))[0]
# permitted_dict = permitted.to_dict(orient="records")

"""
All the descriptions of the ORES data describe the location of the project in the following format:
... Located in the Towns of ALTONA, CLINTON, ELLENBURG, and MOOERS, CLINTON COUNTY.
"""


def parse_for_location(description):
    # finds the index in the description where the phrase "Town(s) of..." appears
    town_index = description.find("Town")
    town_string = description[town_index:]
    # splits town_string by commas
    town_split = town_string.split(",")
    # town is the last word of the second-to-last comma-separated segment
    town = town_split[-2].split(" ")[-1].strip()
    # county is the last comma-separated segment
    county = town_split[-1].strip()

    # removes the trailing period from county if present (truncates at the last period)
    index = county.find(".")
    if index != -1:
        while county.find(".", index + 1) != -1:
            index = county.find(".", index + 1)
        county = county[:index]

    # capitalize the first letter of each word in the town/county name
    if town:
        town = " ".join([word.capitalize() for word in town.split(" ")])
    if county:
        county = " ".join([word.capitalize() for word in county.split(" ")])
    return (town, county)
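
# Illustrative example (assumed, not exercised anywhere in this module):
# applying parse_for_location to the sample description quoted in the
# docstring above.
# parse_for_location(
#     "... Located in the Towns of ALTONA, CLINTON, ELLENBURG, and MOOERS, CLINTON COUNTY."
# )
# # -> ("Mooers", "Clinton County")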


# ORES notice of intent
def filter_noi(data: list) -> list:
    """
    params: data - list of dictionaries representing rows in the ORES Notices of Intent table
    Parses the description to find the project's town and county
    Geocodes the town to latitude and longitude
    Returns a list of projects filtered down to the desired fields
    """
    filtered_list = []
    for row in data:
        town, county = parse_for_location(row["Description"])
        lat, long = geocode_lat_long(f"{town}, NY")
        project_dict = {
            "permit_application_number": row.get("Permit Application Number", None),
            "project_name": row.get("Project Name", None),
            "town": town if town else None,
            "county": county if county else None,
            "latitude": lat if lat else None,
            "longitude": long if long else None,
            "key_development_milestones": initial_kdm_dict,
        }
        filtered_list.append(project_dict)

    return filtered_list


def filter_under_review(data: list) -> list:
    """
    params: data - list of dictionaries representing rows in the ORES Complete Applications Under Review table
    Parses the description to find the project's town and county
    Geocodes the town to latitude and longitude
    Returns a list of projects filtered down to the desired fields
    """
    filtered_list = []
    for row in data:
        town, county = parse_for_location(row["Description"])
        lat, long = geocode_lat_long(f"{town}, NY")
        project_dict = {
            "permit_application_number": row.get("Permit Application Number", None),
            "project_name": row.get("Project Name", None),
            "town": town if town else None,
            "county": county if county else None,
            "latitude": lat if lat else None,
            "longitude": long if long else None,
            "key_development_milestones": initial_kdm_dict,
        }
        project_dict["key_development_milestones"] = update_kdm(
            "Application for permit to ORES",
            date=None,
            completed=True,
            kdm=project_dict.get("key_development_milestones"),
        )
        filtered_list.append(project_dict)
    return filtered_list


def filter_permitted(data: list) -> list:
    """
    params: data - list of dictionaries representing rows in the ORES Permitted Applications table
    Parses the description to find the project's town and county
    Geocodes the town to latitude and longitude
    Returns a list of projects filtered down to the desired fields
    """
    filtered_list = []
    for row in data:
        town, county = parse_for_location(row["Description"])
        lat, long = geocode_lat_long(f"{town}, NY")
        project_dict = {
            "permit_application_number": row.get("Permit Application Number", None),
            "project_name": row.get("Project Name", None),
            "town": town if town else None,
            "county": county if county else None,
            "latitude": lat if lat else None,
            "longitude": long if long else None,
            "key_development_milestones": initial_kdm_dict,
        }
        project_dict["key_development_milestones"] = update_kdm(
            "Issuance of permit from ORES",
            date=None,
            completed=True,
            kdm=project_dict.get("key_development_milestones"),
        )
        filtered_list.append(project_dict)
    return filtered_list


# ORES notices of intent
def query_ores_noi():
    url = "https://dps.ny.gov/ores-permit-applications"
    page = requests.get(url)

    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.find_all("table")

    notices_of_intent = pd.read_html(StringIO(tables[0].prettify()))[0]
    noi_dict = notices_of_intent.to_dict(orient="records")
    response = filter_noi(noi_dict)
    return response


def query_ores_under_review():
    url = "https://dps.ny.gov/ores-permit-applications"
    page = requests.get(url)

    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.find_all("table")

    under_review = pd.read_html(StringIO(tables[3].prettify()))[0]
    under_review_dict = under_review.to_dict(orient="records")
    response = filter_under_review(under_review_dict)
    return response


def query_ores_permitted():
    url = "https://dps.ny.gov/ores-permit-applications"
    page = requests.get(url)

    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.find_all("table")

    permitted = pd.read_html(StringIO(tables[4].prettify()))[0]
    permitted_dict = permitted.to_dict(orient="records")
    response = filter_permitted(permitted_dict)
    return response
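
# Hypothetical quick check (assumed usage, in the same spirit as the
# commented-out scraping code at the top of this file):
# if __name__ == "__main__":
#     noi_projects = query_ores_noi()
#     print(noi_projects[0])
#     # each dict contains: permit_application_number, project_name, town,
#     # county, latitude, longitude, key_development_milestones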
Binary file modified api/webscraper/utils/__pycache__/__init__.cpython-312.pyc
1 change: 1 addition & 0 deletions package.json
@@ -28,6 +28,7 @@
  },
  "devDependencies": {
    "@ianvs/prettier-plugin-sort-imports": "^4.3.1",
    "@types/google.maps": "^3.58.1",
    "@types/node": "^20.17.2",
    "@types/react": "^18.3.12",
    "@types/react-dom": "^18.3.1",