-
-
Notifications
You must be signed in to change notification settings - Fork 311
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
797 spider governors state university (#975)
- Loading branch information
Showing
3 changed files
with
1,467 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,242 @@ | ||
import re | ||
from datetime import datetime | ||
|
||
from city_scrapers_core.constants import BOARD, COMMITTEE | ||
from city_scrapers_core.items import Meeting | ||
from city_scrapers_core.spiders import CityScrapersSpider | ||
|
||
|
||
class IlGovernorsStateUniversitySpider(CityScrapersSpider): | ||
name = "il_governors_state_university" | ||
agency = "Governors State University" | ||
timezone = "America/Chicago" | ||
start_urls = ["https://www.govst.edu/BOT-Meetings/"] | ||
time_re = r"(?i)([01]?\d)(:?\d*)\s*([ap]\.?m\.?)" | ||
|
||
def parse(self, response): | ||
""" | ||
`parse` should always `yield` Meeting items. | ||
""" | ||
for year_section in response.xpath('//div[@class="toggle-list"]/ul/li'): | ||
year_elt = year_section.xpath('div[@class="title"]/h3/text()') | ||
# sometimes the year is not present in the table dates, so grab it from the | ||
# section heading as backup | ||
year = year_elt.get().replace("Meeting Dates for ", "").strip() | ||
for row in year_section.xpath('div[@class="content"]/table/tbody/tr'): | ||
item = row.xpath("td") | ||
title = self._parse_title(item) | ||
if title is None: | ||
continue | ||
meeting = Meeting( | ||
title=title, | ||
description=self._parse_description(item), | ||
classification=self._parse_classification(title), | ||
start=self._parse_start(item, year), | ||
end=self._parse_end(item), | ||
all_day=self._parse_all_day(item), | ||
time_notes=self._parse_time_notes(item), | ||
location=self._parse_location(item), | ||
links=self._parse_links(item, response), | ||
source=self._parse_source(response), | ||
) | ||
|
||
# if postponed or canceled appears in any of these columns, it means the | ||
# meeting is canceled, so just pass in all the row text to _get_status | ||
row_text = " ".join(row.css("* ::text").getall()) | ||
meeting["status"] = self._get_status(meeting, text=row_text) | ||
meeting["id"] = self._get_id(meeting) | ||
|
||
yield meeting | ||
|
||
def _clean_igsu_title(self, title): | ||
"""Reformat title to conform to project naming standards""" | ||
if not title.startswith("Special"): | ||
return re.sub(r"\s*Meeting\s*$", "", title) | ||
return title | ||
|
||
def _parse_title(self, item): | ||
"""Parse or generate meeting title. The inner html of the first column varies | ||
quite a bit - brs, divs, b tags - so figuring out what is the title based on | ||
line position. Sometimes the "title" is only a date, so if all else fails, | ||
return that. | ||
Returns None if the title is 'Date', which indicates we're in a header row, or | ||
if the title is empty, which indicates we're in a blank row. | ||
If returning a string, strip 'Meeting' from the end.""" | ||
cell_text = item[0].css("* ::text").getall() | ||
clean_cell_text = [elt.strip() for elt in cell_text if len(elt.strip()) > 0] | ||
if (len(clean_cell_text) == 0) or ("date" == clean_cell_text[0].lower()): | ||
return None | ||
if len(clean_cell_text) == 1: | ||
# then we either have no title or no date - or, occasionally, we have a | ||
# comma-separated title and date. First check for \d\d\d\d under the | ||
# assumption that this ends the date, and see if the remainder of the | ||
# string is non-empty. Failing that, check if there are numbers, | ||
# and if so assume it's a date and return Board of Trustees. Otherwise, | ||
# return the line, assuming the whole thing is the title. | ||
possible_title = clean_cell_text[0] | ||
title_match = re.findall(r"\d\d\d\d\s+(.*)", possible_title) | ||
if len(title_match) > 0: | ||
return self._clean_igsu_title(title_match[0]) | ||
if re.search(r"\d", clean_cell_text[0]): | ||
return "Board of Trustees" | ||
return self._clean_igsu_title(clean_cell_text[0]) | ||
return self._clean_igsu_title(" ".join(clean_cell_text[1:])) | ||
|
||
def _parse_description(self, item): | ||
"""Parse or generate meeting description. Not available for this website.""" | ||
return "" | ||
|
||
def _parse_classification(self, title): | ||
"""Parse or generate classification from allowed options.""" | ||
if "committee" in title.lower(): | ||
return COMMITTEE | ||
# if it isn't explicitly described as a committee meeting, then because this | ||
# is a board calendar, all other meetings are board by default | ||
return BOARD | ||
|
||
def _normalize_date(self, date, default_year): | ||
"""The dates appear in pretty variable formats, including in some cases without a year. | ||
This method normalizes.""" | ||
clean_date = date.replace(",", "").replace(".", "").lower().strip() | ||
# There was a stray "sept." in the data, although usually the month is | ||
# fully spelled out. Use first three chars of the date string to get the month. | ||
months = [ | ||
"january", | ||
"february", | ||
"march", | ||
"april", | ||
"may", | ||
"june", | ||
"july", | ||
"august", | ||
"september", | ||
"october", | ||
"november", | ||
"december", | ||
] | ||
month_map = {m[:3]: m for m in months} | ||
month, day, year = re.findall( | ||
r"([a-z]+)\.?\s+(\d\d?),?\s*(\d\d\d\d)?", clean_date | ||
)[0] | ||
month = month_map[month[:3]] | ||
year = year if len(year) == 4 else default_year | ||
return f"{month} {day} {year}" | ||
|
||
def _normalize_time(self, time_str): | ||
"""Normalize time format. Sometimes it comes with colons or periods, | ||
sometimes not""" | ||
times = re.findall(self.time_re, time_str) | ||
if len(times) == 0: | ||
return None | ||
hour, minute, ampm = times[0] | ||
if len(minute.strip(":")) > 0: | ||
minute = minute.strip(":") | ||
else: | ||
minute = "00" | ||
ampm = ampm.replace(".", "") | ||
return f"{hour}:{minute} {ampm}" | ||
|
||
def _parse_start(self, item, default_year): | ||
"""Parse start datetime as a naive datetime object.""" | ||
|
||
# try to find the date in the first column, and if it isn't there, fall back | ||
# to the third | ||
day = " ".join(item[0].css("* ::text").getall()) | ||
if not re.search(r"\d", day): | ||
day = " ".join(item[2].css("* ::text").getall()) | ||
clean_day = self._normalize_date(day, default_year) | ||
time = " ".join(item[1].css("* ::text").getall()).lower() | ||
clean_time = self._normalize_time(time) | ||
if clean_time is not None: | ||
return datetime.strptime(f"{clean_day} {clean_time}", "%B %d %Y %I:%M %p") | ||
# fall back to midnight if no time specified | ||
return datetime.strptime(clean_day, "%B %d %Y") | ||
|
||
def _parse_end(self, item): | ||
"""Parse end datetime as a naive datetime object. Added by pipeline if None. | ||
Not available for this website.""" | ||
return None | ||
|
||
def _parse_time_notes(self, item): | ||
"""Parse any additional notes on the timing of the meeting""" | ||
return "" | ||
|
||
def _parse_all_day(self, item): | ||
"""Parse or generate all-day status. Defaults to False. Doesn't seem to occur | ||
for this website, with the possible exception of the retreats, which aren't | ||
quite all day""" | ||
return False | ||
|
||
def _parse_location(self, item): | ||
"""Parse or generate location.""" | ||
unclean_location_cell_content = item[1].css("* ::text").getall() | ||
# remove time if present, and clean | ||
location_cell_content = [] | ||
for line in unclean_location_cell_content: | ||
line = re.sub(self.time_re, "", line) | ||
line = line.strip().strip("-").strip() | ||
if len(line) > 0: | ||
location_cell_content.append(line) | ||
# It's not obvious whether the first line of the location_cell_content | ||
# is a location name or address, so the rest of this method uses heuristics | ||
# for this | ||
default_name = "Governors State University" | ||
default_address = "1 University Pkwy,\nUniversity Park, IL 60484" | ||
name, address = default_name, default_address | ||
# If the event was postponed or canceled, we will handle that in the | ||
# event status, and can just use the defaults here | ||
for elt in location_cell_content: | ||
if ("postponed" in elt.lower()) or ("canceled" in elt.lower()): | ||
return {"address": default_address, "name": default_name} | ||
# If there is no name, just the address, we'll use the first line | ||
# of the address as the name. | ||
if len(location_cell_content) > 0: | ||
name = location_cell_content[0] | ||
# no obvious way to differentiate location names from addresses other than | ||
# presence of numbers. We'll assume that the first line is title-only if it | ||
# contains no numbers, otherwise that it begins the address. | ||
if re.search(r"\d", name): | ||
address = "\n".join(location_cell_content) | ||
elif len(location_cell_content) > 1: | ||
address = "\n".join(location_cell_content[1:]) | ||
# Room may end up in either the name or address; if it's present, we want to | ||
# make sure it's part of the address. Sometimes room numbers appear without | ||
# room, as a single word (see G330 in 2017) so handle them the same way | ||
if "room " in address.lower(): | ||
address = address + "\n" + default_address | ||
if "room " in name.lower(): | ||
if "room " not in address.lower(): | ||
address = name + "\n" + address | ||
name = default_name | ||
# special case for covid -- make sure zoom meetings don't show the university | ||
# address! | ||
if ("zoom" in name.lower()) or ("zoom" in address.lower()): | ||
address = "Zoom" | ||
name = "Zoom" | ||
elif "location tbd" in name.lower(): | ||
address = name | ||
# in some cases a one-word "address" like G330 in 2017 can make it through, | ||
# so fall back to the default here as well | ||
elif len(address.split()) == 1: | ||
address = address + "\n" + default_address | ||
return { | ||
"address": address, | ||
"name": name, | ||
} | ||
|
||
def _parse_links(self, item, response): | ||
"""Parse or generate links.""" | ||
links = [] | ||
# the links to the agenda, if present, are in the third and fourth columns | ||
for col in [2, 3]: | ||
for link_parent in item[col].xpath("a"): | ||
link_ext = link_parent.css("::attr(href)").get() | ||
if link_ext is not None: | ||
link = response.urljoin(link_ext) | ||
title = link_parent.xpath("text()").get() | ||
links.append({"href": link, "title": title}) | ||
return links | ||
|
||
def _parse_source(self, response): | ||
"""Parse or generate source.""" | ||
return response.url |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
from datetime import datetime | ||
from os.path import dirname, join | ||
|
||
import pytest | ||
from city_scrapers_core.constants import BOARD, CANCELLED, COMMITTEE, PASSED | ||
from city_scrapers_core.utils import file_response | ||
from freezegun import freeze_time | ||
|
||
from city_scrapers.spiders.il_governors_state_university import ( | ||
IlGovernorsStateUniversitySpider, | ||
) | ||
|
||
test_response = file_response( | ||
join(dirname(__file__), "files", "il_governors_state_university.html"), | ||
url="https://www.govst.edu/BOT-Meetings/", | ||
) | ||
spider = IlGovernorsStateUniversitySpider() | ||
|
||
freezer = freeze_time("2020-09-26") | ||
freezer.start() | ||
|
||
parsed_items = [item for item in spider.parse(test_response)] | ||
|
||
freezer.stop() | ||
|
||
|
||
def test_title(): | ||
# br-separated | ||
assert parsed_items[0]["title"] == "Budget and Finance Committee" | ||
# div-separated | ||
assert parsed_items[2]["title"] == "Human Resources Committee" | ||
# no title, only date, we fall back to default | ||
assert parsed_items[4]["title"] == "Board of Trustees" | ||
# comma-separated | ||
assert parsed_items[17]["title"] == "Annual Retreat" | ||
# special board meeting keeps the word "meeting" | ||
assert parsed_items[1]["title"] == "Special Board Meeting" | ||
|
||
|
||
def test_description(): | ||
assert parsed_items[0]["description"] == "" | ||
|
||
|
||
def test_start(): | ||
assert parsed_items[0]["start"] == datetime(2020, 1, 27, 9, 0) | ||
assert parsed_items[2]["start"] == datetime(2020, 2, 7, 9, 0) | ||
assert parsed_items[4]["start"] == datetime(2020, 2, 14, 8, 30) | ||
assert parsed_items[12]["start"] == datetime(2020, 5, 15, 9, 0) | ||
# starting in 2018, sometimes dates do not have years. In case this starts up | ||
# again, check that these properly get 2018 as their year | ||
assert parsed_items[33]["start"] == datetime(2018, 2, 22, 9, 0) | ||
|
||
|
||
def test_end(): | ||
# unused | ||
assert parsed_items[0]["end"] is None | ||
|
||
|
||
def test_time_notes(): | ||
# unused | ||
assert parsed_items[0]["time_notes"] == "" | ||
|
||
|
||
def test_status(): | ||
assert parsed_items[0]["status"] == PASSED | ||
assert parsed_items[10]["status"] == CANCELLED | ||
assert parsed_items[22]["status"] == CANCELLED | ||
|
||
|
||
def test_location(): | ||
assert parsed_items[0]["location"] == { | ||
"name": "Engbretson Hall", | ||
"address": "1 University Pkwy,\nUniversity Park, IL 60484", | ||
} | ||
assert parsed_items[2]["location"] == { | ||
"name": "70 W. Madison Street", | ||
"address": "70 W. Madison Street\nSuite 4300\nChicago, IL", | ||
} | ||
assert parsed_items[7]["location"] == {"name": "Zoom", "address": "Zoom"} | ||
# check room reformatting | ||
assert parsed_items[41]["location"] == { | ||
"name": "Governors State University", | ||
"address": "Room D34000\n1 University Pkwy,\nUniversity Park, IL 60484", | ||
} | ||
# check postponement reformatting | ||
assert parsed_items[10]["location"] == { | ||
"name": "Governors State University", | ||
"address": "1 University Pkwy,\nUniversity Park, IL 60484", | ||
} | ||
|
||
|
||
def test_source(): | ||
assert parsed_items[0]["source"] == "https://www.govst.edu/BOT-Meetings/" | ||
|
||
|
||
def test_links(): | ||
# normal fully populated row: one agenda, one minutes | ||
assert parsed_items[0]["links"] == [ | ||
{ | ||
"href": ( | ||
"https://www.govst.edu/uploadedFiles/About/University_" | ||
"Governance/Board_of_Trustees/budget and finance " | ||
"committee agenda, 1-27-19, FINAL.pdf" | ||
), | ||
"title": "Budget and Finance Committee Meeting Agenda, 1-27-20", | ||
}, | ||
{ | ||
"href": ( | ||
"https://www.govst.edu/uploadedFiles/About/University_" | ||
"Governance/Board_of_Trustees/approved minutes - b and f " | ||
"committee meeting, 1-27-20 - approved at 3-23-20 b and f " | ||
"committee meeting - FINAL.pdf" | ||
), | ||
"title": "1-27-20 Budget and Finance Committee - approved meeting minutes", | ||
}, | ||
] | ||
# agenda + notification in column 3 | ||
assert parsed_items[5]["links"] == [ | ||
{ | ||
"href": ( | ||
"https://www.govst.edu/uploadedFiles/About/" | ||
"University_Governance/Board_of_Trustees/agenda, " | ||
"executive committee meeting, 3-16-20(1).pdf" | ||
), | ||
"title": "Executive Committee Meeting Agenda, 3-16-20", | ||
}, | ||
{ | ||
"href": ( | ||
"https://www.govst.edu/uploadedFiles/About/" | ||
"University_Governance/Board_of_Trustees/notification " | ||
"regarding march 16, 2020 executive committee meeting, " | ||
"3-15-20 - FINAL.docx.pdf" | ||
), | ||
"title": ( | ||
"Notification Regarding March 16, 2020 Executive " "Committee Meeting" | ||
), | ||
}, | ||
] | ||
|
||
|
||
def test_classification(): | ||
assert parsed_items[0]["classification"] == COMMITTEE | ||
assert parsed_items[1]["classification"] == BOARD | ||
assert parsed_items[4]["classification"] == BOARD | ||
|
||
|
||
@pytest.mark.parametrize("item", parsed_items) | ||
def test_all_day(item): | ||
assert item["all_day"] is False |