797 spider governors state university (#975)

City-Bureau · Oct 2, 2020 · c426204 · c426204
1 parent b7f807f
commit c426204
Show file tree

Hide file tree

Showing 3 changed files with 1,467 additions and 0 deletions.
diff --git a/city_scrapers/spiders/il_governors_state_university.py b/city_scrapers/spiders/il_governors_state_university.py
@@ -0,0 +1,242 @@
+import re
+from datetime import datetime
+
+from city_scrapers_core.constants import BOARD, COMMITTEE
+from city_scrapers_core.items import Meeting
+from city_scrapers_core.spiders import CityScrapersSpider
+
+
+class IlGovernorsStateUniversitySpider(CityScrapersSpider):
+    name = "il_governors_state_university"
+    agency = "Governors State University"
+    timezone = "America/Chicago"
+    start_urls = ["https://www.govst.edu/BOT-Meetings/"]
+    time_re = r"(?i)([01]?\d)(:?\d*)\s*([ap]\.?m\.?)"
+
+    def parse(self, response):
+        """
+        `parse` should always `yield` Meeting items.
+        """
+        for year_section in response.xpath('//div[@class="toggle-list"]/ul/li'):
+            year_elt = year_section.xpath('div[@class="title"]/h3/text()')
+            # sometimes the year is not present in the table dates, so grab it from the
+            # section heading as backup
+            year = year_elt.get().replace("Meeting Dates for ", "").strip()
+            for row in year_section.xpath('div[@class="content"]/table/tbody/tr'):
+                item = row.xpath("td")
+                title = self._parse_title(item)
+                if title is None:
+                    continue
+                meeting = Meeting(
+                    title=title,
+                    description=self._parse_description(item),
+                    classification=self._parse_classification(title),
+                    start=self._parse_start(item, year),
+                    end=self._parse_end(item),
+                    all_day=self._parse_all_day(item),
+                    time_notes=self._parse_time_notes(item),
+                    location=self._parse_location(item),
+                    links=self._parse_links(item, response),
+                    source=self._parse_source(response),
+                )
+
+                # if postponed or canceled appears in any of these columns, it means the
+                # meeting is canceled, so just pass in all the row text to _get_status
+                row_text = " ".join(row.css("* ::text").getall())
+                meeting["status"] = self._get_status(meeting, text=row_text)
+                meeting["id"] = self._get_id(meeting)
+
+                yield meeting
+
+    def _clean_igsu_title(self, title):
+        """Reformat title to conform to project naming standards"""
+        if not title.startswith("Special"):
+            return re.sub(r"\s*Meeting\s*$", "", title)
+        return title
+
+    def _parse_title(self, item):
+        """Parse or generate meeting title. The inner html of the first column varies
+        quite a bit - brs, divs, b tags - so figuring out what is the title based on
+        line position. Sometimes the "title" is only a date, so if all else fails,
+        return that.
+        Returns None if the title is 'Date', which indicates we're in a header row, or
+        if the title is empty, which indicates we're in a blank row.
+        If returning a string, strip 'Meeting' from the end."""
+        cell_text = item[0].css("* ::text").getall()
+        clean_cell_text = [elt.strip() for elt in cell_text if len(elt.strip()) > 0]
+        if (len(clean_cell_text) == 0) or ("date" == clean_cell_text[0].lower()):
+            return None
+        if len(clean_cell_text) == 1:
+            # then we either have no title or no date - or, occasionally, we have a
+            # comma-separated title and date. First check for \d\d\d\d under the
+            # assumption that this ends the date, and see if the remainder of the
+            # string is non-empty. Failing that, check if there are numbers,
+            # and if so assume it's a date and return Board of Trustees. Otherwise,
+            # return the line, assuming the whole thing is the title.
+            possible_title = clean_cell_text[0]
+            title_match = re.findall(r"\d\d\d\d\s+(.*)", possible_title)
+            if len(title_match) > 0:
+                return self._clean_igsu_title(title_match[0])
+            if re.search(r"\d", clean_cell_text[0]):
+                return "Board of Trustees"
+            return self._clean_igsu_title(clean_cell_text[0])
+        return self._clean_igsu_title(" ".join(clean_cell_text[1:]))
+
+    def _parse_description(self, item):
+        """Parse or generate meeting description. Not available for this website."""
+        return ""
+
+    def _parse_classification(self, title):
+        """Parse or generate classification from allowed options."""
+        if "committee" in title.lower():
+            return COMMITTEE
+        # if it isn't explicitly described as a committee meeting, then because this
+        # is a board calendar, all other meetings are board by default
+        return BOARD
+
+    def _normalize_date(self, date, default_year):
+        """The dates appear in pretty variable formats, including in some cases without a year.
+        This method normalizes."""
+        clean_date = date.replace(",", "").replace(".", "").lower().strip()
+        # There was a stray "sept." in the data, although usually the month is
+        # fully spelled out. Use first three chars of the date string to get the month.
+        months = [
+            "january",
+            "february",
+            "march",
+            "april",
+            "may",
+            "june",
+            "july",
+            "august",
+            "september",
+            "october",
+            "november",
+            "december",
+        ]
+        month_map = {m[:3]: m for m in months}
+        month, day, year = re.findall(
+            r"([a-z]+)\.?\s+(\d\d?),?\s*(\d\d\d\d)?", clean_date
+        )[0]
+        month = month_map[month[:3]]
+        year = year if len(year) == 4 else default_year
+        return f"{month} {day} {year}"
+
+    def _normalize_time(self, time_str):
+        """Normalize time format. Sometimes it comes with colons or periods,
+        sometimes not"""
+        times = re.findall(self.time_re, time_str)
+        if len(times) == 0:
+            return None
+        hour, minute, ampm = times[0]
+        if len(minute.strip(":")) > 0:
+            minute = minute.strip(":")
+        else:
+            minute = "00"
+        ampm = ampm.replace(".", "")
+        return f"{hour}:{minute} {ampm}"
+
+    def _parse_start(self, item, default_year):
+        """Parse start datetime as a naive datetime object."""
+
+        # try to find the date in the first column, and if it isn't there, fall back
+        # to the third
+        day = " ".join(item[0].css("* ::text").getall())
+        if not re.search(r"\d", day):
+            day = " ".join(item[2].css("* ::text").getall())
+        clean_day = self._normalize_date(day, default_year)
+        time = " ".join(item[1].css("* ::text").getall()).lower()
+        clean_time = self._normalize_time(time)
+        if clean_time is not None:
+            return datetime.strptime(f"{clean_day} {clean_time}", "%B %d %Y %I:%M %p")
+        # fall back to midnight if no time specified
+        return datetime.strptime(clean_day, "%B %d %Y")
+
+    def _parse_end(self, item):
+        """Parse end datetime as a naive datetime object. Added by pipeline if None.
+        Not available for this website."""
+        return None
+
+    def _parse_time_notes(self, item):
+        """Parse any additional notes on the timing of the meeting"""
+        return ""
+
+    def _parse_all_day(self, item):
+        """Parse or generate all-day status. Defaults to False. Doesn't seem to occur
+        for this website, with the possible exception of the retreats, which aren't
+        quite all day"""
+        return False
+
+    def _parse_location(self, item):
+        """Parse or generate location."""
+        unclean_location_cell_content = item[1].css("* ::text").getall()
+        # remove time if present, and clean
+        location_cell_content = []
+        for line in unclean_location_cell_content:
+            line = re.sub(self.time_re, "", line)
+            line = line.strip().strip("-").strip()
+            if len(line) > 0:
+                location_cell_content.append(line)
+        # It's not obvious whether the first line of the location_cell_content
+        # is a location name or address, so the rest of this method uses heuristics
+        # for this
+        default_name = "Governors State University"
+        default_address = "1 University Pkwy,\nUniversity Park, IL 60484"
+        name, address = default_name, default_address
+        # If the event was postponed or canceled, we will handle that in the
+        # event status, and can just use the defaults here
+        for elt in location_cell_content:
+            if ("postponed" in elt.lower()) or ("canceled" in elt.lower()):
+                return {"address": default_address, "name": default_name}
+        # If there is no name, just the address, we'll use the first line
+        # of the address as the name.
+        if len(location_cell_content) > 0:
+            name = location_cell_content[0]
+        # no obvious way to differentiate location names from addresses other than
+        # presence of numbers. We'll assume that the first line is title-only if it
+        # contains no numbers, otherwise that it begins the address.
+        if re.search(r"\d", name):
+            address = "\n".join(location_cell_content)
+        elif len(location_cell_content) > 1:
+            address = "\n".join(location_cell_content[1:])
+        # Room may end up in either the name or address; if it's present, we want to
+        # make sure it's part of the address. Sometimes room numbers appear without
+        # room, as a single word (see G330 in 2017) so handle them the same way
+        if "room " in address.lower():
+            address = address + "\n" + default_address
+        if "room " in name.lower():
+            if "room " not in address.lower():
+                address = name + "\n" + address
+            name = default_name
+        # special case for covid -- make sure zoom meetings don't show the university
+        # address!
+        if ("zoom" in name.lower()) or ("zoom" in address.lower()):
+            address = "Zoom"
+            name = "Zoom"
+        elif "location tbd" in name.lower():
+            address = name
+        # in some cases a one-word "address" like G330 in 2017 can make it through,
+        # so fall back to the default here as well
+        elif len(address.split()) == 1:
+            address = address + "\n" + default_address
+        return {
+            "address": address,
+            "name": name,
+        }
+
+    def _parse_links(self, item, response):
+        """Parse or generate links."""
+        links = []
+        # the links to the agenda, if present, are in the third and fourth columns
+        for col in [2, 3]:
+            for link_parent in item[col].xpath("a"):
+                link_ext = link_parent.css("::attr(href)").get()
+                if link_ext is not None:
+                    link = response.urljoin(link_ext)
+                    title = link_parent.xpath("text()").get()
+                    links.append({"href": link, "title": title})
+        return links
+
+    def _parse_source(self, response):
+        """Parse or generate source."""
+        return response.url
diff --git a/tests/files/il_governors_state_university.html b/tests/files/il_governors_state_university.html
diff --git a/tests/test_il_governors_state_university.py b/tests/test_il_governors_state_university.py
@@ -0,0 +1,149 @@
+from datetime import datetime
+from os.path import dirname, join
+
+import pytest
+from city_scrapers_core.constants import BOARD, CANCELLED, COMMITTEE, PASSED
+from city_scrapers_core.utils import file_response
+from freezegun import freeze_time
+
+from city_scrapers.spiders.il_governors_state_university import (
+    IlGovernorsStateUniversitySpider,
+)
+
+test_response = file_response(
+    join(dirname(__file__), "files", "il_governors_state_university.html"),
+    url="https://www.govst.edu/BOT-Meetings/",
+)
+spider = IlGovernorsStateUniversitySpider()
+
+freezer = freeze_time("2020-09-26")
+freezer.start()
+
+parsed_items = [item for item in spider.parse(test_response)]
+
+freezer.stop()
+
+
+def test_title():
+    # br-separated
+    assert parsed_items[0]["title"] == "Budget and Finance Committee"
+    # div-separated
+    assert parsed_items[2]["title"] == "Human Resources Committee"
+    # no title, only date, we fall back to default
+    assert parsed_items[4]["title"] == "Board of Trustees"
+    # comma-separated
+    assert parsed_items[17]["title"] == "Annual Retreat"
+    # special board meeting keeps the word "meeting"
+    assert parsed_items[1]["title"] == "Special Board Meeting"
+
+
+def test_description():
+    assert parsed_items[0]["description"] == ""
+
+
+def test_start():
+    assert parsed_items[0]["start"] == datetime(2020, 1, 27, 9, 0)
+    assert parsed_items[2]["start"] == datetime(2020, 2, 7, 9, 0)
+    assert parsed_items[4]["start"] == datetime(2020, 2, 14, 8, 30)
+    assert parsed_items[12]["start"] == datetime(2020, 5, 15, 9, 0)
+    # starting in 2018, sometimes dates do not have years. In case this starts up
+    # again, check that these properly get 2018 as their year
+    assert parsed_items[33]["start"] == datetime(2018, 2, 22, 9, 0)
+
+
+def test_end():
+    # unused
+    assert parsed_items[0]["end"] is None
+
+
+def test_time_notes():
+    # unused
+    assert parsed_items[0]["time_notes"] == ""
+
+
+def test_status():
+    assert parsed_items[0]["status"] == PASSED
+    assert parsed_items[10]["status"] == CANCELLED
+    assert parsed_items[22]["status"] == CANCELLED
+
+
+def test_location():
+    assert parsed_items[0]["location"] == {
+        "name": "Engbretson Hall",
+        "address": "1 University Pkwy,\nUniversity Park, IL 60484",
+    }
+    assert parsed_items[2]["location"] == {
+        "name": "70 W. Madison Street",
+        "address": "70 W. Madison Street\nSuite 4300\nChicago, IL",
+    }
+    assert parsed_items[7]["location"] == {"name": "Zoom", "address": "Zoom"}
+    # check room reformatting
+    assert parsed_items[41]["location"] == {
+        "name": "Governors State University",
+        "address": "Room D34000\n1 University Pkwy,\nUniversity Park, IL 60484",
+    }
+    # check postponement reformatting
+    assert parsed_items[10]["location"] == {
+        "name": "Governors State University",
+        "address": "1 University Pkwy,\nUniversity Park, IL 60484",
+    }
+
+
+def test_source():
+    assert parsed_items[0]["source"] == "https://www.govst.edu/BOT-Meetings/"
+
+
+def test_links():
+    # normal fully populated row: one agenda, one minutes
+    assert parsed_items[0]["links"] == [
+        {
+            "href": (
+                "https://www.govst.edu/uploadedFiles/About/University_"
+                "Governance/Board_of_Trustees/budget and finance "
+                "committee agenda, 1-27-19, FINAL.pdf"
+            ),
+            "title": "Budget and Finance Committee Meeting Agenda, 1-27-20",
+        },
+        {
+            "href": (
+                "https://www.govst.edu/uploadedFiles/About/University_"
+                "Governance/Board_of_Trustees/approved minutes - b and f "
+                "committee meeting, 1-27-20 - approved at 3-23-20 b and f "
+                "committee meeting - FINAL.pdf"
+            ),
+            "title": "1-27-20 Budget and Finance Committee - approved meeting minutes",
+        },
+    ]
+    # agenda + notification in column 3
+    assert parsed_items[5]["links"] == [
+        {
+            "href": (
+                "https://www.govst.edu/uploadedFiles/About/"
+                "University_Governance/Board_of_Trustees/agenda, "
+                "executive committee meeting, 3-16-20(1).pdf"
+            ),
+            "title": "Executive Committee Meeting Agenda, 3-16-20",
+        },
+        {
+            "href": (
+                "https://www.govst.edu/uploadedFiles/About/"
+                "University_Governance/Board_of_Trustees/notification "
+                "regarding march 16, 2020 executive committee meeting, "
+                "3-15-20 - FINAL.docx.pdf"
+            ),
+            "title": (
+                "Notification Regarding March 16, 2020 Executive " "Committee Meeting"
+            ),
+        },
+    ]
+
+
+def test_classification():
+    assert parsed_items[0]["classification"] == COMMITTEE
+    assert parsed_items[1]["classification"] == BOARD
+    assert parsed_items[4]["classification"] == BOARD
+
+
+@pytest.mark.parametrize("item", parsed_items)
+def test_all_day(item):
+    assert item["all_day"] is False