Skip to content

Commit 29ac57a

Browse files
committed
clean up meeting data, add action to save meetings
1 parent f8a4851 commit 29ac57a

File tree

8 files changed

+266
-27
lines changed

8 files changed

+266
-27
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
name: 'Setup Python with Poetry'
2+
description: 'Sets up Python environment with Poetry and installs dependencies'
3+
4+
inputs:
5+
python-version:
6+
description: 'Python version to set up'
7+
required: false
8+
default: '3.11'
9+
10+
runs:
11+
using: "composite"
12+
steps:
13+
- name: Set up Python
14+
uses: actions/setup-python@v4
15+
with:
16+
python-version: ${{ inputs.python-version }}
17+
18+
- name: Install Poetry
19+
uses: abatilo/actions-poetry@v2
20+
21+
- name: Configure Poetry
22+
run: poetry config virtualenvs.create true
23+
shell: bash
24+
25+
- name: Load cached Poetry dependencies
26+
uses: actions/cache@v3
27+
with:
28+
path: ~/.cache/pypoetry/virtualenvs
29+
key: poetry-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
30+
restore-keys: |
31+
poetry-${{ runner.os }}-
32+
33+
- name: Install dependencies
34+
run: poetry install --no-interaction --no-root
35+
shell: bash

.github/workflows/run-tests.yml

+3-20
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,10 @@ jobs:
1313
- name: Check out repository
1414
uses: actions/checkout@v4
1515

16-
- name: Set up Python
17-
uses: actions/setup-python@v4
16+
- name: Setup Python environment
17+
uses: ./.github/actions/setup-poetry-env
1818
with:
19-
python-version: '3.11' # Adjust as needed
20-
21-
- name: Install Poetry
22-
uses: abatilo/actions-poetry@v2
23-
24-
- name: Configure Poetry
25-
run: poetry config virtualenvs.create true
26-
27-
- name: Load cached Poetry dependencies
28-
uses: actions/cache@v3
29-
with:
30-
path: ~/.cache/pypoetry/virtualenvs
31-
key: poetry-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
32-
restore-keys: |
33-
poetry-${{ runner.os }}-
34-
35-
- name: Install dependencies
36-
run: poetry install --no-interaction --no-root
19+
python-version: '3.11'
3720

3821
- name: Run tests
3922
run: poetry run pytest

.github/workflows/update_summary.yml

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
name: Update Meetings Summary
2+
3+
on:
4+
schedule:
5+
# Run every day at midnight
6+
- cron: '0 0 * * *'
7+
workflow_dispatch:
8+
# Allow manual trigger
9+
10+
jobs:
11+
update-summary:
12+
runs-on: ubuntu-latest
13+
14+
steps:
15+
- name: Check out repository
16+
uses: actions/checkout@v4
17+
18+
- name: Setup Python environment
19+
uses: ./.github/actions/setup-poetry-env
20+
with:
21+
python-version: '3.11'
22+
23+
- name: Run summary script
24+
run: poetry run python scripts/summary.py
25+
26+
- name: Commit and push changes
27+
uses: stefanzweifel/git-auto-commit-action@v4
28+
with:
29+
commit_message: "Update meetings summary data"
30+
file_pattern: data/meetings.jsonl

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ data/audio/
55
data/video/
66
data/transcripts/
77
models/
8+
# Re-include src/models/, which the models/ rule above would otherwise ignore
9+
!src/models/
10+
811
notebooks/.ipynb_checkpoints/
912

1013
# Python

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ faster-whisper = "1.1.0"
2626
whisperx = {git = "https://github.com/m-bain/whisperx.git", rev = "main", python = ">=3.10,<3.13"}
2727
python-dotenv = "^1.0.1"
2828
aiofiles = "^24.1.0"
29+
pytz = "^2025.1"
2930

3031

3132
[tool.poetry.group.dev.dependencies]

scripts/summary.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Summary generator for Tulsa Government Access Television meetings.
4+
5+
This script retrieves the list of meetings from the TGOV website and
6+
saves them to a JSONL file for further processing or analysis.
7+
"""
8+
import os
9+
import sys
10+
11+
12+
import asyncio
13+
import json
14+
from pathlib import Path
15+
from typing import List, Dict, Any
16+
17+
# Add the parent directory to the path so we can import from src
18+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19+
from src.meetings import get_meetings
20+
from src.models.meeting import Meeting
21+
22+
23+
async def generate_summary() -> List[Dict[str, Any]]:
    """
    Collect every meeting and return it in JSON-ready form.

    Awaits get_meetings() and dumps each returned Meeting model with
    mode="json" so that every value can be passed straight to json.dumps.

    Returns:
        One dictionary per meeting, in the order get_meetings() returned them
    """
    fetched: List[Meeting] = await get_meetings()

    summary: List[Dict[str, Any]] = []
    for item in fetched:
        # mode="json" makes the model emit only JSON-serializable values
        summary.append(item.model_dump(mode="json"))
    return summary
35+
36+
37+
async def save_to_jsonl(meetings: List[Dict[str, Any]], file_path: Path) -> None:
    """
    Save meetings data to a JSONL file (one JSON object per line).

    Any existing file at file_path is overwritten. Missing parent
    directories are created first.

    Args:
        meetings: List of meeting data as JSON-serializable dictionaries
        file_path: Path to the output JSONL file
    """
    # Create directory if it doesn't exist
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # Pin the encoding so the output does not depend on the platform locale,
    # and hand all lines to the buffered writer in a single call.
    with file_path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(meeting) + "\n" for meeting in meetings)

    print(f"Saved {len(meetings)} meetings to {file_path}")
54+
55+
56+
async def main() -> None:
    """Retrieve the meeting list and write it to data/meetings.jsonl."""
    print("Retrieving meetings data...")
    records = await generate_summary()
    print(f"Found {len(records)} meetings")

    # Relative path: resolved against the current working directory
    destination = Path("data/meetings.jsonl")
    await save_to_jsonl(records, destination)
    print("Summary generation complete")
66+
67+
68+
if __name__ == "__main__":
69+
asyncio.run(main())

src/meetings.py

+103-3
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,19 @@
88

99
import asyncio
1010
import json
11-
from typing import Dict, List, Optional, Any
11+
import re
12+
from datetime import datetime
13+
from typing import Dict, List, Optional, Any, Union
1214
from urllib.parse import urljoin
1315

1416
import aiohttp
17+
import pytz
1518
from selectolax.parser import HTMLParser
1619

1720
from .models.meeting import Meeting
1821

1922
BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
23+
CENTRAL_TZ = pytz.timezone("America/Chicago")
2024

2125

2226
async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
@@ -36,7 +40,95 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
3640
return await response.text()
3741

3842

39-
async def parse_meetings(html: str) -> List[Dict[str, str]]:
43+
def parse_date_string(date_str: str) -> Optional[datetime]:
    """
    Parse the date string into a datetime object with Central timezone.

    The expected shape is "March 12, 2025 - 5:00 PM". Full and abbreviated
    month names are accepted, the comma after the day is optional, and the
    AM/PM marker may be upper or lower case.

    Args:
        date_str: The raw date string from HTML

    Returns:
        A datetime localized with CENTRAL_TZ, or None if the string does not
        contain a recognizable date and time or the values do not form a
        valid calendar date / clock time (e.g. "February 30" or "25:00")
    """
    # Replace non-breaking spaces, then collapse whitespace runs to one space
    date_str = re.sub(r"\s+", " ", date_str.replace("\u00a0", " "))

    # Only a real AM/PM marker is accepted, in either case; a loose character
    # class such as [APM]{2} would also match "MM" or "PA".
    match = re.search(
        r"([A-Za-z]+)\s+(\d{1,2}),?\s+(\d{4}).*?(\d{1,2}):(\d{2})\s*([AaPp][Mm])",
        date_str,
    )
    if match is None:
        return None

    month_str, day_str, year_str, hour_str, minute_str, am_pm = match.groups()

    # Convert month name to number: try the full name, then the abbreviation
    month_num = None
    for month_format in ("%B", "%b"):
        try:
            month_num = datetime.strptime(month_str, month_format).month
        except ValueError:
            continue
        break
    if month_num is None:
        return None

    # Convert the 12-hour clock to 24-hour
    hour = int(hour_str)
    meridiem = am_pm.upper()
    if meridiem == "PM" and hour < 12:
        hour += 12
    elif meridiem == "AM" and hour == 12:
        hour = 0

    # datetime() rejects out-of-range values; treat that as a parse failure
    # rather than letting the exception escape.
    try:
        naive_dt = datetime(
            int(year_str), month_num, int(day_str), hour, int(minute_str)
        )
    except ValueError:
        return None

    # Localize to Central Time
    return CENTRAL_TZ.localize(naive_dt)
98+
99+
100+
def clean_date_string(date_str: str) -> str:
    """
    Clean up the date string by removing extra whitespace, newlines, and normalizing formats.

    Args:
        date_str: The raw date string from HTML

    Returns:
        The date in the format "Month Day, Year - H:MM AM/PM" when a date and
        time can be found in the input; otherwise the input with whitespace
        collapsed and stripped
    """
    # Replace non-breaking spaces, then collapse whitespace runs to one space
    date_str = re.sub(r"\s+", " ", date_str.replace("\u00a0", " "))

    # Pattern typically looks like "March 12, 2025 - 5:00 PM". The clock and
    # the AM/PM marker are captured separately so the output can always use a
    # single space and an upper-case marker; only a real AM/PM marker matches.
    match = re.search(
        r"([A-Za-z]+)\s+(\d{1,2}),?\s+(\d{4}).*?(\d{1,2}:\d{2})\s*([AaPp][Mm])",
        date_str,
    )

    if match:
        month, day, year, clock, meridiem = match.groups()
        # Format consistently
        return f"{month} {day}, {year} - {clock} {meridiem.upper()}"

    # If the regex doesn't match, do basic cleanup
    return date_str.strip()
129+
130+
131+
async def parse_meetings(html: str) -> List[Dict[str, Any]]:
40132
"""
41133
Parse the meeting data from the HTML content.
42134
@@ -68,9 +160,17 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
68160
if len(cells) < 5:
69161
continue
70162

163+
# Parse the date string into a datetime object
164+
date_text = cells[1].text()
165+
date_obj = parse_date_string(date_text)
166+
167+
# Get a cleaned date string as a fallback
168+
date_str = clean_date_string(date_text)
169+
71170
meeting_data = {
72171
"meeting": cells[0].text().strip(),
73-
"date": cells[1].text().strip(),
172+
"date": date_obj.isoformat() if date_obj else date_str,
173+
"date_display": date_str, # Keep a human-readable version
74174
"duration": cells[2].text().strip(),
75175
"agenda": None,
76176
"video": None,

src/models/meeting.py

+22-4
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
"""
44

55
from datetime import datetime
6-
from typing import Optional
6+
from typing import Optional, Union
77

8-
from pydantic import BaseModel, Field, HttpUrl
8+
from pydantic import BaseModel, Field, HttpUrl, validator
99

1010

1111
class Meeting(BaseModel):
@@ -14,14 +14,32 @@ class Meeting(BaseModel):
1414
"""
1515

1616
meeting: str = Field(description="Name of the meeting")
17-
date: str = Field(description="Date and time of the meeting")
17+
date: str = Field(
18+
description="ISO-formatted date and time of the meeting with timezone"
19+
)
20+
date_display: Optional[str] = Field(
21+
None, description="Human-readable date and time format"
22+
)
1823
duration: str = Field(description="Duration of the meeting")
1924
agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
2025
video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")
2126

27+
@validator("date_display", pre=True, always=True)
28+
def set_date_display(cls, v, values):
29+
"""Set date_display to a readable format if not provided"""
30+
if v is None and "date" in values:
31+
# If the date is in ISO format, try to make it more readable
32+
try:
33+
dt = datetime.fromisoformat(values["date"])
34+
return dt.strftime("%B %d, %Y - %I:%M %p")
35+
except (ValueError, TypeError):
36+
return values["date"]
37+
return v
38+
2239
def __str__(self) -> str:
2340
"""String representation of the meeting"""
24-
return f"{self.meeting} - {self.date} ({self.duration})"
41+
display_date = self.date_display or self.date
42+
return f"{self.meeting} - {display_date} ({self.duration})"
2543

2644

2745
class GranicusPlayerPage(BaseModel):

0 commit comments

Comments
 (0)