Skip to content

Commit

Permalink
add more comments
Browse files: browse the repository at this point in the history
  • Loading branch information
augray committed Oct 4, 2024
1 parent 2e14980 commit 40ba8c8
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions examples/ainews/ingest_ainews.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def get_archive_list_page(page_num: int) -> tuple[list[str], int | None]:


def http_get(url: str) -> str:
"""Perform an HTTP GET request, with retries, and return resulting raw text."""
n_tries = 5
sleep_interval = 1
while n_tries > 0:
Expand All @@ -72,6 +73,7 @@ def http_get(url: str) -> str:


def get_newsletter_text(url: str) -> str:
"""Fetch a newsletter page and clean its text using BeautifulSoup and markdownify."""
raw_text = http_get(url)
page = BeautifulSoup(raw_text, "html.parser")
content = page.find(class_="email-body-content")
Expand All @@ -80,6 +82,7 @@ def get_newsletter_text(url: str) -> str:


def get_newsletter_texts(urls: list[str]) -> list[str]:
"""Get markdown text for all newsletters at the given URLs."""
texts: list[str] = []
for i, url in enumerate(urls):
print(f"Getting url {i + 1}/{len(urls)}")
Expand Down

0 comments on commit 40ba8c8

Please sign in to comment.