Skip to content

Commit

Permalink
extend crawl with ordered lists
Browse files Browse the repository at this point in the history
  • Loading branch information
teotoplak committed Mar 2, 2023
1 parent a52852e commit 5b1d215
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 0 deletions.
Binary file modified src/crawl/__pycache__/crawl.cpython-311.pyc
Binary file not shown.
7 changes: 7 additions & 0 deletions src/crawl/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def crawl_url(url):
lists = soup.find_all('ul')
# find all tables on the page
tables = soup.find_all('table')
ordered_lists = soup.find_all('ol')

text_chunks = []

Expand All @@ -28,6 +29,12 @@ def crawl_url(url):
text_chunks.append(li.get_text())
print(li.get_text() + "\n")

# loop through the ordered lists, collecting and printing each list item
for ol in ordered_lists:
for li in ol.find_all('li'):
text_chunks.append(li.get_text())
print(li.get_text() + "\n")

# loop through the tables and print the table cells
for table in tables:
for row in table.find_all('tr'):
Expand Down
Binary file modified tst/crawl/__pycache__/test_crawl.cpython-311-pytest-7.2.1.pyc
Binary file not shown.
1 change: 1 addition & 0 deletions tst/crawl/test_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def test_education_fact_sheet():
chunks = crawl_url(url)
expected_chunks = [
"*Üldhariduses keskmine õpilaste arv õpetaja kohta: 9,8 (HTM), regionaalselt väga suur kõikumine – mõnes koolis 2 õpilast õpetaja kohta (maakoolid), teises 20+ õpilast õpetaja kohta (Tallinn).",
"Töötasu kasvatamine (Eesti 200: 3000eur; Isamaa: õpetaja keskmine Eesti keskmisest 125%; Sotsdem: miinimumpalk vähemalt 130% eelneva aasta Eesti keskmisest palgast; Keskerakond: 3000eur)"
]
for chunk in expected_chunks:
assert chunk in chunks

0 comments on commit 5b1d215

Please sign in to comment.