Skip to content

Commit

Permalink
improve code, add clean_llms_data tests
Browse files Browse the repository at this point in the history
  • Loading branch information
MQ37 committed Jan 27, 2025
1 parent 63238df commit fde01d9
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 73 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.PHONY: clean install-dev lint type-check unit-test format

DIRS_WITH_CODE = src
DIRS_WITH_CODE = src/ tests/

clean:
rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage
Expand Down
27 changes: 12 additions & 15 deletions src/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,43 +12,40 @@
if TYPE_CHECKING:
from apify_client.clients import KeyValueStoreClientAsync

from src.mytypes import LinkDict, LLMSData
from src.mytypes import LLMSData

# Not using Actor.log here because pytest then emits a warning
# about a non-existent event loop.
logger = logging.getLogger('apify')


def clean_llms_data(data: LLMSData, section_min_links: int = 2) -> None:
    """Clean the LLMS data in place by folding sparse sections into the index section.

    Every section other than the index section (key ``'/'``) that has fewer than
    ``section_min_links`` links is removed, and its links are appended to the index
    section. The index section is created (titled ``'Index'``) only when at least
    one section has to be folded into it and it does not exist yet.

    The function mutates ``data`` and returns ``None``.

    :param data: LLMS data to clean
    :param section_min_links: Minimum number of links in a section to keep it
        and not move the links to the index section
    :raises ValueError: If ``data`` has no "sections" key
    """
    if 'sections' not in data:
        raise ValueError('Missing "sections" attribute in the LLMS data!')

    sections = data['sections']

    # Collected as a list in dict order (not a set): iterating a set of strings
    # depends on hash randomization, which would make the order of the links
    # appended to the index section differ between runs.
    to_remove_sections = [
        section_dir
        for section_dir, section in sections.items()
        # the index section is the merge target and is never removed
        if section_dir != '/' and len(section['links']) < section_min_links
    ]
    if not to_remove_sections:
        return

    if '/' not in sections:
        sections['/'] = {'title': 'Index', 'links': []}

    index_links = sections['/']['links']
    for section_dir in to_remove_sections:
        index_links.extend(sections.pop(section_dir)['links'])


def get_url_path_dir(url: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ async def main() -> None:
raise RuntimeError(msg)

# move links from sections with fewer than SECTION_MIN_LINKS links to the index section
data = clean_llms_data(data)
clean_llms_data(data)
output = render_llms_txt(data)

# save into kv-store as a file to be able to download it
Expand Down
85 changes: 84 additions & 1 deletion tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,86 @@
from src.helpers import get_hostname_path_string_from_url, get_url_path_dir, normalize_url
from typing import TYPE_CHECKING

from src.helpers import clean_llms_data, get_hostname_path_string_from_url, get_url_path_dir, normalize_url

if TYPE_CHECKING:
from src.mytypes import LLMSData


def test_clean_llms_data() -> None:
    """Check that clean_llms_data folds sparse sections into the index section in place.

    Covers four scenarios: a sparse section next to a sufficiently large one,
    only large sections, an empty sections mapping, and a pre-existing index
    section that receives the moved links.
    """
    # Test case 1: Normal case where sections with fewer links are moved to index
    data: LLMSData = {
        'title': 'Test LLMS',
        'description': None,
        'details': None,
        'sections': {
            'section_1': {
                'title': 'Section 1',
                'links': [{'url': 'http://example.com', 'title': 'Example', 'description': None}],
            },
            'section_2': {
                'title': 'Section 2',
                'links': [
                    {'url': 'http://example2.com', 'title': 'Example 2', 'description': None},
                    {'url': 'http://example3.com', 'title': 'Example 3', 'description': None},
                ],
            },
        },
    }

    clean_llms_data(data, section_min_links=2)

    assert 'section_1' not in data['sections']  # Section 1 should be removed
    assert 'section_2' in data['sections']  # Section 2 should remain
    assert len(data['sections']['section_2']['links']) == 2  # Section 2 links should be untouched
    assert '/' in data['sections']  # Index section should be created
    # Exactly the link from section_1 should be moved to the new index section
    assert data['sections']['/'] == {
        'title': 'Index',
        'links': [{'url': 'http://example.com', 'title': 'Example', 'description': None}],
    }

    # Test case 2: If all sections meet the minimum link count, nothing changes
    data2: LLMSData = {
        'title': 'Test LLMS',
        'description': None,
        'details': None,
        'sections': {
            'section_1': {
                'title': 'Section 1',
                'links': [
                    {'url': 'http://example.com', 'title': 'Example', 'description': None},
                    {'url': 'http://example2.com', 'title': 'Example 2', 'description': None},
                ],
            }
        },
    }

    clean_llms_data(data2, section_min_links=2)

    assert 'section_1' in data2['sections']  # Section 1 should remain
    assert len(data2['sections']['section_1']['links']) == 2  # Its links should be untouched
    assert '/' not in data2['sections']  # Index section should not be created

    # Test case 3: Empty sections dictionary
    data3: LLMSData = {'title': 'Empty LLMS', 'description': None, 'details': None, 'sections': {}}

    clean_llms_data(data3, section_min_links=2)

    assert data3['sections'] == {}  # Sections should remain empty

    # Test case 4: Sections already have an index section
    data4: LLMSData = {
        'title': 'LLMS with Index',
        'description': None,
        'details': None,
        'sections': {
            '/': {'title': 'Index', 'links': [{'url': 'http://index.com', 'title': 'Index Link', 'description': None}]},
            'section_1': {
                'title': 'Section 1',
                'links': [{'url': 'http://example.com', 'title': 'Example', 'description': None}],
            },
        },
    }

    clean_llms_data(data4, section_min_links=2)

    assert 'section_1' not in data4['sections']  # Section 1 should be removed
    assert '/' in data4['sections']  # Index should remain even though it is below the minimum itself
    # Index should now contain the old link followed by the moved one
    assert [link['url'] for link in data4['sections']['/']['links']] == ['http://index.com', 'http://example.com']


def test_get_url_path_dir() -> None:
Expand All @@ -14,6 +96,7 @@ def test_get_url_path_dir() -> None:
_dir3 = '/dir'
assert get_url_path_dir(url3) == _dir3


def test_normalize_url() -> None:
url = 'https://example.com/'
url_normalized = 'https://example.com'
Expand Down
114 changes: 59 additions & 55 deletions tests/test_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@ def test_render_llms_txt() -> None:
'details': None,
'description': None,
'sections': {
'/': {
'title': 'Index',
'links': [
{
'url': 'https://docs.apify.com/academy',
'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.'
}
]
}
}
'/': {
'title': 'Index',
'links': [
{
'url': 'https://docs.apify.com/academy',
'title': 'Web Scraping Academy',
'description': 'Learn everything about web scraping.',
}
],
}
},
}

expected_output = """# docs.apify.com
Expand All @@ -34,22 +35,24 @@ def test_render_llms_txt() -> None:

assert render_llms_txt(data) == expected_output


def test_render_llms_txt_with_description() -> None:
data: LLMSData = {
'title': 'docs.apify.com',
'description': 'Apify documentation',
'details': None,
'sections': {
'/': {
'title': 'Index',
'links': [
{
'url': 'https://docs.apify.com/academy',
'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.'
}
]
}
}
'/': {
'title': 'Index',
'links': [
{
'url': 'https://docs.apify.com/academy',
'title': 'Web Scraping Academy',
'description': 'Learn everything about web scraping.',
}
],
}
},
}

expected_output = """# docs.apify.com
Expand All @@ -64,22 +67,24 @@ def test_render_llms_txt_with_description() -> None:

assert render_llms_txt(data) == expected_output


def test_render_llms_txt_with_description_and_details() -> None:
data: LLMSData = {
'title': 'docs.apify.com',
'description': 'Apify documentation',
'details': 'This is the documentation for Apify',
'sections': {
'/': {
'title': 'Index',
'links': [
{
'url': 'https://docs.apify.com/academy',
'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.'
}
]
}
}
'/': {
'title': 'Index',
'links': [
{
'url': 'https://docs.apify.com/academy',
'title': 'Web Scraping Academy',
'description': 'Learn everything about web scraping.',
}
],
}
},
}

expected_output = """# docs.apify.com
Expand All @@ -96,13 +101,9 @@ def test_render_llms_txt_with_description_and_details() -> None:

assert render_llms_txt(data) == expected_output


def test_render_llms_txt_with_no_sections() -> None:
data: LLMSData = {
'title': 'docs.apify.com',
'description': 'Apify documentation',
'details': None,
'sections': {}
}
data: LLMSData = {'title': 'docs.apify.com', 'description': 'Apify documentation', 'details': None, 'sections': {}}

expected_output = """# docs.apify.com
Expand All @@ -112,31 +113,34 @@ def test_render_llms_txt_with_no_sections() -> None:

assert render_llms_txt(data) == expected_output


def test_render_llms_txt_with_multiple_sections() -> None:
data: LLMSData = {
'title': 'docs.apify.com',
'description': 'Apify documentation',
'details': None,
'sections': {
'/': {
'title': 'Index',
'links': [
{
'url': 'https://docs.apify.com/academy',
'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.'
}
]
},
'/guides': {
'title': 'Guides',
'links': [
{
'url': 'https://docs.apify.com/guides/getting-started',
'title': 'Getting Started', 'description': 'Learn how to get started with Apify.'
}
]
}
}
'/': {
'title': 'Index',
'links': [
{
'url': 'https://docs.apify.com/academy',
'title': 'Web Scraping Academy',
'description': 'Learn everything about web scraping.',
}
],
},
'/guides': {
'title': 'Guides',
'links': [
{
'url': 'https://docs.apify.com/guides/getting-started',
'title': 'Getting Started',
'description': 'Learn how to get started with Apify.',
}
],
},
},
}

expected_output = """# docs.apify.com
Expand Down

0 comments on commit fde01d9

Please sign in to comment.