diff --git a/Makefile b/Makefile index 4fb9170..c8a2f73 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ .PHONY: clean install-dev lint type-check unit-test format -DIRS_WITH_CODE = src +DIRS_WITH_CODE = src/ tests/ clean: rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage diff --git a/src/helpers.py b/src/helpers.py index 7efb03c..22922d7 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -12,14 +12,14 @@ if TYPE_CHECKING: from apify_client.clients import KeyValueStoreClientAsync - from src.mytypes import LinkDict, LLMSData + from src.mytypes import LLMSData # not using Actor.log because pytest then throws a warning # about non existent event loop logger = logging.getLogger('apify') -def clean_llms_data(data: LLMSData, section_min_links: int = 2) -> LLMSData: +def clean_llms_data(data: LLMSData, section_min_links: int = 2) -> None: """Cleans the LLMS data by removing sections with low link count and moving the links to the index section. :param data: LLMS data to clean @@ -27,28 +27,25 @@ def clean_llms_data(data: LLMSData, section_min_links: int = 2) -> LLMSData: and not move the links to the index section """ to_remove_sections: set[str] = set() - to_index_links: list[LinkDict] = [] if 'sections' not in data: raise ValueError('Missing "sections" attribute in the LLMS data!') sections = data['sections'] - for section in sections.values(): + for section_dir, section in sections.items(): + # skip the index section + if section_dir == '/': + continue if len(section['links']) < section_min_links: - to_index_links.extend(section['links']) - to_remove_sections.add(section['title']) + to_remove_sections.add(section_dir) - if to_index_links: + if to_remove_sections: if '/' not in sections: - sections['/'] = {'title': 'Index', 'links': to_index_links} - else: - sections['/']['links'].extend(to_index_links) - - for section_name in to_remove_sections: - del sections[section_name] - - return data + sections['/'] = {'title': 'Index', 'links': []} + for section_dir in to_remove_sections: + sections['/']['links'].extend(sections[section_dir]['links']) + del sections[section_dir] def get_url_path_dir(url: str) -> str: diff --git a/src/main.py b/src/main.py index 1ed15c1..015e79d 100644 --- a/src/main.py +++ b/src/main.py @@ -155,7 +155,7 @@ async def main() -> None: raise RuntimeError(msg) # move sections with less than SECTION_MIN_LINKS to the root - data = clean_llms_data(data) + clean_llms_data(data) output = render_llms_txt(data) # save into kv-store as a file to be able to download it diff --git a/tests/test_helpers.py b/tests/test_helpers.py index d25b718..787705d 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,4 +1,86 @@ -from src.helpers import get_hostname_path_string_from_url, get_url_path_dir, normalize_url +from typing import TYPE_CHECKING + +from src.helpers import clean_llms_data, get_hostname_path_string_from_url, get_url_path_dir, normalize_url + +if TYPE_CHECKING: + from src.mytypes import LLMSData + + +def test_clean_llms_data() -> None: + # Test case 1: Normal case where sections with fewer links are moved to index + data: LLMSData = { + 'title': 'Test LLMS', + 'description': None, + 'details': None, + 'sections': { + 'section_1': { + 'title': 'Section 1', + 'links': [{'url': 'http://example.com', 'title': 'Example', 'description': None}], + }, + 'section_2': { + 'title': 'Section 2', + 'links': [ + {'url': 'http://example2.com', 'title': 'Example 2', 'description': None}, + {'url': 'http://example3.com', 'title': 'Example 3', 'description': None}, + ], + }, + }, + } + + clean_llms_data(data, section_min_links=2) + + assert 'section_1' not in data['sections'] # Section 1 should be removed + assert 'section_2' in data['sections'] # Section 2 should remain + assert '/' in data['sections'] # Index section should be created + assert len(data['sections']['/']['links']) == 1 # The link from section_1 should be moved to index + + # Test case 2: If all sections meet the minimum link count, nothing changes + data2: LLMSData = { + 'title': 'Test LLMS', + 'description': None, + 'details': None, + 'sections': { + 'section_1': { + 'title': 'Section 1', + 'links': [ + {'url': 'http://example.com', 'title': 'Example', 'description': None}, + {'url': 'http://example2.com', 'title': 'Example 2', 'description': None}, + ], + } + }, + } + + clean_llms_data(data2, section_min_links=2) + + assert 'section_1' in data2['sections'] # Section 1 should remain + assert '/' not in data2['sections'] # Index section should not be created + + # Test case 4: Empty sections dictionary + data4: LLMSData = {'title': 'Empty LLMS', 'description': None, 'details': None, 'sections': {}} + + clean_llms_data(data4, section_min_links=2) + + assert data4['sections'] == {} # Sections should remain empty + + # Test case 5: Sections already have an index section + data5: LLMSData = { + 'title': 'LLMS with Index', + 'description': None, + 'details': None, + 'sections': { + '/': {'title': 'Index', 'links': [{'url': 'http://index.com', 'title': 'Index Link', 'description': None}]}, + 'section_1': { + 'title': 'Section 1', + 'links': [{'url': 'http://example.com', 'title': 'Example', 'description': None}], + }, + }, + } + + clean_llms_data(data5, section_min_links=2) + + assert 'section_1' not in data5['sections'] # Section 1 should be removed + assert '/' in data5['sections'] # Index should remain + assert len(data5['sections']['/']['links']) == 2 # Index should now contain both the old and new links def test_get_url_path_dir() -> None: @@ -14,6 +96,7 @@ def test_get_url_path_dir() -> None: _dir3 = '/dir' assert get_url_path_dir(url3) == _dir3 + def test_normalize_url() -> None: url = 'https://example.com/' url_normalized = 'https://example.com' diff --git a/tests/test_renderer.py b/tests/test_renderer.py index 5bc9af6..21cc499 100644 --- a/tests/test_renderer.py +++ b/tests/test_renderer.py @@ -12,16 +12,17 @@ def test_render_llms_txt() -> None: 'details': None, 'description': None, 'sections': { - '/': { - 'title': 'Index', - 'links': [ - { - 'url': 'https://docs.apify.com/academy', - 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' - } - ] - } - } + '/': { + 'title': 'Index', + 'links': [ + { + 'url': 'https://docs.apify.com/academy', + 'title': 'Web Scraping Academy', + 'description': 'Learn everything about web scraping.', + } + ], + } + }, } expected_output = """# docs.apify.com @@ -34,22 +35,24 @@ def test_render_llms_txt() -> None: assert render_llms_txt(data) == expected_output + def test_render_llms_txt_with_description() -> None: data: LLMSData = { 'title': 'docs.apify.com', 'description': 'Apify documentation', 'details': None, 'sections': { - '/': { - 'title': 'Index', - 'links': [ - { - 'url': 'https://docs.apify.com/academy', - 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' - } - ] - } - } + '/': { + 'title': 'Index', + 'links': [ + { + 'url': 'https://docs.apify.com/academy', + 'title': 'Web Scraping Academy', + 'description': 'Learn everything about web scraping.', + } + ], + } + }, } expected_output = """# docs.apify.com @@ -64,22 +67,24 @@ def test_render_llms_txt_with_description() -> None: assert render_llms_txt(data) == expected_output + def test_render_llms_txt_with_description_and_details() -> None: data: LLMSData = { 'title': 'docs.apify.com', 'description': 'Apify documentation', 'details': 'This is the documentation for Apify', 'sections': { - '/': { - 'title': 'Index', - 'links': [ - { - 'url': 'https://docs.apify.com/academy', - 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' - } - ] - } - } + '/': { + 'title': 'Index', + 'links': [ + { + 'url': 'https://docs.apify.com/academy', + 'title': 'Web Scraping Academy', + 'description': 'Learn everything about web scraping.', + } + ], + } + }, } expected_output = """# docs.apify.com @@ -96,13 +101,9 @@ def test_render_llms_txt_with_description_and_details() -> None: assert render_llms_txt(data) == expected_output + def test_render_llms_txt_with_no_sections() -> None: - data: LLMSData = { - 'title': 'docs.apify.com', - 'description': 'Apify documentation', - 'details': None, - 'sections': {} - } + data: LLMSData = {'title': 'docs.apify.com', 'description': 'Apify documentation', 'details': None, 'sections': {}} expected_output = """# docs.apify.com @@ -112,31 +113,34 @@ def test_render_llms_txt_with_no_sections() -> None: assert render_llms_txt(data) == expected_output + def test_render_llms_txt_with_multiple_sections() -> None: data: LLMSData = { 'title': 'docs.apify.com', 'description': 'Apify documentation', 'details': None, 'sections': { - '/': { - 'title': 'Index', - 'links': [ - { - 'url': 'https://docs.apify.com/academy', - 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' - } - ] - }, - '/guides': { - 'title': 'Guides', - 'links': [ - { - 'url': 'https://docs.apify.com/guides/getting-started', - 'title': 'Getting Started', 'description': 'Learn how to get started with Apify.' - } - ] - } - } + '/': { + 'title': 'Index', + 'links': [ + { + 'url': 'https://docs.apify.com/academy', + 'title': 'Web Scraping Academy', + 'description': 'Learn everything about web scraping.', + } + ], + }, + '/guides': { + 'title': 'Guides', + 'links': [ + { + 'url': 'https://docs.apify.com/guides/getting-started', + 'title': 'Getting Started', + 'description': 'Learn how to get started with Apify.', + } + ], + }, + }, } expected_output = """# docs.apify.com