From 9bbb95de31dcb79e6f9ab568544893432a065d95 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 12:29:07 -0500 Subject: [PATCH 01/11] Validate exclusion regex on add, return 422 if invalid --- backend/btrixcloud/crawls.py | 8 ++++++++ backend/test/test_run_crawl.py | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 539c408ee6..9d7ec27eb7 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -517,6 +517,14 @@ async def add_or_remove_exclusion( """add new exclusion to config or remove exclusion from config for given crawl_id, update config on crawl""" + if add: + try: + re.compile(regex) + except re.error: + raise HTTPException( + status_code=422, detail="invalid_regular_expression" + ) + crawl = await self.get_crawl(crawl_id, org) if crawl.state not in RUNNING_AND_WAITING_STATES: diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 36f0237f7d..9f1e6e6b5e 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id): assert r.json()["success"] == True +def test_add_invalid_exclusion(admin_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[", + headers=admin_auth_headers, + ) + assert r.status_code == 422 + assert r.json()["detail"] == "invalid_regular_expression" + + def test_remove_exclusion(admin_auth_headers, default_org_id): r = requests.delete( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test", From 6c3a5e7a992d93601a4e6629c299682cc70ae552 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 12:33:15 -0500 Subject: [PATCH 02/11] Modify exception detail for invalid regex on frontend --- frontend/src/features/crawl-workflows/exclusion-editor.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/features/crawl-workflows/exclusion-editor.ts b/frontend/src/features/crawl-workflows/exclusion-editor.ts index a1e93cfdce..6c4384c29b 100644 --- a/frontend/src/features/crawl-workflows/exclusion-editor.ts +++ b/frontend/src/features/crawl-workflows/exclusion-editor.ts @@ -276,7 +276,7 @@ export class ExclusionEditor extends LiteElement { if (isApiError(e)) { if (e.message === "exclusion_already_exists") { this.exclusionFieldErrorMessage = msg("Exclusion already exists"); - } else if (e.message === "invalid_regex") { + } else if (e.message === "invalid_regular_expression") { this.exclusionFieldErrorMessage = msg("Invalid Regex"); } } else { From 0cd18b34894be4613f216eb1164f9d6798a70908 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 12:36:51 -0500 Subject: [PATCH 03/11] Add pylint disable for raise-missing-from --- backend/btrixcloud/crawls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 9d7ec27eb7..ae536993c9 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -521,6 +521,7 @@ async def add_or_remove_exclusion( try: re.compile(regex) except re.error: + # pylint: disable=raise-missing-from raise HTTPException( status_code=422, detail="invalid_regular_expression" ) From 997615458dc507d5b806fd3f55243ca76425aef7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 22:57:01 -0500 Subject: [PATCH 04/11] Valid regex on workflow creation --- backend/btrixcloud/crawlconfigs.py | 13 +++++++++++++ backend/test/test_crawlconfigs.py | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 8d231733c7..71d0eedcbf 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -215,6 +215,19 @@ async def add_crawl_config( if not self.can_org_use_proxy(org, config_in.proxyId): raise HTTPException(status_code=404, detail="proxy_not_found") + if config_in.config.exclude: + exclude = config_in.config.exclude + if isinstance(exclude, str): + exclude = [exclude] + for regex in exclude: + try: + re.compile(regex) + except re.error: + # pylint: disable=raise-missing-from + raise HTTPException( + status_code=422, detail="invalid_regular_expression" + ) + now = dt_now() crawlconfig = CrawlConfig( id=uuid4(), diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 967f32db33..955f8b1769 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -487,3 +487,23 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id): for crawler_channel in crawler_channels: assert crawler_channel["id"] assert crawler_channel["image"] + + +def test_exclude_invalid_regex(crawler_auth_headers, default_org_id, sample_crawl_data): + sample_crawl_data["exclude"] = "[" + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 422 + assert r.json()["detail"] == "invalid_regular_expression" + + sample_crawl_data["exclude"] = ["abc.*", "["] + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 422 + assert r.json()["detail"] == "invalid_regular_expression" From 4317a1cb15d608378af62b8d24534c24ed19f807 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 23:07:21 -0500 Subject: [PATCH 05/11] Validate exclude regex on crawlconfig update --- backend/btrixcloud/crawlconfigs.py | 17 ++++++++--------- backend/btrixcloud/crawls.py | 9 ++------- backend/btrixcloud/utils.py | 10 ++++++++++ backend/test/test_crawlconfigs.py | 24 +++++++++++++++++++++++- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 71d0eedcbf..51c54b1b42 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -43,7 +43,7 @@ CrawlerProxy, CrawlerProxies, ) -from .utils import dt_now, slug_from_name +from .utils import dt_now, slug_from_name, validate_regexes if TYPE_CHECKING: from .orgs import OrgOps @@ -219,14 +219,7 @@ async def add_crawl_config( exclude = config_in.config.exclude if isinstance(exclude, str): exclude = [exclude] - for regex in exclude: - try: - re.compile(regex) - except re.error: - # pylint: disable=raise-missing-from - raise HTTPException( - status_code=422, detail="invalid_regular_expression" - ) + validate_regexes(exclude) now = dt_now() crawlconfig = CrawlConfig( @@ -335,6 +328,12 @@ async def update_crawl_config( orig_crawl_config = await self.get_crawl_config(cid, org.id) + if update.config.exclude: + exclude = update.config.exclude + if isinstance(exclude, str): + exclude = [exclude] + validate_regexes(exclude) + # indicates if any k8s crawl config settings changed changed = False changed = changed or ( diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index ae536993c9..92e4c43f85 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -24,6 +24,7 @@ date_to_str, parse_jsonl_error_messages, stream_dict_list_as_csv, + validate_regexes, ) from .basecrawls import BaseCrawlOps from .crawlmanager import CrawlManager @@ -518,13 +519,7 @@ async def add_or_remove_exclusion( for given crawl_id, update config on crawl""" if add: - try: - re.compile(regex) - except re.error: - # pylint: disable=raise-missing-from - raise HTTPException( - status_code=422, detail="invalid_regular_expression" - ) + validate_regexes([regex]) crawl = await self.get_crawl(crawl_id, org) diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index 8a555a4c7c..d1ad255272 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -194,3 +194,13 @@ def get_origin(headers) -> str: return default_origin return scheme + "://" + host + + +def validate_regexes(regexes: List[str]): + """Validate regular expressions, raise HTTPException if invalid""" + for regex in regexes: + try: + re.compile(regex) + except re.error: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=422, detail="invalid_regular_expression") diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 955f8b1769..e66faf6a5d 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -153,6 +153,26 @@ def test_update_config_invalid_format( assert r.status_code == 422 +def test_update_config_invalid_exclude_regex( + crawler_auth_headers, default_org_id, sample_crawl_data +): + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + json={"config": {"exclude": "["}}, + ) + assert r.status_code == 422 + assert r.json()["detail"] == "invalid_regular_expression" + + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + json={"config": {"exclude": ["abc.*", "["]}}, + ) + assert r.status_code == 422 + assert r.json()["detail"] == "invalid_regular_expression" + + def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data): r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", @@ -489,7 +509,9 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id): assert crawler_channel["image"] -def test_exclude_invalid_regex(crawler_auth_headers, default_org_id, sample_crawl_data): +def test_add_crawl_config_invalid_exclude_regex( + crawler_auth_headers, default_org_id, sample_crawl_data +): sample_crawl_data["exclude"] = "[" r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", From fa53c3c51f38f08fd11c0047cb22d176e9aa739c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 23:09:01 -0500 Subject: [PATCH 06/11] Use invalid_regex detail already in use elsewhere --- backend/btrixcloud/utils.py | 2 +- backend/test/test_crawlconfigs.py | 8 ++++---- backend/test/test_run_crawl.py | 2 +- frontend/src/features/crawl-workflows/exclusion-editor.ts | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index d1ad255272..bec9ca0b8e 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -203,4 +203,4 @@ def validate_regexes(regexes: List[str]): re.compile(regex) except re.error: # pylint: disable=raise-missing-from - raise HTTPException(status_code=422, detail="invalid_regular_expression") + raise HTTPException(status_code=422, detail="invalid_regex") diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index e66faf6a5d..0b26fddd56 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -162,7 +162,7 @@ def test_update_config_invalid_exclude_regex( json={"config": {"exclude": "["}}, ) assert r.status_code == 422 - assert r.json()["detail"] == "invalid_regular_expression" + assert r.json()["detail"] == "invalid_regex" r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", @@ -170,7 +170,7 @@ def test_update_config_invalid_exclude_regex( json={"config": {"exclude": ["abc.*", "["]}}, ) assert r.status_code == 422 - assert r.json()["detail"] == "invalid_regular_expression" + assert r.json()["detail"] == "invalid_regex" def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data): @@ -519,7 +519,7 @@ def test_add_crawl_config_invalid_exclude_regex( json=sample_crawl_data, ) assert r.status_code == 422 - assert r.json()["detail"] == "invalid_regular_expression" + assert r.json()["detail"] == "invalid_regex" sample_crawl_data["exclude"] = ["abc.*", "["] r = requests.post( @@ -528,4 +528,4 @@ def test_add_crawl_config_invalid_exclude_regex( json=sample_crawl_data, ) assert r.status_code == 422 - assert r.json()["detail"] == "invalid_regular_expression" + assert r.json()["detail"] == "invalid_regex" diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 9f1e6e6b5e..5494541009 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -154,7 +154,7 @@ def test_add_invalid_exclusion(admin_auth_headers, default_org_id): headers=admin_auth_headers, ) assert r.status_code == 422 - assert r.json()["detail"] == "invalid_regular_expression" + assert r.json()["detail"] == "invalid_regex" def test_remove_exclusion(admin_auth_headers, default_org_id): diff --git a/frontend/src/features/crawl-workflows/exclusion-editor.ts b/frontend/src/features/crawl-workflows/exclusion-editor.ts index 6c4384c29b..a1e93cfdce 100644 --- a/frontend/src/features/crawl-workflows/exclusion-editor.ts +++ b/frontend/src/features/crawl-workflows/exclusion-editor.ts @@ -276,7 +276,7 @@ export class ExclusionEditor extends LiteElement { if (isApiError(e)) { if (e.message === "exclusion_already_exists") { this.exclusionFieldErrorMessage = msg("Exclusion already exists"); - } else if (e.message === "invalid_regular_expression") { + } else if (e.message === "invalid_regex") { this.exclusionFieldErrorMessage = msg("Invalid Regex"); } } else { From 7b7f4294d1962e05ab394cbb43a8705b4e6882d6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 23:10:43 -0500 Subject: [PATCH 07/11] Use 400 status code on exception for consistency --- backend/btrixcloud/utils.py | 2 +- backend/test/test_crawlconfigs.py | 8 ++++---- backend/test/test_run_crawl.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index bec9ca0b8e..83dfbfefb2 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -203,4 +203,4 @@ def validate_regexes(regexes: List[str]): re.compile(regex) except re.error: # pylint: disable=raise-missing-from - raise HTTPException(status_code=422, detail="invalid_regex") + raise HTTPException(status_code=400, detail="invalid_regex") diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 0b26fddd56..4e552e211c 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -161,7 +161,7 @@ def test_update_config_invalid_exclude_regex( headers=crawler_auth_headers, json={"config": {"exclude": "["}}, ) - assert r.status_code == 422 + assert r.status_code == 400 assert r.json()["detail"] == "invalid_regex" r = requests.patch( @@ -169,7 +169,7 @@ def test_update_config_invalid_exclude_regex( headers=crawler_auth_headers, json={"config": {"exclude": ["abc.*", "["]}}, ) - assert r.status_code == 422 + assert r.status_code == 400 assert r.json()["detail"] == "invalid_regex" @@ -518,7 +518,7 @@ def test_add_crawl_config_invalid_exclude_regex( headers=crawler_auth_headers, json=sample_crawl_data, ) - assert r.status_code == 422 + assert r.status_code == 400 assert r.json()["detail"] == "invalid_regex" sample_crawl_data["exclude"] = ["abc.*", "["] @@ -527,5 +527,5 @@ def test_add_crawl_config_invalid_exclude_regex( headers=crawler_auth_headers, json=sample_crawl_data, ) - assert r.status_code == 422 + assert r.status_code == 400 assert r.json()["detail"] == "invalid_regex" diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 5494541009..c3604e9050 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -153,7 +153,7 @@ def test_add_invalid_exclusion(admin_auth_headers, default_org_id): f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[", headers=admin_auth_headers, ) - assert r.status_code == 422 + assert r.status_code == 400 assert r.json()["detail"] == "invalid_regex" From fef55a119709dce3fda18e3547624214a7746706 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 23:14:35 -0500 Subject: [PATCH 08/11] Fix linting --- backend/btrixcloud/crawlconfigs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 51c54b1b42..898427e171 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -189,7 +189,7 @@ async def get_profile_filename( return profile_filename - # pylint: disable=invalid-name + # pylint: disable=invalid-name, too-many-branches async def add_crawl_config( self, config_in: CrawlConfigIn, @@ -323,7 +323,7 @@ def check_attr_changed( async def update_crawl_config( self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig ) -> dict[str, bool | str]: - # pylint: disable=too-many-locals + # pylint: disable=too-many-locals, too-many-branches """Update name, scale, schedule, and/or tags for an existing crawl config""" orig_crawl_config = await self.get_crawl_config(cid, org.id) From 4c958225947ebd3220d639df0cdf42a00625ddd6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 23:22:11 -0500 Subject: [PATCH 09/11] One more pylint disable --- backend/btrixcloud/crawlconfigs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 898427e171..91d424197e 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -323,7 +323,7 @@ def check_attr_changed( async def update_crawl_config( self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig ) -> dict[str, bool | str]: - # pylint: disable=too-many-locals, too-many-branches + # pylint: disable=too-many-locals, too-many-branches, too-many-statements """Update name, scale, schedule, and/or tags for an existing crawl config""" orig_crawl_config = await self.get_crawl_config(cid, org.id) From dd093dff19e3f769d29d84847c606beb66082e44 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 23:23:57 -0500 Subject: [PATCH 10/11] Check if update.config exists first --- backend/btrixcloud/crawlconfigs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 91d424197e..86e0e0c2b8 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -328,7 +328,7 @@ async def update_crawl_config( orig_crawl_config = await self.get_crawl_config(cid, org.id) - if update.config.exclude: + if update.config and update.config.exclude: exclude = update.config.exclude if isinstance(exclude, str): exclude = [exclude] From 17ee4862be41d99bd15de1450da67646b954ff9a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 17 Jan 2025 00:59:19 -0500 Subject: [PATCH 11/11] Fix test --- backend/test/test_crawlconfigs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 4e552e211c..498403a894 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -512,7 +512,7 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id): def test_add_crawl_config_invalid_exclude_regex( crawler_auth_headers, default_org_id, sample_crawl_data ): - sample_crawl_data["exclude"] = "[" + sample_crawl_data["config"]["exclude"] = "[" r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", headers=crawler_auth_headers, @@ -521,7 +521,7 @@ def test_add_crawl_config_invalid_exclude_regex( assert r.status_code == 400 assert r.json()["detail"] == "invalid_regex" - sample_crawl_data["exclude"] = ["abc.*", "["] + sample_crawl_data["config"]["exclude"] = ["abc.*", "["] r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", headers=crawler_auth_headers,