diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 8d231733c7..86e0e0c2b8 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -43,7 +43,7 @@
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -189,7 +189,7 @@ async def get_profile_filename(
 
         return profile_filename
 
-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, too-many-branches
     async def add_crawl_config(
         self,
         config_in: CrawlConfigIn,
@@ -215,6 +215,12 @@ async def add_crawl_config(
             if not self.can_org_use_proxy(org, config_in.proxyId):
                 raise HTTPException(status_code=404, detail="proxy_not_found")
 
+        if config_in.config.exclude:
+            exclude = config_in.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -317,11 +323,17 @@ def check_attr_changed(
     async def update_crawl_config(
         self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
     ) -> dict[str, bool | str]:
-        # pylint: disable=too-many-locals
+        # pylint: disable=too-many-locals, too-many-branches, too-many-statements
         """Update name, scale, schedule, and/or tags for an existing crawl config"""
 
         orig_crawl_config = await self.get_crawl_config(cid, org.id)
 
+        if update.config and update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 539c408ee6..92e4c43f85 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -24,6 +24,7 @@
     date_to_str,
     parse_jsonl_error_messages,
     stream_dict_list_as_csv,
+    validate_regexes,
 )
 from .basecrawls import BaseCrawlOps
 from .crawlmanager import CrawlManager
@@ -517,6 +518,9 @@ async def add_or_remove_exclusion(
         """add new exclusion to config or remove exclusion from config
         for given crawl_id, update config on crawl"""
 
+        if add:
+            validate_regexes([regex])
+
         crawl = await self.get_crawl(crawl_id, org)
 
         if crawl.state not in RUNNING_AND_WAITING_STATES:
diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index 8a555a4c7c..83dfbfefb2 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
         return default_origin
 
     return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail="invalid_regex")
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index 967f32db33..498403a894 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
     assert r.status_code == 422
 
 
+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -487,3 +507,25 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
     for crawler_channel in crawler_channels:
         assert crawler_channel["id"]
         assert crawler_channel["image"]
+
+
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["exclude"] = "["
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 36f0237f7d..c3604e9050 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id):
     assert r.json()["success"] == True
 
 
+def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_remove_exclusion(admin_auth_headers, default_org_id):
     r = requests.delete(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",
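
For reference, a minimal standalone sketch of the validation behavior these hunks add, runnable outside the API (assumptions: FastAPI is installed, the helper body is restated verbatim from the utils.py hunk so the snippet is self-contained, and the example patterns and print call are illustrative only):

    # Standalone sketch, not part of the diff above.
    import re

    from fastapi import HTTPException


    def validate_regexes(regexes):
        """Validate regular expressions, raise HTTPException if invalid"""
        for regex in regexes:
            try:
                re.compile(regex)
            except re.error:
                # pylint: disable=raise-missing-from
                raise HTTPException(status_code=400, detail="invalid_regex")


    validate_regexes(["abc.*"])  # all patterns compile; no exception raised

    try:
        validate_regexes(["abc.*", "["])  # "[" is an unterminated character set
    except HTTPException as exc:
        print(exc.status_code, exc.detail)  # 400 invalid_regex

The call sites in crawlconfigs.py and crawls.py let this exception propagate, which is what the new tests assert as a 400 response with detail "invalid_regex".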