Validate exclusion regex on add #2316

Merged: 11 commits, Jan 23, 2025
Validate exclude regex on crawlconfig update
tw4l committed Jan 17, 2025

commit 4317a1cb15d608378af62b8d24534c24ed19f807 (created on GitHub.com and signed with GitHub's verified signature)
17 changes: 8 additions & 9 deletions backend/btrixcloud/crawlconfigs.py
@@ -43,7 +43,7 @@
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes

 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -219,14 +219,7 @@ async def add_crawl_config(
             exclude = config_in.config.exclude
             if isinstance(exclude, str):
                 exclude = [exclude]
-            for regex in exclude:
-                try:
-                    re.compile(regex)
-                except re.error:
-                    # pylint: disable=raise-missing-from
-                    raise HTTPException(
-                        status_code=422, detail="invalid_regular_expression"
-                    )
+            validate_regexes(exclude)

         now = dt_now()
         crawlconfig = CrawlConfig(
@@ -335,6 +328,12 @@ async def update_crawl_config(

         orig_crawl_config = await self.get_crawl_config(cid, org.id)

+        if update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
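Note: the update path now applies the same normalization as the add path. The API accepts "exclude" as either a single string or a list of strings, so a bare string is wrapped in a list before validation. A minimal sketch of that shared pattern, using a ValueError stand-in since the real validate_regexes (added to btrixcloud/utils.py below) raises an HTTPException:

import re
from typing import List, Union

def normalize_exclude(exclude: Union[str, List[str]]) -> List[str]:
    # API clients may send one regex or a list of regexes
    return [exclude] if isinstance(exclude, str) else exclude

def validate_regexes(regexes: List[str]) -> None:
    # Stand-in for btrixcloud.utils.validate_regexes, which raises
    # HTTPException(status_code=422, detail="invalid_regular_expression")
    for regex in regexes:
        try:
            re.compile(regex)
        except re.error as err:
            raise ValueError("invalid_regular_expression") from err

validate_regexes(normalize_exclude("abc.*"))         # ok
validate_regexes(normalize_exclude(["abc.*", "["]))  # raises ValueError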
9 changes: 2 additions & 7 deletions backend/btrixcloud/crawls.py
@@ -24,6 +24,7 @@
     date_to_str,
     parse_jsonl_error_messages,
     stream_dict_list_as_csv,
+    validate_regexes,
 )
 from .basecrawls import BaseCrawlOps
 from .crawlmanager import CrawlManager
@@ -518,13 +519,7 @@ async def add_or_remove_exclusion(
         for given crawl_id, update config on crawl"""

         if add:
-            try:
-                re.compile(regex)
-            except re.error:
-                # pylint: disable=raise-missing-from
-                raise HTTPException(
-                    status_code=422, detail="invalid_regular_expression"
-                )
+            validate_regexes([regex])

         crawl = await self.get_crawl(crawl_id, org)

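Note: the single user-supplied regex is wrapped in a list before the call. Passing the bare string would still be iterable, but validate_regexes would then compile it one character at a time, wrongly rejecting valid patterns whose individual characters do not compile on their own. A quick illustration:

import re

regex = "a(b)"
re.compile(regex)  # fine: the pattern as a whole is valid

for char in regex:
    # what iterating the bare string would do; raises re.error on "(":
    # "missing ), unterminated subpattern at position 0"
    re.compile(char)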
10 changes: 10 additions & 0 deletions backend/btrixcloud/utils.py
@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
         return default_origin

     return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=422, detail="invalid_regular_expression")
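The diff adds no imports to utils.py, so re, List, and FastAPI's HTTPException are presumably already in scope there. A self-contained sketch of the new helper's behavior:

import re
from typing import List

from fastapi import HTTPException

def validate_regexes(regexes: List[str]):
    """Validate regular expressions, raise HTTPException if invalid"""
    for regex in regexes:
        try:
            re.compile(regex)
        except re.error:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=422, detail="invalid_regular_expression")

validate_regexes(["abc.*", r"https?://.*\.example\.com"])  # passes silently

try:
    validate_regexes(["["])  # "[" is an unterminated character set
except HTTPException as exc:
    print(exc.status_code, exc.detail)  # 422 invalid_regular_expression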
24 changes: 23 additions & 1 deletion backend/test/test_crawlconfigs.py
@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
     assert r.status_code == 422


+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 422
+    assert r.json()["detail"] == "invalid_regular_expression"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 422
+    assert r.json()["detail"] == "invalid_regular_expression"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -489,7 +509,9 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
     assert crawler_channel["image"]


-def test_exclude_invalid_regex(crawler_auth_headers, default_org_id, sample_crawl_data):
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
     sample_crawl_data["exclude"] = "["
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",