
Validate exclusion regex on add #2316

Merged · 11 commits · Jan 23, 2025
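The failure mode this PR guards against: an exclusion pattern that does not compile would previously be saved and only blow up later. Python's `re.compile` raises `re.error` on malformed patterns such as the unterminated character class `[` used in the tests below. A minimal sketch of the rejection being added:

```python
import re

try:
    re.compile("[")  # unterminated character set, the same bad pattern the tests use
except re.error as exc:
    print(f"rejected: {exc}")  # e.g. "unterminated character set at position 0"
```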
18 changes: 15 additions & 3 deletions backend/btrixcloud/crawlconfigs.py
@@ -43,7 +43,7 @@
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -189,7 +189,7 @@ async def get_profile_filename(

         return profile_filename
 
-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, too-many-branches
     async def add_crawl_config(
         self,
         config_in: CrawlConfigIn,
@@ -215,6 +215,12 @@ async def add_crawl_config(
         if not self.can_org_use_proxy(org, config_in.proxyId):
             raise HTTPException(status_code=404, detail="proxy_not_found")
 
+        if config_in.config.exclude:
+            exclude = config_in.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -317,11 +323,17 @@ def check_attr_changed(
     async def update_crawl_config(
         self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
     ) -> dict[str, bool | str]:
-        # pylint: disable=too-many-locals
+        # pylint: disable=too-many-locals, too-many-branches, too-many-statements
         """Update name, scale, schedule, and/or tags for an existing crawl config"""
 
         orig_crawl_config = await self.get_crawl_config(cid, org.id)
 
+        if update.config and update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
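Both hunks above share one normalization step: `config.exclude` may arrive as either a single pattern or a list of patterns, so it is coerced to a list before validation. A standalone sketch of that coercion (the helper name is mine, not from the PR):

```python
from typing import List, Union


def as_regex_list(exclude: Union[str, List[str]]) -> List[str]:
    """Coerce a single exclusion pattern to a one-element list,
    mirroring the isinstance(exclude, str) check in both hunks above."""
    return [exclude] if isinstance(exclude, str) else exclude


assert as_regex_list("[") == ["["]
assert as_regex_list(["a.*", "b.*"]) == ["a.*", "b.*"]
```

With this shape, `validate_regexes` always receives a list, whichever form the client sent.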
4 changes: 4 additions & 0 deletions backend/btrixcloud/crawls.py
@@ -24,6 +24,7 @@
     date_to_str,
     parse_jsonl_error_messages,
     stream_dict_list_as_csv,
+    validate_regexes,
 )
 from .basecrawls import BaseCrawlOps
 from .crawlmanager import CrawlManager
@@ -517,6 +518,9 @@ async def add_or_remove_exclusion(
"""add new exclusion to config or remove exclusion from config
for given crawl_id, update config on crawl"""

if add:
validate_regexes([regex])

crawl = await self.get_crawl(crawl_id, org)

if crawl.state not in RUNNING_AND_WAITING_STATES:
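One behavioral detail worth noting: the new guard runs only when `add` is true, so removing an exclusion is never validated. The diff does not state the rationale, but the effect is that a pattern saved before this check existed stays removable even if it no longer compiles. A minimal sketch of that asymmetry, using a stand-in for the `validate_regexes` helper added in `utils.py` below:

```python
import re

from fastapi import HTTPException  # assumed import, matching the backend's usage


def validate_regexes_stub(regexes):
    """Stand-in for the validate_regexes helper added in utils.py below."""
    for pattern in regexes:
        try:
            re.compile(pattern)
        except re.error:
            raise HTTPException(status_code=400, detail="invalid_regex") from None


def add_or_remove_exclusion_sketch(regex: str, add: bool) -> None:
    """Simplified: the real method also patches the running crawl's config."""
    if add:
        validate_regexes_stub([regex])
    # the removal path is not validated, so a stale pattern that no
    # longer compiles can still be removed
```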
10 changes: 10 additions & 0 deletions backend/btrixcloud/utils.py
@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
         return default_origin
 
     return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail="invalid_regex")
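A self-contained sketch of the new helper in action, assuming `HTTPException` is FastAPI's (which the `status_code`/`detail` keywords suggest); the function body is copied from the hunk above:

```python
import re
from typing import List

from fastapi import HTTPException  # assumed to be the module's HTTPException


def validate_regexes(regexes: List[str]):
    """Validate regular expressions, raise HTTPException if invalid"""
    for regex in regexes:
        try:
            re.compile(regex)
        except re.error:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=400, detail="invalid_regex")


validate_regexes(["abc.*"])  # valid patterns pass silently

try:
    validate_regexes(["abc.*", "["])  # the second pattern fails to compile
except HTTPException as exc:
    print(exc.status_code, exc.detail)  # 400 invalid_regex
```

Raising on the first bad pattern is enough here, since the API reports a single `invalid_regex` error rather than a per-pattern list.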
42 changes: 42 additions & 0 deletions backend/test/test_crawlconfigs.py
@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
     assert r.status_code == 422
 
 
+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -487,3 +507,25 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
     for crawler_channel in crawler_channels:
         assert crawler_channel["id"]
         assert crawler_channel["image"]
+
+
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["exclude"] = "["
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
9 changes: 9 additions & 0 deletions backend/test/test_run_crawl.py
@@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id):
assert r.json()["success"] == True


def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
headers=admin_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"


def test_remove_exclusion(admin_auth_headers, default_org_id):
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",