Skip to content

Commit

Permalink
fix: Make URI parsing more lenient (#316)
Browse files Browse the repository at this point in the history
The URI validation logic was too strict in some cases, causing valid
lakeFS ref expressions with relative (`~N`, `^N`) or HEAD (`@`) suffixes
to be rejected.

This commit refactors the relevant regexp to accept these URIs and also
makes the relative ref expression parsing more explicit in the regex.

Issue: #314
  • Loading branch information
AdrianoKF authored Jan 22, 2025
1 parent 6fb9701 commit 1bf0d54
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 9 deletions.
17 changes: 9 additions & 8 deletions src/lakefs_spec/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ def md5_checksum(lpath: str | os.PathLike[str], blocksize: int = 2**22) -> str:
return file_hash.hexdigest()


_uri_parts = {
"protocol": r"^(?:lakefs://)?", # leading lakefs:// protocol (optional)
"repository": r"(?P<repository>[a-z0-9][a-z0-9\-]{2,62})/",
"ref expression": r"(?P<ref>\w[\w\-.]*(([~\^]\d*)*|@)?)/", # ref name with optional @, ~N, ^N suffixes
"resource": r"(?P<resource>.*)",
}


def parse(path: str) -> tuple[str, str, str]:
"""
Parses a lakeFS URI in the form ``lakefs://<repo>/<ref>/<resource>``.
Expand All @@ -118,16 +126,9 @@ def parse(path: str) -> tuple[str, str, str]:
If the path does not conform to the lakeFS URI format.
"""

uri_parts = {
"protocol": r"^(?:lakefs://)?", # leading lakefs:// protocol (optional)
"repository": r"(?P<repository>[a-z0-9][a-z0-9\-]{2,62})/",
"ref expression": r"(?P<ref>\w[\w\-.^~]*)/",
"resource": r"(?P<resource>.*)",
}

groups: dict[str, str] = {}
start = 0
for group, regex in uri_parts.items():
for group, regex in _uri_parts.items():
# we parse iteratively to improve the error message for the user if an invalid URI is given.
# by going front to back and parsing each part successively, we obtain the current path segment,
# and print it out to the user if it does not conform to our assumption of the lakeFS URI spec.
Expand Down
31 changes: 31 additions & 0 deletions tests/regression/test_gh_314.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from lakefs import Branch, Repository

from lakefs_spec.spec import LakeFSFileSystem


def test_gh_314(
fs: LakeFSFileSystem,
repository: Repository,
temp_branch: Branch,
) -> None:
"""
Regression test for GitHub issue 314: Enable `@` and `~N` syntax
https://github.com/aai-institute/lakefs-spec/issues/314
"""

prefix = f"lakefs://{repository.id}/{temp_branch.id}"
datapath = f"{prefix}/data.txt"

# add new file, and immediately commit.
fs.pipe(datapath, b"data1")
temp_branch.commit(message="Add data.txt")

fs.pipe(datapath, b"data2")
# Reading the committed version of the file should yield the correct data.
committed_head_path = f"{prefix}@/data.txt"
assert fs.read_text(committed_head_path) == "data1"

# Reading a relative commit should yield the correct data.
temp_branch.commit(message="Update data.txt")
relative_commit_path = f"{prefix}~1/data.txt"
assert fs.read_text(relative_commit_path) == "data1"
45 changes: 44 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re

import pytest

from lakefs_spec.util import _batched
from lakefs_spec.util import _batched, _uri_parts


def test_batched_empty_iterable():
Expand All @@ -26,3 +28,44 @@ def test_batched_batch_size_greater_than_iterable():
def test_batched_invalid_batch_size():
with pytest.raises(ValueError, match="n must be at least one"):
list(_batched([1, 2, 3], 0))


class TestLakeFSUriPartRegexes:
@pytest.mark.parametrize(
"repo_name, valid",
[
("my-repo", True),
("@@repo", False),
("", False),
("a", False),
("a" * 63, True),
("a" * 64, False),
],
)
def test_repository(self, repo_name: str, valid: bool) -> None:
result = re.match(_uri_parts["repository"], repo_name + "/")
if valid:
assert result is not None
else:
assert result is None

@pytest.mark.parametrize(
"refexp, valid",
[
("", False),
("main", True),
("main@", True),
("main~", True),
("main^", True),
("main^2", True),
("main^^^", True),
("main^1^1", True),
("main^1~1", True),
],
)
def test_ref_expression(self, refexp: str, valid: bool) -> None:
result = re.match(_uri_parts["ref expression"], refexp + "/")
if valid:
assert result is not None
else:
assert result is None

0 comments on commit 1bf0d54

Please sign in to comment.