From 952cd981b9b8370a4bfa231a7c5c528184e92fcf Mon Sep 17 00:00:00 2001 From: Maximilian Knespel Date: Tue, 15 Oct 2024 17:47:55 +0200 Subject: [PATCH] Speed up git backend (#1712) --- fsspec/implementations/git.py | 66 ++++++++++-------------- fsspec/implementations/tests/test_git.py | 33 ++++++++++++ fsspec/spec.py | 2 +- 3 files changed, 61 insertions(+), 40 deletions(-) diff --git a/fsspec/implementations/git.py b/fsspec/implementations/git.py index fde671b8c..7b9d3539a 100644 --- a/fsspec/implementations/git.py +++ b/fsspec/implementations/git.py @@ -55,6 +55,8 @@ def _path_to_object(self, path, ref): tree = comm.tree for part in parts: if part and isinstance(tree, pygit2.Tree): + if part not in tree: + raise FileNotFoundError(path) tree = tree[part] return tree @@ -69,46 +71,32 @@ def _get_kwargs_from_urls(path): out["ref"], path = path.split("@", 1) return out + @staticmethod + def _object_to_info(obj, path=None): + # obj.name and obj.filemode are None for the root tree! + is_dir = isinstance(obj, pygit2.Tree) + return { + "type": "directory" if is_dir else "file", + "name": ( + "/".join([path, obj.name or ""]).lstrip("/") if path else obj.name + ), + "hex": str(obj.id), + "mode": "100644" if obj.filemode is None else f"{obj.filemode:o}", + "size": 0 if is_dir else obj.size, + } + def ls(self, path, detail=True, ref=None, **kwargs): - path = self._strip_protocol(path) - tree = self._path_to_object(path, ref) - if isinstance(tree, pygit2.Tree): - out = [] - for obj in tree: - if isinstance(obj, pygit2.Tree): - out.append( - { - "type": "directory", - "name": "/".join([path, obj.name]).lstrip("/"), - "hex": str(obj.id), - "mode": f"{obj.filemode:o}", - "size": 0, - } - ) - else: - out.append( - { - "type": "file", - "name": "/".join([path, obj.name]).lstrip("/"), - "hex": str(obj.id), - "mode": f"{obj.filemode:o}", - "size": obj.size, - } - ) - else: - obj = tree - out = [ - { - "type": "file", - "name": obj.name, - "hex": str(obj.id), - "mode": f"{obj.filemode:o}", - "size": obj.size, - } - ] - if detail: - return out - return [o["name"] for o in out] + tree = self._path_to_object(self._strip_protocol(path), ref) + return [ + GitFileSystem._object_to_info(obj, path) + if detail + else GitFileSystem._object_to_info(obj, path)["name"] + for obj in (tree if isinstance(tree, pygit2.Tree) else [tree]) + ] + + def info(self, path, ref=None, **kwargs): + tree = self._path_to_object(self._strip_protocol(path), ref) + return GitFileSystem._object_to_info(tree, path) def ukey(self, path, ref=None): return self.info(path, ref=ref)["hex"] diff --git a/fsspec/implementations/tests/test_git.py b/fsspec/implementations/tests/test_git.py index ffa7b47d9..2aeb544a1 100644 --- a/fsspec/implementations/tests/test_git.py +++ b/fsspec/implementations/tests/test_git.py @@ -61,6 +61,39 @@ def test_refs(repo): assert f.read() == b"data3" +def _check_FileNotFoundError(f, *args, **kwargs): + with pytest.raises(FileNotFoundError): + f(*args, **kwargs) + + +def test_file_existence_checks(repo): + d, sha = repo + + fs, _ = fsspec.url_to_fs(f"git://{d}:abranch@") + + assert fs.lexists("inner") + assert fs.exists("inner") + assert fs.isdir("inner") + assert fs.info("inner") + assert fs.ls("inner") + + assert fs.lexists("inner/file1") + assert fs.exists("inner/file1") + assert fs.info("inner/file1") + assert fs.ls("inner/file1") + + assert not fs.lexists("non-existing-file") + assert not fs.exists("non-existing-file") + + assert not fs.isfile("non-existing-file") + assert not fs.isdir("non-existing-file") + + _check_FileNotFoundError(fs.info, "non-existing-file") + _check_FileNotFoundError(fs.size, "non-existing-file") + _check_FileNotFoundError(fs.ls, "non-existing-file") + _check_FileNotFoundError(fs.open, "non-existing-file") + + def test_url(repo): d, sha = repo fs, _, paths = fsspec.core.get_fs_token_paths(f"git://file1::file://{d}") diff --git a/fsspec/spec.py b/fsspec/spec.py index e67d280c1..8284366ba 100644 --- a/fsspec/spec.py +++ b/fsspec/spec.py @@ -648,7 +648,7 @@ def info(self, path, **kwargs): Returns a single dictionary, with exactly the same information as ``ls`` would with ``detail=True``. - The default implementation should calls ls and could be overridden by a + The default implementation calls ls and could be overridden by a shortcut. kwargs are passed on to ```ls()``. Some file systems might not be able to measure the file's size, in