diff --git a/README.md b/README.md index c5d4f05e..86bd391d 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ A complete list of supported formats can be found [here](supported-formats). 7. [Remote Files](#remote-files) 8. [Writable Mounting](#writable-mounting) 9. [As a Library](#as-a-library) + 10. [Fsspec Integration](#fsspec-integration) # Installation @@ -574,6 +575,9 @@ Some often-used configuration environment variables are copied here for easier v [Many other](https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations) fsspec-based projects may also work when installed. +This functionality of ratarmount offers a hopefully more-tested and out-of-the-box experience over the experimental [fsspec.fuse](https://filesystem-spec.readthedocs.io/en/latest/features.html#mount-anything-with-fuse) implementation. +And, it also works in conjunction with the other features of ratarmount such as union mounting and recursive mounting. + # Writable Mounting @@ -642,3 +646,37 @@ Here is an example for applying modifications to a writable mount and then commi Ratarmount can also be used as a library. Using [ratarmountcore](core/), files inside archives can be accessed directly from Python code without requiring FUSE. For a more detailed description, see the [ratarmountcore readme here](core/). + + +## Fsspec integration + +To use all fsspec features, either install via `pip install ratarmount[fsspec]` or `pip install ratarmount[fsspec]`. +It should also suffice to simply `pip install fsspec` if ratarmountcore is already installed. +The optional [fsspec](https://github.com/fsspec/filesystem_spec) integration is threefold: + + 1. Files can be specified on the command line via URLs pointing to remotes as explained in [this section](#remote-files). + 2. A `ratarmountcore.MountSource` wrapping fsspec `AbstractFileSystem` [implementation](https://github.com/mxmlnkn/ratarmount/blob/master/core/ratarmountcore/SQLiteIndexedTarFsspec.py) has been added. + A specialized `SQLiteIndexedTarFileSystem` as a more performant and direct replacement for `fsspec.implementations.TarFileSystem` has also been added. + ```python3 + from ratarmountcore.SQLiteIndexedTarFsspec import SQLiteIndexedTarFileSystem as ratarfs + fs = ratarfs("tests/single-file.tar") + print("Files in root:", fs.ls("/", detail=False)) + print("Contents of /bar:", fs.cat("/bar")) + ``` + 3. During installation ratarmountcore registers the `ratar://` protocol [with fsspec](https://filesystem-spec.readthedocs.io/en/latest/developer.html#implementing-a-backend) via an [entrypoint](https://setuptools.pypa.io/en/latest/userguide/quickstart.html#entry-points-and-automatic-script-creation) group. + This enables usages with `fsspec.open`. + The fsspec [URL chaining](https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining) feature must be used in order for this to be useful. + Example for opening the file `bar`, which is contained inside the file `tests/single-file.tar.gz` with ratarmountcore: + ```python3 + import fsspec + with fsspec.open("ratar://bar::file://tests/single-file.tar.gz") as file: + print("Contents of file bar:", file.read()) + ``` + This also [works with pandas](https://pandas.pydata.org/docs/user_guide/io.html#reading-writing-remote-files): + ```python3 + import fsspec + import pandas as pd + with fsspec.open("ratar://bar::file://tests/single-file.tar.gz", compression=None) as file: + print("Contents of file bar:", file.read()) + ``` + The `compression=None` argument is currently necessary because of [this](https://github.com/pandas-dev/pandas/issues/60028) Pandas bug. diff --git a/core/pyproject.toml b/core/pyproject.toml index 57ab1d83..e4c0b35d 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -96,7 +96,8 @@ full = [ bzip2 = ["rapidgzip >= 0.13.1"] git = ["pygit2"] gzip = ["indexed_gzip >= 1.6.3, < 2.0"] -fsspec = [ +fsspec = ["fsspec"] +fsspec-backends = [ # Copy-pasted from fsspec[full] list. Some were excluded because they are too unproportionally large. "requests", "aiohttp", @@ -163,6 +164,9 @@ full-ssh = [ #"sshfs[pywin32]", # Only Windows? asyncssh has no platform specifier though... ] +[project.entry-points."fsspec.specs"] +ratar = "ratarmountcore.SQLiteIndexedTarFsspec.SQLiteIndexedTarFileSystem" + [tool.setuptools] license-files = [ "LICENSE", diff --git a/core/ratarmountcore/SQLiteIndex.py b/core/ratarmountcore/SQLiteIndex.py index 52c628d4..5960eeab 100644 --- a/core/ratarmountcore/SQLiteIndex.py +++ b/core/ratarmountcore/SQLiteIndex.py @@ -1207,7 +1207,8 @@ def _loadGzipIndex(self, fileObject: IO[bytes], table: str) -> bool: # calculated. For C, there actually is an incremental blob reading interface but not for Python: # https://www.sqlite.org/c3ref/blob_open.html # https://bugs.python.org/issue24905 - print(f"Loading gzip block offsets took {time.time() - t0:.2f}s") + if self.printDebug >= 3: + print(f"[Info] Loading gzip block offsets took {time.time() - t0:.2f}s") return True diff --git a/core/ratarmountcore/SQLiteIndexedTarFsspec.py b/core/ratarmountcore/SQLiteIndexedTarFsspec.py index fd9e8e8b..1fa7f792 100644 --- a/core/ratarmountcore/SQLiteIndexedTarFsspec.py +++ b/core/ratarmountcore/SQLiteIndexedTarFsspec.py @@ -1,15 +1,18 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +# pylint: disable=abstract-method + import stat -from typing import Optional -from fsspec.spec import AbstractFileSystem +import fsspec from .MountSource import MountSource +from .SQLiteIndexedTar import SQLiteIndexedTar +from .utils import overrides -class MountSourceFileSystem(AbstractFileSystem): +class MountSourceFileSystem(fsspec.spec.AbstractFileSystem): """A thin adaptor from the MountSource interface to the fsspec AbstractFileSystem interface.""" cachable = False @@ -24,8 +27,6 @@ def _stripProtocol(cls, path): @staticmethod def _fileInfoToDict(name, fileInfo): - # obj.name and obj.filemode are None for the root tree! - is_dir = isinstance(obj, pygit2.Tree) return { "type": "directory" if stat.S_ISDIR(fileInfo.mode) else "file", "name": name, @@ -33,7 +34,8 @@ def _fileInfoToDict(name, fileInfo): "size": fileInfo.size, } - def ls(self, path, detail=True, ref=None, **kwargs): + @overrides(fsspec.spec.AbstractFileSystem) + def ls(self, path, detail=True, **kwargs): strippedPath = self._stripProtocol(path) if detail: result = self.mountSource.listDir(strippedPath) @@ -48,29 +50,78 @@ def ls(self, path, detail=True, ref=None, **kwargs): raise FileNotFoundError(path) return list(result.keys()) if isinstance(result, dict) else result - def info(self, path, ref=None, **kwargs): + @overrides(fsspec.spec.AbstractFileSystem) + def info(self, path, **kwargs): result = self.mountSource.getFileInfo(self._stripProtocol(path)) if result is None: raise FileNotFoundError(path) - return self._fileInfoToDict(result) + return self._fileInfoToDict(path, result) + @overrides(fsspec.spec.AbstractFileSystem) def _open( self, path, mode="rb", block_size=None, + autocommit=True, + cache_options=None, **kwargs, ): if mode != "rb": raise ValueError("Only binary reading is supported!") - return self.mountSource(self._stripProtocol(path)) + fileInfo = self.mountSource.getFileInfo(self._stripProtocol(path)) + if fileInfo is None: + raise FileNotFoundError(path) + return self.mountSource.open(fileInfo, buffering=block_size if block_size else -1) -class SQLiteIndexedTarFileSystem(AbstractFileSystem): - """Browse the files of a (compressed) TAR archive quickly.""" +class SQLiteIndexedTarFileSystem(MountSourceFileSystem): + """ + Browse the files of a (compressed) TAR archive quickly. + + This is a more optimized alternative to fsspec.implementations.TarFileSystem. + """ protocol = "ratar" - def __init__(self, tarFileName: Optional[str] = None, fileObject: Optional[IO[bytes]] = None, **kwargs): + def __init__( + self, + # It must be called "fo" for URL chaining to work! + # https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining + fo=None, + *, # force all parameters after to be keyword-only + target_options=None, + target_protocol=None, + **kwargs, + ): """Refer to SQLiteIndexedTar for all supported arguments and options.""" - super().__init__(self, SQLiteIndexedTar(tarFileName, fileObject, **kwargs)) + + options = kwargs.copy() + + self._openFile = None + if isinstance(fo, str): + # Implement URL chaining such as when calling fsspec.open("ratar://bar::file://single-file.tar"). + if target_protocol: + self._openFile = fsspec.open(fo, protocol=target_protocol, **target_options) + # Set the TAR file name so that the index can be found/stored accordingly. + if target_protocol == 'file': + options['tarFileName'] = fo + if 'indexFilePath' not in options: + options['indexFilePath'] = fo + ".index.sqlite" + if 'writeIndex' not in options: + options['writeIndex'] = True + if isinstance(self._openFile, fsspec.core.OpenFiles): + self._openFile = self._openFile[0] + fo = self._openFile.open() + else: + options['tarFileName'] = fo + fo = None + + if fo: + options['fileObject'] = fo + + super().__init__(SQLiteIndexedTar(**options)) + + +# Only in case the entry point hooks in the pyproject.toml are not working for some reason. +fsspec.register_implementation("ratar", SQLiteIndexedTarFileSystem, clobber=True) diff --git a/core/tests/test_SQLiteIndexedTarFsspec.py b/core/tests/test_SQLiteIndexedTarFsspec.py new file mode 100644 index 00000000..f456575a --- /dev/null +++ b/core/tests/test_SQLiteIndexedTarFsspec.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# pylint: disable=wrong-import-order +# pylint: disable=wrong-import-position +# pylint: disable=protected-access + +import io +import os +import shutil +import sys +import tarfile +import tempfile + +import fsspec + +try: + import pandas as pd +except ImportError: + pd = None # type: ignore + + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from ratarmountcore.SQLiteIndexedTarFsspec import SQLiteIndexedTarFileSystem as ratarfs # noqa: E402 + + +def findTestFile(relativePathOrName): + for i in range(3): + path = os.path.sep.join([".."] * i + ["tests", relativePathOrName]) + if os.path.exists(path): + return path + return relativePathOrName + + +def test_fileSystem(): + fs = ratarfs(findTestFile('single-file.tar.gz')) + + assert 'bar' in fs.ls("/", detail=False) + assert 'bar' in [info['name'] for info in fs.ls("/", detail=True)] + + assert not fs.isfile("/") + assert fs.isdir("/") + assert fs.exists("/") + + assert fs.isfile("/bar") + assert not fs.isdir("/bar") + assert not fs.exists("/bar2") + + assert fs.cat("/bar") == b"foo\n" + assert fs.cat("bar") == b"foo\n" + + with fs.open("bar") as file: + assert file.read() == b"foo\n" + + +def test_URLContextManager(): + with fsspec.open("ratar://bar::file://" + findTestFile('single-file.tar.gz')) as file: + assert file.read() == b"foo\n" + + +def test_URL(): + openFile = fsspec.open("ratar://bar::file://" + findTestFile('single-file.tar.gz')) + with openFile as file: + assert file.read() == b"foo\n" + + +def test_pandas(): + with tempfile.TemporaryDirectory(suffix=".test.ratarmount") as folderPath: + oldPath = os.getcwd() + try: + with open("test.csv", "wt") as file: + file.write("1,2\n3,4") + with tarfile.open("test-csv.tar", "w") as archive: + archive.add("test.csv") + + # Pandas seems + data = pd.read_csv("tar://test.csv::file://test-csv.tar", header=None) + assert data.iloc[0, 1] == 2 + finally: + os.chdir(oldPath) + + +def test_URLRapidgzip(): + # I had problems with resource deallocation! + # For Rapidgzip it becomes important because of the background threads. + with tempfile.TemporaryDirectory(suffix=".test.ratarmount") as folderPath: + contents = os.urandom(96 * 1024 * 1024) + + tarPath = os.path.join(folderPath, "random-data.tar.gz") + with tarfile.open(name=tarPath, mode="w:gz") as tarArchive: + # Must create a sufficiently large .tar.gz so that rapidgzip is actually used. + # In the future this "has multiple chunks" rapidgzip test is to be removed and + # this whole test becomes redundant. + tinfo = tarfile.TarInfo("random-data") + tinfo.size = len(contents) + tarArchive.addfile(tinfo, io.BytesIO(contents)) + + # Only global variables trigger the "Detected Python finalization from running rapidgzip thread." bug. + # I am not sure why. Probably, because it gets garbage-collected later. + global openFile + openFile = fsspec.open("ratar://random-data::file://" + tarPath) + with openFile as file: + assert file.read() == contents + + # This is still some step the user has to do, but it cannot be avoided. + # It might be helpful if fsspec had some kind of better resource management for filesystems though. + del openFile diff --git a/pyproject.toml b/pyproject.toml index e8046cc8..702b5e50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ xz = ["ratarmountcore[xz]"] zip = ["ratarmountcore[zip]"] zstd = ["ratarmountcore[zstd]"] squashfs = ["ratarmountcore[squashfs]"] -fsspec = ["ratarmountcore[fsspec]"] +fsspec = ["ratarmountcore[fsspec-backends]"] [project.scripts] ratarmount = "ratarmount:cli"