diff --git a/development/404.html b/development/404.html index f4ac8d49..3724d987 100644 --- a/development/404.html +++ b/development/404.html @@ -101,21 +101,55 @@ - + + + + + +
-
diff --git a/development/CONTRIBUTING/index.html b/development/CONTRIBUTING/index.html index 6a5bc836..d9c53946 100644 --- a/development/CONTRIBUTING/index.html +++ b/development/CONTRIBUTING/index.html @@ -110,21 +110,55 @@ - + + + + + +
-
diff --git a/development/_theme_overrides/partials/header.html b/development/_theme_overrides/partials/header.html new file mode 100644 index 00000000..fa74eebf --- /dev/null +++ b/development/_theme_overrides/partials/header.html @@ -0,0 +1,117 @@ + + + +{% set class = "md-header" %} +{% if "navigation.tabs.sticky" in features %} + {% set class = class ~ " md-header--shadow md-header--lifted" %} +{% elif "navigation.tabs" not in features %} + {% set class = class ~ " md-header--shadow" %} +{% endif %} + + +
+ + + + {% if "navigation.tabs.sticky" in features %} + {% if "navigation.tabs" in features %} + {% include "partials/tabs.html" %} + {% endif %} + {% endif %} +
diff --git a/development/guides/configuration/index.html b/development/guides/configuration/index.html index 00048400..47cef167 100644 --- a/development/guides/configuration/index.html +++ b/development/guides/configuration/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/guides/filesystem-usage/index.html b/development/guides/filesystem-usage/index.html index fa09e5e9..0c78b524 100644 --- a/development/guides/filesystem-usage/index.html +++ b/development/guides/filesystem-usage/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/guides/index.html b/development/guides/index.html index edeceda1..d034086d 100644 --- a/development/guides/index.html +++ b/development/guides/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/guides/integrations/index.html b/development/guides/integrations/index.html index a0713739..09f7b1b0 100644 --- a/development/guides/integrations/index.html +++ b/development/guides/integrations/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/guides/transactions/index.html b/development/guides/transactions/index.html index cdcb56e5..1687fbfb 100644 --- a/development/guides/transactions/index.html +++ b/development/guides/transactions/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/index.html b/development/index.html index 68f96d2e..8d881751 100644 --- a/development/index.html +++ b/development/index.html @@ -110,21 +110,55 @@ - + + + + + +
-
diff --git a/development/quickstart/index.html b/development/quickstart/index.html index dc84ea20..86d79def 100644 --- a/development/quickstart/index.html +++ b/development/quickstart/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/reference/SUMMARY/index.html b/development/reference/SUMMARY/index.html index 6d18f6ac..4523662f 100644 --- a/development/reference/SUMMARY/index.html +++ b/development/reference/SUMMARY/index.html @@ -103,21 +103,55 @@ - + + + + + +
-
@@ -775,7 +823,7 @@

SUMMARY

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/errors/index.html b/development/reference/lakefs_spec/errors/index.html index c4ed7856..1d6e0c6b 100644 --- a/development/reference/lakefs_spec/errors/index.html +++ b/development/reference/lakefs_spec/errors/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -1129,7 +1177,7 @@

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/index.html b/development/reference/lakefs_spec/index.html index 2d07a5e2..afbd50e3 100644 --- a/development/reference/lakefs_spec/index.html +++ b/development/reference/lakefs_spec/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -832,7 +880,7 @@

lakefs_spec

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/spec/index.html b/development/reference/lakefs_spec/spec/index.html index d6a4cf0d..4367daf3 100644 --- a/development/reference/lakefs_spec/spec/index.html +++ b/development/reference/lakefs_spec/spec/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -5580,7 +5628,7 @@

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/transaction/index.html b/development/reference/lakefs_spec/transaction/index.html index d9b94eca..aac1088b 100644 --- a/development/reference/lakefs_spec/transaction/index.html +++ b/development/reference/lakefs_spec/transaction/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -2224,7 +2272,7 @@

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/util/index.html b/development/reference/lakefs_spec/util/index.html index 8b5ebcd6..beb631be 100644 --- a/development/reference/lakefs_spec/util/index.html +++ b/development/reference/lakefs_spec/util/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -1407,7 +1455,7 @@

- 2024-02-21 + 2024-02-28 diff --git a/development/search/search_index.json b/development/search/search_index.json index 042d1687..5fb47950 100644 --- a/development/search/search_index.json +++ b/development/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"

Welcome to lakeFS-spec, a filesystem-spec backend implementation for the lakeFS data lake. Our primary goal is to streamline versioned data operations in lakeFS, enabling seamless integration with popular data science tools such as Pandas, Polars, and DuckDB directly from Python.

Highlights:

  • Simple repository operations in lakeFS
  • Easy access to underlying storage and versioning operations
  • Seamless integration with the fsspec ecosystem
  • Directly access lakeFS objects from popular data science libraries (including Pandas, Polars, DuckDB, PyArrow) with minimal code
  • Transaction support for reliable data version control
  • Smart data transfers through client-side caching (up-/download)
  • Automatic configuration discovery

Early Adopters

We are seeking early adopters who would like to actively participate in our feedback process and shape the future of the library. If you are interested in using the library and want to get in touch with us, please reach out via GitHub Discussions.

Quickstart

Step-by-step installation and first operations

Tutorials

In-depth tutorials on using lakeFS-spec

API Reference

Full documentation of the Python API

User Guide

Solving specific tasks with lakeFS-spec

Contributing

How to contribute to the project

"},{"location":"CONTRIBUTING/","title":"Contributing to lakeFS-spec","text":"

Thank you for your interest in contributing to this project!

We appreciate issue reports, pull requests for code and documentation, as well as any project-related communication through GitHub Discussions.

"},{"location":"CONTRIBUTING/#getting-started","title":"Getting Started","text":"

To get started with development, you can follow these steps:

  1. Clone this repository:

    git clone https://github.com/aai-institute/lakefs-spec.git\n
  2. Navigate to the directory and install the development dependencies into a virtual environment:

    cd lakefs-spec\npython3 -m venv venv --system-site-packages\nsource venv/bin/activate\npython -m pip install -r requirements-dev.txt\npython -m pip install -e . --no-deps\n
  3. After making your changes, verify they adhere to our Python code style by running pre-commit:

    pre-commit run --all-files\n

    You can also set up Git hooks through pre-commit to perform these checks automatically:

    pre-commit install\n
  4. To run the tests against an ephemeral lakeFS instance, simply run pytest:

    pytest\n

    To spin up a local lakeFS instance quickly for testing, you can use the Docker Compose file bundled with this repository:

    docker-compose -f hack/docker-compose.yml up\n
"},{"location":"CONTRIBUTING/#updating-dependencies","title":"Updating dependencies","text":"

Dependencies should stay locked for as long as possible, ideally for a whole release. If you have to update a dependency during development, you should do the following:

  1. If it is a core dependency needed for the package, add it to the dependencies section in the pyproject.toml.
  2. In case of a development dependency, add it to the dev section of the project.optional-dependencies table instead.
  3. Dependencies needed for documentation generation are found in the docs sections of project.optional-dependencies.

After adding the dependency in either of these sections, run the helper script hack/lock-deps.sh (which in turn uses pip-compile) to pin all dependencies again:

python -m pip install --upgrade pip-tools\nhack/lock-deps.sh\n

In addition to these manual steps, we also provide pre-commit hooks that automatically lock the dependencies whenever pyproject.toml is changed.

Selective upgrades of existing dependencies are also handled by the helper script above. If you want to update the lakefs-sdk dependency, for example, simply run:

hack/lock-deps.sh lakefs-sdk\n

Tip

Since the official development version is Python 3.11, please run the above commands in a virtual environment with Python 3.11.

"},{"location":"CONTRIBUTING/#working-on-documentation","title":"Working on Documentation","text":"

Improvements or additions to the project's documentation are highly appreciated.

The documentation is based on the MkDocs and Material for MkDocs (mkdocs-material) projects, see their homepages for in-depth guides on their features and usage. We use the Numpy documentation style for Python docstrings.

To build the documentation locally, first install the optional docs dependencies from requirements-docs.txt, e.g., with pip install -r requirements-docs.txt. You can then start a local documentation server with mkdocs serve, or build the documentation into its output folder public/ with mkdocs build.

In order to maintain documentation for multiple versions of this library, we use the mike tool, which automatically maintains individual documentation builds per version and publishes them to the gh-pages branch.

The GitHub CI pipeline automatically invokes mike as part of the release process with the correct version and updates the GitHub pages branch for the project.

"},{"location":"quickstart/","title":"Quickstart","text":"

Welcome! This quickstart guide will get you up and running with lakeFS-spec by showing you how to

  1. install the lakefs-spec package,
  2. spin up a local lakeFS server,
  3. create a lakeFS repository for experimentation, and
  4. perform basic file system operations in a lakeFS repository using lakeFS-spec.
Prerequisites

To follow along with this guide, you will need a few prerequisites ready on your machine:

  • a machine running Windows, macOS, or Linux (all supported by lakeFS-spec)
  • Docker, with Docker Compose
  • Python 3.9 or later
  • optionally, lakectl, the lakeFS command line tool

Please take a moment to make sure you have these tools available before proceeding with the next steps.

"},{"location":"quickstart/#installing-lakefs-spec","title":"Installing lakeFS-spec","text":"A note on virtual environments

We generally recommend installing the library in a virtual environment to ensure proper isolation, especially when following this quickstart guide.

If you are using Poetry, virtual environments can automatically be created by the tool.

If you prefer the venv functionality built into Python, see the official docs (tl;dr: python -m venv venv; source venv/bin/activate).

To install the package directly from PyPI, run:

pippoetry
pip install lakefs-spec\n
poetry add lakefs-spec\n

Or, if you want to try the latest pre-release version directly from GitHub:

pippoetry
pip install git+https://github.com/aai-institute/lakefs-spec.git\n
poetry add git+https://github.com/aai-institute/lakefs-spec.git\n
"},{"location":"quickstart/#first-steps","title":"First Steps","text":""},{"location":"quickstart/#spinning-up-a-local-lakefs-instance","title":"Spinning up a local lakeFS instance","text":"

Warning

This setup is not recommended for production uses, since it does not store the data persistently.

Please check out the lakeFS docs for production-ready deployment options.

If you don't already have access to a lakeFS server, you can quickly start a local instance using Docker Compose. Before continuing, please make sure Docker is installed and running on your machine.

The lakeFS quickstart deployment can be launched directly with a configuration file provided in the lakeFS-spec repository:

$ curl https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/hack/docker-compose.yml | docker-compose -f - up\n

If you do not have curl installed on your machine or would like to examine and/or customize the container configuration, you can also create a docker-compose.yml file locally and use it with docker-compose up:

docker-compose.yml
version: \"3\"\n\nservices:\n  lakefs:\n    image: treeverse/lakefs:1.7.0\n    ports:\n      - 8000:8000\n    environment:\n      LAKEFS_INSTALLATION_USER_NAME: \"quickstart\"\n      LAKEFS_INSTALLATION_ACCESS_KEY_ID: \"AKIAIOSFOLQUICKSTART\"\n      LAKEFS_INSTALLATION_SECRET_ACCESS_KEY: \"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\"\n      LAKEFS_DATABASE_TYPE: \"local\"\n      LAKEFS_AUTH_ENCRYPT_SECRET_KEY: \"THIS_MUST_BE_CHANGED_IN_PRODUCTION\"\n      LAKEFS_BLOCKSTORE_TYPE: \"local\"\n

In order to allow lakeFS-spec to automatically discover credentials to access this lakeFS instance, create a .lakectl.yaml in your home directory containing the credentials for the quickstart environment (you can also use lakectl config to create this file interactively if you have the lakectl tool installed on your machine):

~/.lakectl.yaml
credentials: # (1)!\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n
  1. These must match the credentials set in the environment section of the Docker Compose file above

After the container has finished initializing, you can access the web UI of your local lakeFS deployment in your browser. Fill out the setup form, where you can optionally share your email address with the developers of lakeFS to receive updates on their product. Next, you can log into your fresh lakeFS instance with the credentials listed above.

Success

Your fresh local lakeFS instance is a playground for you to explore lakeFS functionality.

In the next step, we will create your first repository on this server.

"},{"location":"quickstart/#create-a-lakefs-repository","title":"Create a lakeFS repository","text":"

Once you have logged into the web UI of the lakeFS server for the first time, you can create an empty repository on the next page. Click the small Click here link at the bottom of the page to proceed and create a repository named repo (we don't want to add the sample data for this guide):

Tip: Creating a repository later

If you have inadvertently skipped over the quickstart repository creation page, you can always create a new repository on the Repositories tab in the lakeFS web UI (and optionally choose to add the sample data):

Success

You have successfully created a lakeFS repository named repo, ready to be used with lakeFS-spec.

"},{"location":"quickstart/#using-the-lakefs-file-system","title":"Using the lakeFS file system","text":"

We will now use the lakeFS-spec file system interface to perform some basic operations on the repository created in the previous step:

  • Upload a local file to the repository
  • Read data from a file in the repository
  • Make a commit
  • Fetch metadata about repository contents
  • Delete a file from the repository

To get started, create a file called quickstart.py with the following contents:

quickstart.py
from pathlib import Path\n\nfrom lakefs_spec import LakeFSFileSystem\n\nREPO, BRANCH = \"repo\", \"main\"\n\n# Prepare example local data\nlocal_path = Path(\"demo.txt\")\nlocal_path.write_text(\"Hello, lakeFS!\")\n

Tip

We will keep adding more code to this file as we progress through the next steps. Feel free to execute the script after each step and observe the effects as noted in the guide.

This code snippet prepares a file demo.txt on your machine, ready to be added to the lakeFS repository, so let's do just that:

fs = LakeFSFileSystem()  # will auto-discover credentials from ~/.lakectl.yaml\nrepo_path = f\"{REPO}/{BRANCH}/{local_path.name}\"\n\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.put(str(local_path), f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Add demo data\")\n

If you execute the quickstart.py script at this point, you can already see the committed file in the lakeFS web UI:

While examining the file contents in the browser is nice, we want to access the committed file programmatically. Add the following lines at the end of your script and observe the output:

f = fs.open(repo_path, \"rt\")\nprint(f.readline())  # prints \"Hello, lakeFS!\"\n

Note that executing the same code multiple times will only result in a single commit in the repository since the contents of the file on disk and in the repository are identical.

In addition to simple read and write operations, the fsspec file system interface also allows us to list the files in a repository folder using ls, and query the metadata of objects in the repository through info (akin to the POSIX stat system call). Let's add the following code to our script and observe the output:

# Compare the sizes of local file and repo\nfile_info = fs.info(repo_path)\nprint(\n    f\"{local_path.name}: local size: {local_path.stat().st_size}, remote size: {file_info['size']}\"\n)\n\n# Get information about all files in the repo root\nprint(fs.ls(f\"{REPO}/{BRANCH}/\"))\n

As the last order of business, let's clean up the repository to its original state by removing the file using the rm operation and creating another commit (also, the local file is deleted, since we don't need it anymore):

with fs.transaction(REPO, BRANCH) as tx:\n    fs.rm(f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Delete demo data\")\n\nlocal_path.unlink()\n

Success

You now have all the basic tools available to version data from your Python code using the file system interface provided by lakeFS-spec.

Full example code quickstart.py
from pathlib import Path\n\nfrom lakefs_spec import LakeFSFileSystem\n\nREPO, BRANCH = \"repo\", \"main\"\n\n# Prepare example local data\nlocal_path = Path(\"demo.txt\")\nlocal_path.write_text(\"Hello, lakeFS!\")\n\n# Upload the local file to the repo and commit\nfs = LakeFSFileSystem()  # will auto-discover credentials from ~/.lakectl.yaml\nrepo_path = f\"{REPO}/{BRANCH}/{local_path.name}\"\n\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.put(str(local_path), f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Add demo data\")\n\n# Read back the file contents\nf = fs.open(repo_path, \"rt\")\nprint(f.readline())  # prints \"Hello, lakeFS!\"\n\n# Compare the sizes of local file and repo\nfile_info = fs.info(repo_path)\nprint(\n    f\"{local_path.name}: local size: {local_path.stat().st_size}, remote size: {file_info['size']}\"\n)\n\n# Get information about all files in the repo root\nprint(fs.ls(f\"{REPO}/{BRANCH}/\"))\n\n# Delete uploaded file from the repository (and commit)\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.rm(f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Delete demo data\")\n\nlocal_path.unlink()\n
"},{"location":"quickstart/#next-steps","title":"Next Steps","text":"

After this walkthrough of the installation and an introduction to basic file system operations using lakeFS-spec, you might want to consider more advanced topics:

  • API Reference
  • User Guide, in particular
    • How to use the lakeFS file system
    • How to use lakeFS-spec with third-party data science libraries
  • Tutorial: Using lakeFS-spec in a data science project
"},{"location":"guides/","title":"User Guide","text":"

The lakeFS-spec user guide provides documentation for users of the library looking to solve specific tasks. See the Quickstart guide for an introductory tutorial.

  • How to use the lakeFS file system
  • Passing configuration to the file system
  • Using file system transactions
  • Using transactions on the lakeFS file system
  • How to use lakeFS-spec with third-party data science libraries
"},{"location":"guides/configuration/","title":"Passing configuration to the file system","text":"

There are multiple ways to configure the LakeFSFileSystem for use with a deployed lakeFS instance. This guide introduces them in the order of least to most in-Python configuration - the preferred way to use the file system is with as little Python code as possible.

Info

The configuration methods are introduced in reverse order of precedence - config file arguments have the lowest priority and are overwritten by environment variables (if specified).

"},{"location":"guides/configuration/#the-lakectlyaml-configuration-file","title":"The .lakectl.yaml configuration file","text":"

The easiest way of configuring the lakeFS file system is with a lakectl YAML configuration file. To address a lakeFS server, the following minimum configuration is required:

~/.lakectl.yaml
credentials:\n  access_key_id: <ID>\n  secret_access_key: <KEY>\nserver:\n  endpoint_url: <LAKEFS-HOST>\n

For a local instance produced by the quickstart, the following values will work:

~/.lakectl.yaml
credentials:\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n

To work \"out of the box\" without any further arguments, the configuration file has to be placed in your home directory under the name .lakectl.yaml (this is where lakeFS expects it). If you set all values correctly, you can instantiate the lakeFS file system without any arguments:

from lakefs_spec import LakeFSFileSystem\n\n# zero config necessary.\nfs = LakeFSFileSystem()\n

If you cannot use the default location ($HOME/.lakectl.yaml), you can read a file from any other location by passing the configfile argument:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem(configfile=\"/path/to/my/configfile.yaml\")\n
"},{"location":"guides/configuration/#setting-environment-variables","title":"Setting environment variables","text":"

It is also possible to specify certain configuration values used for authentication with the lakeFS server through environment variables. For these values, the variable name is exactly the constructor argument name prefixed with LAKEFS_, e.g. the host argument can be set via the LAKEFS_HOST environment variable.

import os\nfrom lakefs_spec import LakeFSFileSystem\n\nos.environ[\"LAKEFS_HOST\"] = \"http://my-lakefs.host\"\nos.environ[\"LAKEFS_USERNAME\"] = \"my-username\"\nos.environ[\"LAKEFS_PASSWORD\"] = \"my-password\"\n\n# also zero-config.\nfs = LakeFSFileSystem()\n

Info

Not all initialization values can be set via environment variables - the proxy, create_branch_ok, and source_branch arguments can only be supplied in Python.
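
These Python-only options can still be combined with zero-config credential discovery by passing them directly to the constructor. A minimal sketch (host and credentials are discovered as usual; the proxy address below is a placeholder):

from lakefs_spec import LakeFSFileSystem\n\n# host and credentials are still auto-discovered from the environment or ~/.lakectl.yaml\nfs = LakeFSFileSystem(\n    proxy=\"http://proxy.example.com:3128\",  # placeholder address\n    create_branch_ok=True,\n    source_branch=\"main\",\n)\n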

"},{"location":"guides/configuration/#appendix-mixing-zero-config-methods","title":"Appendix: Mixing zero-config methods","text":"

Two of the introduced methods allow for \"zero-config\" (i.e. no arguments given to the constructor) initialization of the file system. However, care must be taken when working with multiple, differently configured file systems in the same process (for example, one configured from the config file and another from environment variables).

The reason for this is the instance caching mechanism built into fsspec. While this allows for efficient reuse of file systems e.g. by third-party libraries (pandas, DuckDB, ...), it can lead to silent misconfigurations. Consider this example, with an existent .lakectl.yaml file:

~/.lakectl.yaml
credentials:\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n

Now, mixing config file and environment variable initializations leads to the wrong result:

import os\nfrom lakefs_spec import LakeFSFileSystem\n\n# first file system, initialized from the config file\nconfig_fs = LakeFSFileSystem()\n\nos.environ[\"LAKEFS_HOST\"] = \"http://my-other-lakefs.host\"\nos.environ[\"LAKEFS_USERNAME\"] = \"my-username\"\nos.environ[\"LAKEFS_PASSWORD\"] = \"my-password\"\n\nenvvar_fs = LakeFSFileSystem()\n\nprint(config_fs is envvar_fs) # <- prints True! \n

The reason why the above code does not work as desired is that the cached config-file-initialized file system is simply reused on the second assignment. To clear the file system instance cache, you can run the following:

from lakefs_spec import LakeFSFileSystem\n\nLakeFSFileSystem.clear_instance_cache()\n
"},{"location":"guides/filesystem-usage/","title":"How to use the lakeFS file system","text":"

This guide contains instructions and code snippets on how to use the lakeFS file system.

"},{"location":"guides/filesystem-usage/#the-lakefs-uri-structure","title":"The lakeFS URI structure","text":"

In the following subsections, we frequently make use of lakeFS URIs in the example code. lakeFS URIs identify resources in a lakeFS deployment through a unique path consisting of repository name, lakeFS revision/ref name, and file name relative to the repository root. Optionally, they may be prefixed with the lakefs:// URI scheme (this is required when using third-party libraries).

As an example, a URI like repo/main/file.txt addresses the file.txt file on the main branch in the repository named repo.

In some lakeFS file system operations, directories are also allowed as resource names. For example, the URI repo/main/data/ (note the optional trailing slash) refers to the data directory on the main branch in the repo repository.
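
As a short sketch, the same conventions apply with or without the scheme prefix (the repository, branch, and file names below are illustrative):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# bare lakeFS URI, accepted directly by the file system methods\nprint(fs.info(\"repo/main/file.txt\"))\n\n# the same object, addressed with the optional lakefs:// scheme\nprint(fs.info(\"lakefs://repo/main/file.txt\"))\n\n# a directory listing, using the optional trailing slash\nprint(fs.ls(\"repo/main/data/\"))\n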

"},{"location":"guides/filesystem-usage/#on-staged-versus-committed-changes","title":"On staged versus committed changes","text":"

When uploading, copying, or removing files or directories on a branch, those operations result in staged changes in the repository until a commit is created. lakeFS-spec does not create these commits automatically, since it rigorously separates file operations from versioning operations. If you want to conduct versioning operations, like creating commits, between file transfers, the best way to do so is by using file system transactions.
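
To illustrate the difference, here is a minimal sketch (repository and branch names are placeholders): a plain upload only stages a change, while a transaction is used to create the commit explicitly.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# stages a change on the branch, but does not create a commit\nfs.put_file(\"data.csv\", \"my-repo/my-branch/data.csv\")\n\n# versioning operations such as commits happen inside a transaction\nwith fs.transaction(\"my-repo\", \"my-branch\") as tx:\n    fs.put_file(\"data.csv\", f\"my-repo/{tx.branch.id}/data.csv\")\n    tx.commit(message=\"Add data.csv\")\n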

"},{"location":"guides/filesystem-usage/#how-to-use-lakefs-file-system-apis","title":"How to use lakeFS file system APIs","text":"

The following section explains more in-depth how to use the LakeFSFileSystem APIs. This section concerns the explicitly implemented operations. In addition, there are a number of file system APIs inherited from the AbstractFileSystem interface in fsspec.

More information on file system usage can be found in the fsspec documentation.
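
For example, a few generic fsspec methods that lakeFS-spec does not re-implement still work out of the box (a sketch with illustrative paths; see the fsspec documentation for the complete API):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# inherited from fsspec's AbstractFileSystem\ndata = fs.cat_file(\"my-repo/my-ref/file.txt\")   # read the whole object as bytes\nhead = fs.head(\"my-repo/my-ref/file.txt\", 100)  # read the first 100 bytes\nis_file = fs.isfile(\"my-repo/my-ref/file.txt\")  # check whether the path is a file\n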

"},{"location":"guides/filesystem-usage/#uploading-and-downloading-files","title":"Uploading and downloading files","text":"

Arguably the most important feature of the file system is file transfers.

"},{"location":"guides/filesystem-usage/#file-uploads","title":"File uploads","text":"

To upload a file, you can use the fs.put() and fs.put_file() methods. While fs.put_file() operates on single files only, the fs.put() API can be used for directory uploads.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# local source path, then remote target path.\nfs.put_file(\"file.txt\", \"my-repo/my-ref/file.txt\")\n

If you want to upload an entire directory to lakeFS, you can use the fs.put() API together with the recursive=True switch:

# structure:\n#   dir/\n#   \u251c\u2500\u2500 a.txt\n#   \u251c\u2500\u2500 b.yaml\n#   \u251c\u2500\u2500 c.csv\n#   \u2514\u2500\u2500 ...\n\nfs.put(\"dir\", \"my-repo/my-ref/dir\", recursive=True)\n

Info

The above method of file uploading results in two transfers: one from the client to the lakeFS server, and one from the lakeFS server to the object storage. This can impact performance if the uploaded files are very large. To avoid this performance issue, you can also write the file directly to the underlying object storage:

fs = LakeFSFileSystem()\n\nfs.put_file(\"file.txt\", \"my-repo/my-ref/file.txt\", use_blockstore=True)\n

Direct lakeFS blockstore uploads require the installation of the corresponding fsspec file system implementation through pip. For an S3-based lakeFS deployment, install the s3fs package. For Google Cloud Storage (GCS), install the gcsfs package. For Azure blob storage, install the adlfs package.

"},{"location":"guides/filesystem-usage/#file-downloads","title":"File downloads","text":"

To download a file, you can use the fs.get() or fs.get_file() methods. While fs.get_file() downloads single files only, the fs.get() API can be used for recursive directory downloads.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# remote path, then local target path.\nfs.get_file(\"my-repo/my-ref/file.txt\", \"file.txt\")\n

In the case of a directory in lakeFS, use the fs.get() API together with the recursive=True switch:

# structure:\n#   dir/\n#   \u251c\u2500\u2500 a.txt\n#   \u251c\u2500\u2500 b.yaml\n#   \u251c\u2500\u2500 c.csv\n#   \u2514\u2500\u2500 ...\n\n# downloads the entire `dir` directory (and subdirectories) into the current directory.\nfs.get(\"my-repo/my-ref/dir\", \"dir\", recursive=True)\n
"},{"location":"guides/filesystem-usage/#checking-the-existence-of-lakefs-objects","title":"Checking the existence of lakeFS objects","text":"

To check the existence of a file in a given revision of a repository, you can use the fs.exists() API:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_file_exists = fs.exists(\"my-repo/my-ref/my-file.txt\")\n

This function returns True if the file exists on that revision, and False if it does not. Errors (e.g. permission errors) will be raised, since in that case, object existence cannot be decided.
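
A sketch of handling such an error explicitly (the path is a placeholder):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\ntry:\n    exists = fs.exists(\"my-repo/my-ref/restricted/file.txt\")\nexcept PermissionError:\n    # insufficient permissions: existence cannot be decided\n    exists = None\n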

Warning

fs.exists() only works on file objects, and will return False if called on directories.

"},{"location":"guides/filesystem-usage/#obtaining-info-on-stored-objects","title":"Obtaining info on stored objects","text":"

To query the metadata of a single object in a lakeFS repository, use the fs.info() API:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_file_info = fs.info(\"my-repo/my-ref/my-file.txt\")\n

The resulting my_file_info object is a dictionary containing useful information such as storage location of the file, creation timestamp, and size (in bytes).

You can also call fs.info() on directories:

dir_info = fs.info(\"my-repo/my-ref/dir/\")\n

In this case, the resulting dir_info object only contains the directory name and the total size of the files it contains.

"},{"location":"guides/filesystem-usage/#listing-directories-in-lakefs","title":"Listing directories in lakeFS","text":"

To list the files in a directory in lakeFS, use the fs.ls() method:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_dir_listing = fs.ls(\"my-repo/my-ref/my-dir/\")\n

This returns a list of Python dictionaries containing information on the objects contained in the requested directory. The returned objects have the same fields set as those returned by a normal fs.info() call on a file object.

"},{"location":"guides/filesystem-usage/#deleting-objects-from-a-lakefs-branch","title":"Deleting objects from a lakeFS branch","text":"

To delete objects from a lakeFS branch, use the fs.rm_file() or fs.rm() APIs. As before, while the former works only for single files, the latter can be used to remove entire directories with the recursive=True option.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nfs.rm_file(\"my-repo/my-branch/my-file.txt\")\n\n# removes the entire `my-dir` directory.\nfs.rm(\"my-repo/my-branch/my-dir/\", recursive=True)\n
"},{"location":"guides/filesystem-usage/#copying-files-in-a-repository","title":"Copying files in a repository","text":"

To copy files on a branch or from one branch to another, use the fs.cp_file() or fs.copy() methods:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# copies a single file on the same branch to a new location.\nfs.cp_file(\"my-repo/branch-a/file.txt\", \"my-repo/branch-a/file.txt.bak\")\n\n# copies a single file from branch A to branch B.\nfs.cp_file(\"my-repo/branch-a/file.txt\", \"my-repo/branch-b/file.txt\")\n\n# copies the entire `my-dir` directory from branch A to branch B (which must exist).\nfs.copy(\"my-repo/branch-a/my-dir/\", \"my-repo/branch-b/my-dir/\", recursive=True)\n

Info

Files and directories can only be copied between branches in the same repository, not between different repositories.

Trying to copy to a non-existent branch will not create the branch.

"},{"location":"guides/integrations/","title":"How to use lakeFS-spec with third-party data science libraries","text":"

lakeFS-spec is built on top of the fsspec library, which allows third-party libraries to make use of its file system abstraction to offer high-level features. The fsspec documentation lists examples of its users, mostly data science libraries.

This user guide page adds more detail on how lakeFS-spec can be used with four prominent data science libraries.

Code Examples

The code examples assume access to an existing lakeFS server with a quickstart repository containing the sample data already set up.

Please see the Quickstart guide or lakeFS quickstart guide if you need guidance in getting started.

The relevant lines for the lakeFS-spec integration in the following code snippets are highlighted.

"},{"location":"guides/integrations/#pandas","title":"Pandas","text":"

Pandas can read and write data from remote locations, and uses fsspec for all URLs that are not local or HTTP(S).

This means that (almost) all pd.read_* and pd.DataFrame.to_* operations can benefit from the lakeFS integration offered by our library without any additional configuration. See the Pandas documentation on reading/writing remote files for additional details.

The following code snippet illustrates how to read and write Pandas data frames in various formats from/to a lakeFS repository in the context of a transaction:

import pandas as pd\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = pd.read_parquet(f\"lakefs://quickstart/{tx.branch.id}/lakes.parquet\")\n    german_lakes = lakes.query('Country == \"Germany\"')\n    german_lakes.to_csv(f\"lakefs://quickstart/{tx.branch.id}/german_lakes.csv\")\n\n    tx.commit(message=\"Add German lakes\")\n
"},{"location":"guides/integrations/#duckdb","title":"DuckDB","text":"

The DuckDB in-memory database management system includes support for fsspec file systems as part of its Python API (see the official documentation on using fsspec filesystems for details). This allows DuckDB to transparently query and store data located in lakeFS repositories through lakeFS-spec.

Similar to the example above, the following code snippet illustrates how to read and write data from/to a lakeFS repository in the context of a transaction through the DuckDB Python API:

import duckdb\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nduckdb.register_filesystem(fs)\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = duckdb.read_parquet(\"lakefs://quickstart/main/lakes.parquet\")\n    italian_lakes = duckdb.sql(\"SELECT * FROM lakes where Country='Italy'\")\n    italian_lakes.to_csv(f\"lakefs://quickstart/{tx.branch.id}/italian_lakes.csv\")\n\n    tx.commit(message=\"Add Italian lakes\")\n
  1. Makes the lakeFS-spec file system known to DuckDB (duckdb.register_filesystem(fsspec.filesystem(\"lakefs\")) can also be used to avoid the direct import of LakeFSFileSystem)
"},{"location":"guides/integrations/#polars","title":"Polars","text":"

Warning

There is an ongoing discussion in the Polars development team whether to remove support for fsspec file systems, with no clear outcome as of the time this page was written. Please refer to the discussion on the relevant GitHub issue in case you encounter any problems.

The Python API wrapper for the Rust-based Polars DataFrame library can access remote storage through fsspec, similar to Pandas (see the official documentation on cloud storage).

Again, the following code example demonstrates how to read a Parquet file and save a modified version back in CSV format to a lakeFS repository from Polars in the context of a transaction:

import polars as pl\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = pl.read_parquet(f\"lakefs://quickstart/{tx.branch.id}/lakes.parquet\")\n    us_lakes = lakes.filter(pl.col(\"Country\") == \"United States of America\")\n\n    with fs.open(f\"lakefs://quickstart/{tx.branch.id}/us_lakes.csv\", \"wb\") as f:\n        us_lakes.write_csv(f)\n\n    tx.commit(message=\"Add US lakes\")\n
  1. Polars does not support directly writing to remote storage through the pl.DataFrame.write_* API (see docs)
"},{"location":"guides/integrations/#pyarrow","title":"PyArrow","text":"

Apache Arrow and its Python API, PyArrow, can also use fsspec file systems to perform I/O operations on data objects. The documentation has additional details on using fsspec-compatible file systems with Arrow.

PyArrow read_* and write_* functions take an explicit filesystem parameter, which accepts any fsspec file system, such as the LakeFSFileSystem provided by this library.

The following example code illustrates the use of lakeFS-spec with PyArrow, reading a Parquet file and writing it back to a lakeFS repository as a partitioned CSV dataset in the context of a transaction:

import pyarrow as pa\nimport pyarrow.dataset as ds\nimport pyarrow.parquet as pq\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes_table = pq.read_table(f\"quickstart/{tx.branch.id}/lakes.parquet\", filesystem=fs)\n\n    ds.write_dataset(\n        lakes_table,\n        f\"quickstart/{tx.branch.id}/lakes\",\n        filesystem=fs,\n        format=\"csv\",\n        partitioning=ds.partitioning(pa.schema([lakes_table.schema.field(\"Country\")])),\n    )\n\n    tx.commit(\"Add partitioned lakes data set\")\n
"},{"location":"guides/transactions/","title":"Using transactions on the lakeFS file system","text":"

In addition to file operations, you can carry out versioning operations in your Python code using file system transactions.

Transactions in lakeFS-spec behave similarly to the transactions in the high-level lakeFS SDK: Both approaches create an ephemeral branch for a transaction, perform the operations in the context block on that ephemeral branch, and optionally merge it back into the source branch upon exiting the context manager.

They are an \"all or nothing\" proposition: If an error occurs during the transaction, the base branch is left unchanged.

The lakeFS-spec transaction inherits from fsspec transactions. For more information on fsspec transactions, see the official documentation.

"},{"location":"guides/transactions/#versioning-operations","title":"Versioning operations","text":"

The lakeFS file system's transaction is the intended place for conducting versioning operations between file transfers. The following example shows file uploads, each followed by a commit, with a tag applied at the end.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\") as tx:\n    fs.put_file(\"train-data.txt\", f\"repo/{tx.branch.id}/train-data.txt\")\n    tx.commit(message=\"Add training data\")\n    fs.put_file(\"test-data.txt\", f\"repo/{tx.branch.id}/test-data.txt\")\n    sha = tx.commit(message=\"Add test data\")\n    tx.tag(sha, name=\"My train-test split\")\n

The full list of supported lakeFS versioning operations (by default, these operations target the transaction branch):

  • commit, for creating a commit, optionally with attached metadata.
  • merge, for merging a given branch.
  • revert, for reverting a previous commit.
  • rev_parse, for parsing revisions like branch/tag names and SHA fragments into full commit SHAs.
  • tag, for creating a tag pointing to a commit.
"},{"location":"guides/transactions/#lifecycle-of-ephemeral-transaction-branches","title":"Lifecycle of ephemeral transaction branches","text":"

You can control the lifecycle for a transaction branch with the delete argument:

  • By default (delete=\"onsuccess\"), the branch is deleted after successful completion, and left in place in case of failure for debugging purposes.
  • If delete=\"always\", the branch is unconditionally deleted after the transaction regardless of its status.
  • Similarly, if delete=\"never\", the branch is unconditionally left in place after the transaction.

Additionally, the automerge keyword controls whether the transaction branch is merged after successful completion of the transaction. It has no effect if an error occurs over the course of the transaction.
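
A sketch combining both keywords (repository and branch names are placeholders; see the API reference for the exact signature):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# keep the transaction branch around after completion and skip the final merge\nwith fs.transaction(\"repo\", \"main\", automerge=False, delete=\"never\") as tx:\n    fs.put_file(\"my-file.txt\", f\"repo/{tx.branch.id}/my-file.txt\")\n    tx.commit(message=\"Add my-file.txt\")\n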

"},{"location":"guides/transactions/#error-handling","title":"Error handling","text":"

Since all files are uploaded to a short-lived transaction branch, no commit on the target branch happens in case of an exception:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\", delete=\"onsuccess\") as tx:\n    fs.put_file(\"my-file.txt\", f\"repo/{tx.branch.id}/my-file.txt\")\n    tx.commit(message=\"Add my-file.txt\")\n    raise ValueError(\"oops!\")\n

The above code will not modify the main branch, since the ValueError prevents the merge of the transaction branch. Note that you can examine the contents of the transaction branch due to delete=\"onsuccess\" (the default behavior), which prevents deletion of the branch in case of failure for debugging purposes.

"},{"location":"reference/SUMMARY/","title":"SUMMARY","text":"
  • lakefs_spec
    • errors
    • spec
    • transaction
    • util
"},{"location":"reference/lakefs_spec/","title":"lakefs_spec","text":"

lakefs-spec is an fsspec file system integration for the lakeFS data lake.

"},{"location":"reference/lakefs_spec/errors/","title":"errors","text":"

Error translation facilities to map lakeFS API errors to Python-native OS errors in the lakeFS file system.

This is important to honor the fsspec API contract, where users only need to expect builtin Python exceptions to avoid complicated error handling setups.

"},{"location":"reference/lakefs_spec/errors/#lakefs_spec.errors.translate_lakefs_error","title":"translate_lakefs_error","text":"
translate_lakefs_error(\n    error: ServerException,\n    rpath: str | None = None,\n    message: str | None = None,\n    set_cause: bool = True,\n) -> OSError\n

Convert a lakeFS server exception to a Python builtin exception.

For some subclasses of lakefs.exceptions.ServerException, a direct Python builtin equivalent exists. In these cases, the suitable equivalent is returned. All other classes are converted to a standard IOError.

PARAMETER DESCRIPTION error

The exception returned by the lakeFS SDK wrapper.

TYPE: ServerException

rpath

The remote resource path involved in the error.

TYPE: str | None DEFAULT: None

message

An error message to use for the returned exception. If not given, the error message returned by the lakeFS server is used instead.

TYPE: str | None DEFAULT: None

set_cause

Whether to set the __cause__ attribute to the previous exception if the exception is translated.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION OSError

A builtin Python exception ready to be thrown.

Source code in src/lakefs_spec/errors.py
def translate_lakefs_error(\n    error: ServerException,\n    rpath: str | None = None,\n    message: str | None = None,\n    set_cause: bool = True,\n) -> OSError:\n    \"\"\"\n    Convert a lakeFS server exception to a Python builtin exception.\n\n    For some subclasses of ``lakefs.exceptions.ServerException``, a direct Python builtin equivalent exists.\n    In these cases, the suitable equivalent is returned. All other classes are converted to a standard ``IOError``.\n\n    Parameters\n    ----------\n    error: ServerException\n        The exception returned by the lakeFS SDK wrapper.\n    rpath: str | None\n        The remote resource path involved in the error.\n    message: str | None\n        An error message to use for the returned exception.\n         If not given, the error message returned by the lakeFS server is used instead.\n    set_cause: bool\n        Whether to set the ``__cause__`` attribute to the previous exception if the exception is translated.\n\n    Returns\n    -------\n    OSError\n        A builtin Python exception ready to be thrown.\n    \"\"\"\n    status = error.status_code\n\n    if hasattr(error, \"body\"):\n        # error has a JSON response body attached\n        reason = error.body[\"message\"]\n    else:\n        reason = error.reason\n\n    emsg = f\"{status} {reason}\"\n    if rpath:\n        emsg += f\": {rpath!r}\"\n\n    constructor = HTTP_CODE_TO_ERROR.get(status, partial(IOError, errno.EIO))\n    custom_exc = constructor(message or emsg)\n\n    if set_cause:\n        custom_exc.__cause__ = error\n    return custom_exc\n
"},{"location":"reference/lakefs_spec/spec/","title":"spec","text":"

Core interface definitions for file system interaction with lakeFS from Python, namely the LakeFSFileSystem and LakeFSFile classes.

"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem","title":"LakeFSFileSystem","text":"

Bases: AbstractFileSystem

lakeFS file system implementation.

Instances of this class are cached based on their constructor arguments.

For more information, see the fsspec documentation https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching.

PARAMETER DESCRIPTION host

The address of your lakeFS instance.

TYPE: str | None DEFAULT: None

username

The access key name to use in case of access key authentication.

TYPE: str | None DEFAULT: None

password

The access key secret to use in case of access key authentication.

TYPE: str | None DEFAULT: None

api_key

The API key to use in case of authentication with an API key.

TYPE: str | None DEFAULT: None

api_key_prefix

A string prefix to use for the API key in authentication.

TYPE: str | None DEFAULT: None

access_token

An access token to use in case of access token authentication.

TYPE: str | None DEFAULT: None

verify_ssl

Whether to verify SSL certificates in API interactions. Do not disable in production.

TYPE: bool DEFAULT: True

ssl_ca_cert

A custom certificate PEM file to use to verify the peer in SSL connections.

TYPE: str | None DEFAULT: None

proxy

Proxy address to use when connecting to a lakeFS server.

TYPE: str | None DEFAULT: None

create_branch_ok

Whether to create branches implicitly when not-existing branches are referenced on file uploads.

TYPE: bool DEFAULT: True

source_branch

Source branch set as origin when a new branch is implicitly created.

TYPE: str DEFAULT: 'main'

**storage_options

Configuration options to pass to the file system's directory cache.

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
class LakeFSFileSystem(AbstractFileSystem):\n    \"\"\"\n    lakeFS file system implementation.\n\n    Instances of this class are cached based on their constructor arguments.\n\n    For more information, see the fsspec documentation <https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching>.\n\n    Parameters\n    ----------\n    host: str | None\n        The address of your lakeFS instance.\n    username: str | None\n        The access key name to use in case of access key authentication.\n    password: str | None\n        The access key secret to use in case of access key authentication.\n    api_key: str | None\n        The API key to use in case of authentication with an API key.\n    api_key_prefix: str | None\n        A string prefix to use for the API key in authentication.\n    access_token: str | None\n        An access token to use in case of access token authentication.\n    verify_ssl: bool\n        Whether to verify SSL certificates in API interactions. Do not disable in production.\n    ssl_ca_cert: str | None\n        A custom certificate PEM file to use to verify the peer in SSL connections.\n    proxy: str | None\n        Proxy address to use when connecting to a lakeFS server.\n    create_branch_ok: bool\n        Whether to create branches implicitly when not-existing branches are referenced on file uploads.\n    source_branch: str\n        Source branch set as origin when a new branch is implicitly created.\n    **storage_options: Any\n        Configuration options to pass to the file system's directory cache.\n    \"\"\"\n\n    protocol = \"lakefs\"\n\n    def __init__(\n        self,\n        host: str | None = None,\n        username: str | None = None,\n        password: str | None = None,\n        api_key: str | None = None,\n        api_key_prefix: str | None = None,\n        access_token: str | None = None,\n        verify_ssl: bool = True,\n        ssl_ca_cert: str | None = None,\n        proxy: str | None = None,\n        create_branch_ok: bool = True,\n        source_branch: str = \"main\",\n        **storage_options: Any,\n    ):\n        super().__init__(**storage_options)\n\n        # lakeFS client arguments\n        cargs = [host, username, password, api_key, api_key_prefix, access_token, ssl_ca_cert]\n\n        if all(arg is None for arg in cargs):\n            # empty kwargs means envvar and configfile autodiscovery\n            self.client = Client()\n        else:\n            self.client = Client(\n                host=host,\n                username=username,\n                password=password,\n                api_key=api_key,\n                api_key_prefix=api_key_prefix,\n                access_token=access_token,\n                ssl_ca_cert=ssl_ca_cert,\n            )\n\n        # proxy address, not part of the constructor\n        self.client.config.proxy = proxy\n        # whether to verify SSL certs, not part of the constructor\n        self.client.config.verify_ssl = verify_ssl\n\n        self.create_branch_ok = create_branch_ok\n        self.source_branch = source_branch\n\n    @cached_property\n    def _lakefs_server_version(self):\n        with self.wrapped_api_call():\n            return tuple(int(t) for t in self.client.version.split(\".\"))\n\n    @classmethod\n    @overload\n    def _strip_protocol(cls, path: str | os.PathLike[str] | Path) -> str:\n        ...\n\n    @classmethod\n    @overload\n    def _strip_protocol(cls, path: list[str | os.PathLike[str] | Path]) -> list[str]:\n        ...\n\n    
@classmethod\n    def _strip_protocol(cls, path):\n        \"\"\"Copied verbatim from the base class, save for the slash rstrip.\"\"\"\n        if isinstance(path, list):\n            return [cls._strip_protocol(p) for p in path]\n        spath = super()._strip_protocol(path)\n        if stringify_path(path).endswith(\"/\"):\n            return spath + \"/\"\n        return spath\n\n    @property\n    def transaction(self) -> LakeFSTransaction:\n        \"\"\"\n        A context manager within which file uploads and versioning operations are deferred to a\n        queue, and carried out during when exiting the context.\n\n        Requires the file class to implement ``.commit()`` and ``.discard()`` for the normal and exception cases.\n        \"\"\"\n        self._transaction: LakeFSTransaction | None\n        if self._transaction is None:\n            self._transaction = LakeFSTransaction(self)\n        return self._transaction\n\n    def start_transaction(self):\n        raise NotImplementedError(\n            \"lakeFS transactions should only be used as a context manager via\"\n            \" `with LakeFSFileSystem.transaction as tx:`\"\n        )\n\n    @contextmanager\n    def wrapped_api_call(\n        self, rpath: str | None = None, message: str | None = None, set_cause: bool = True\n    ) -> Generator[None, None, None]:\n        \"\"\"\n        A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.\n\n        Meant for internal use.\n\n        Parameters\n        ----------\n        rpath: str | None\n            The remote path involved in the requested API call.\n        message: str | None\n            A custom error message to emit instead of parsing the API error response.\n        set_cause: bool\n            Whether to include the original lakeFS API error in the resulting traceback.\n\n        Yields\n        ------\n        None\n            An empty generator, to be used as a context manager.\n\n        Raises\n        ------\n        OSError\n            Translated error from the lakeFS API call, if any.\n        \"\"\"\n        try:\n            yield\n        except ServerException as e:\n            raise translate_lakefs_error(e, rpath=rpath, message=message, set_cause=set_cause)\n\n    def checksum(self, path: str | os.PathLike[str]) -> str | None:\n        \"\"\"\n        Get a remote lakeFS file object's checksum.\n\n        This is usually its MD5 hash, unless another hash function was used on upload.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path to look up the lakeFS checksum for. Must point to a single file object.\n\n        Returns\n        -------\n        str | None\n            The remote file's checksum, or ``None`` if ``path`` points to a directory or does not exist.\n        \"\"\"\n        path = stringify_path(path)\n        try:\n            return self.info(path).get(\"checksum\")\n        except FileNotFoundError:\n            return None\n\n    def exists(self, path: str | os.PathLike[str], **kwargs: Any) -> bool:\n        \"\"\"\n        Check existence of a remote path in a lakeFS repository.\n\n        Input paths can either be files or directories.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path whose existence to check. 
Must be a fully qualified lakeFS URI.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Returns\n        -------\n        bool\n            ``True`` if the requested path exists, ``False`` if it does not.\n\n        Raises\n        ------\n        PermissionError\n            If the user does not have sufficient permissions to query object existence.\n        \"\"\"\n        path = stringify_path(path)\n        repository, ref, resource = parse(path)\n        try:\n            reference = lakefs.Reference(repository, ref, client=self.client)\n            return reference.object(resource).exists()\n        except ServerException as e:\n            # in case of an error other than \"not found\", existence cannot be\n            # decided, so raise the translated error.\n            raise translate_lakefs_error(e)\n\n    def cp_file(\n        self, path1: str | os.PathLike[str], path2: str | os.PathLike[str], **kwargs: Any\n    ) -> None:\n        \"\"\"\n        Copy a single file from one remote location to another in lakeFS.\n\n        Parameters\n        ----------\n        path1: str | os.PathLike[str]\n            The remote file location to be copied.\n        path2: str | os.PathLike[str]\n            The (remote) target location to which to copy the file.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Raises\n        ------\n        ValueError\n            When attempting to copy objects between repositories.\n        \"\"\"\n        path1 = stringify_path(path1)\n        path2 = stringify_path(path2)\n        if path1 == path2:\n            return\n\n        orig_repo, orig_ref, orig_path = parse(path1)\n        dest_repo, dest_ref, dest_path = parse(path2)\n\n        if orig_repo != dest_repo:\n            raise ValueError(\n                \"can only copy objects within a repository, but got source \"\n                f\"repository {orig_repo!r} and destination repository {dest_repo!r}\"\n            )\n\n        with self.wrapped_api_call():\n            reference = lakefs.Reference(orig_repo, orig_ref, client=self.client)\n            reference.object(orig_path).copy(dest_ref, dest_path)\n\n    def get_file(\n        self,\n        rpath: str | os.PathLike[str],\n        lpath: str | os.PathLike[str],\n        callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n        outfile: Any = None,\n        precheck: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"\n        Download a single file from a remote lakeFS server to local storage.\n\n        Parameters\n        ----------\n        rpath: str | os.PathLike[str]\n            The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.\n        lpath: str | os.PathLike[str]\n            The local path on disk to save the downloaded file to.\n        callback: fsspec.callbacks.Callback\n            An fsspec callback to use during the operation. Can be used to report download progress.\n        outfile: Any\n            A file-like object to save the downloaded content to. 
Can be used in place of ``lpath``.\n        precheck: bool\n            Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n        **kwargs: Any\n            Additional keyword arguments passed to ``AbstractFileSystem.open()``.\n        \"\"\"\n        rpath = stringify_path(rpath)\n        lpath = stringify_path(lpath)\n\n        if precheck and Path(lpath).is_file():\n            local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n            remote_checksum = self.info(rpath).get(\"checksum\")\n            if local_checksum == remote_checksum:\n                logger.info(\n                    f\"Skipping download of resource {rpath!r} to local path {lpath!r}: \"\n                    f\"Resource {lpath!r} exists and checksums match.\"\n                )\n                return\n\n        with self.wrapped_api_call(rpath=rpath):\n            super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs)\n\n    def info(self, path: str | os.PathLike[str], **kwargs: Any) -> dict[str, Any]:\n        \"\"\"\n        Query a remote lakeFS object's metadata.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.ls()`` if ``path`` points to a directory.\n\n        Returns\n        -------\n        dict[str, Any]\n            A dictionary containing metadata on the object, including its full remote path and object type (file or directory).\n\n        Raises\n        ------\n        FileNotFoundError\n            If the ``path`` refers to a non-file path that does not exist in the repository.\n        \"\"\"\n        path = stringify_path(path)\n        repository, ref, resource = parse(path)\n        # first, try with `stat_object` in case of a file.\n        # the condition below checks edge cases of resources that cannot be files.\n        if resource and not resource.endswith(\"/\"):\n            try:\n                reference = lakefs.Reference(repository, ref, client=self.client)\n                res = reference.object(resource).stat()\n                return {\n                    \"checksum\": res.checksum,\n                    \"content-type\": res.content_type,\n                    \"mtime\": res.mtime,\n                    \"name\": f\"{repository}/{ref}/{res.path}\",\n                    \"size\": res.size_bytes,\n                    \"type\": \"file\",\n                }\n            except NotFoundException:\n                # fall through, retry with `ls` if it's a directory.\n                pass\n            except ServerException as e:\n                raise translate_lakefs_error(e, rpath=path)\n\n        out = self.ls(path, detail=True, recursive=True, **kwargs)\n        if not out:\n            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)\n\n        return {\n            \"name\": path.rstrip(\"/\"),\n            \"size\": sum(o.get(\"size\") or 0 for o in out),\n            \"type\": \"directory\",\n        }\n\n    def _update_dircache(self, info: list) -> None:\n        \"\"\"Update logic for dircache (optionally recursive) based on lakeFS API response\"\"\"\n        parents = {self._parent(i[\"name\"].rstrip(\"/\")) for i in info}\n        for pp in parents:\n            # subset of info entries 
which are direct descendants of `parent`\n            dir_info = [i for i in info if self._parent(i[\"name\"].rstrip(\"/\")) == pp]\n            if pp not in self.dircache:\n                self.dircache[pp] = dir_info\n                continue\n\n            # Merge existing dircache entry with updated listing, which contains either:\n            # - files not present in the cache yet\n            # - a fresh listing (if `refresh=True`)\n\n            cache_entry = self.dircache[pp][:]\n\n            old_names = {e[\"name\"] for e in cache_entry}\n            new_names = {e[\"name\"] for e in dir_info}\n\n            to_remove = old_names - new_names\n            to_update = old_names.intersection(new_names)\n\n            # Remove all entries no longer present in the current listing\n            cache_entry = [e for e in cache_entry if e[\"name\"] not in to_remove]\n\n            # Overwrite existing entries in the cache with its updated values\n            for name in to_update:\n                old_idx = next(idx for idx, e in enumerate(cache_entry) if e[\"name\"] == name)\n                new_entry = next(e for e in info if e[\"name\"] == name)\n\n                cache_entry[old_idx] = new_entry\n                dir_info.remove(new_entry)\n\n            # Add the remaining (new) entries to the cache\n            cache_entry.extend(dir_info)\n            self.dircache[pp] = sorted(cache_entry, key=operator.itemgetter(\"name\"))\n\n    def _ls_from_cache(self, path: str, recursive: bool = False) -> list[dict[str, Any]] | None:\n        \"\"\"Override of ``AbstractFileSystem._ls_from_cache`` with support for recursive listings.\"\"\"\n        if not recursive:\n            return super()._ls_from_cache(path)\n\n        result = None\n        for key, files in self.dircache.items():\n            if not (key.startswith(path) or path == key + \"/\"):\n                continue\n            if result is None:\n                result = []\n            result.extend(files)\n        if not result:\n            return result\n        return sorted(result, key=operator.itemgetter(\"name\"))\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: Literal[True] = ...,\n        **kwargs: Any,\n    ) -> list[dict[str, Any]]:\n        ...\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: Literal[False],\n        **kwargs: Any,\n    ) -> list[str]:\n        ...\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: bool = True,\n        **kwargs: Any,\n    ) -> list[str] | list[dict[str, Any]]:\n        ...\n\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: bool = True,\n        **kwargs: Any,\n    ) -> list[str] | list[dict[str, Any]]:\n        \"\"\"\n        List all available objects under a given path in lakeFS.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The path under which to list objects. 
Must be a fully qualified lakeFS URI.\n            Can also point to a file, in which case the file's metadata will be returned.\n        detail: bool\n            Whether to obtain all metadata on the requested objects or just their names.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility.\n\n            In particular:\n                `refresh: bool`: whether to skip the directory listing cache,\n                `recursive: bool`: whether to list subdirectory contents recursively\n\n        Returns\n        -------\n        list[str] | list[dict[str, Any]]\n            A list of all objects' metadata under the given remote path if ``detail=True``, or alternatively only their names if ``detail=False``.\n        \"\"\"\n        path = self._strip_protocol(path)\n        repository, ref, prefix = parse(path)\n\n        recursive = kwargs.pop(\"recursive\", False)\n\n        # Try lookup in dircache unless explicitly disabled by `refresh=True` kwarg\n        use_dircache = not kwargs.pop(\"refresh\", False)\n\n        if use_dircache:\n            cache_entry: list[Any] | None = None\n            try:\n                cache_entry = self._ls_from_cache(path, recursive=recursive)\n            except FileNotFoundError:\n                # we patch files missing from an ls call in the cache entry below,\n                # so this should not be an error.\n                pass\n\n            if cache_entry is not None:\n                if not detail:\n                    return [e[\"name\"] for e in cache_entry]\n                return cache_entry[:]\n\n        kwargs[\"prefix\"] = prefix\n\n        info = []\n        # stat infos are either the path only (`detail=False`) or a dict full of metadata\n        delimiter = \"\" if recursive else \"/\"\n        reference = lakefs.Reference(repository, ref, client=self.client)\n\n        with self.wrapped_api_call(rpath=path):\n            for obj in reference.objects(prefix=prefix, delimiter=delimiter):\n                if isinstance(obj, CommonPrefix):\n                    # prefixes are added below.\n                    info.append(\n                        {\n                            \"name\": f\"{repository}/{ref}/{obj.path}\",\n                            \"size\": 0,\n                            \"type\": \"directory\",\n                        }\n                    )\n                elif isinstance(obj, ObjectInfo):\n                    info.append(\n                        {\n                            \"checksum\": obj.checksum,\n                            \"content-type\": obj.content_type,\n                            \"mtime\": obj.mtime,\n                            \"name\": f\"{repository}/{ref}/{obj.path}\",\n                            \"size\": obj.size_bytes,\n                            \"type\": \"object\",\n                        }\n                    )\n\n        # Retry the API call with appended slash if the current result\n        # is just a single directory entry only (not its contents).\n        # This is useful to allow `ls(\"repo/branch/dir\")` calls without a trailing slash.\n        if len(info) == 1 and info[0][\"type\"] == \"directory\" and info[0][\"name\"] == path + \"/\":\n            return self.ls(\n                path + \"/\",\n                detail=detail,\n                **kwargs | {\"refresh\": not use_dircache, \"recursive\": recursive},\n            )\n\n        if recursive:\n            # To make recursive ls behave identical to the non-recursive case,\n  
          # add back virtual `directory` entries, which are only returned by\n            # the lakeFS API when querying non-recursively.\n            here = self._strip_protocol(path).rstrip(\"/\")\n            subdirs = {parent for o in info if (parent := self._parent(o[\"name\"])) != here}\n            for subdir in subdirs:\n                info.append(\n                    {\n                        \"name\": subdir + \"/\",\n                        \"size\": 0,\n                        \"type\": \"directory\",\n                    }\n                )\n\n        if info:\n            self._update_dircache(info[:])\n\n        if not detail:\n            info = [o[\"name\"] for o in info]  # type: ignore\n\n        return info\n\n    def open(\n        self,\n        path: str | os.PathLike[str],\n        mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n        pre_sign: bool = False,\n        content_type: str | None = None,\n        metadata: dict[str, str] | None = None,\n        autocommit: bool = True,\n        **kwargs: Any,\n    ) -> LakeFSIOBase:\n        \"\"\"\n        Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on ``mode``.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path for which to open a local ``LakeFSFile``. Must be a fully qualified lakeFS URI.\n        mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"]\n            The file mode indicating its purpose. Use ``r/rb`` for downloads from lakeFS, ``w/wb/x/xb`` for uploads to lakeFS.\n        pre_sign: bool\n            Whether to use a pre-signed URL for the file up-/download.\n        content_type: str | None\n            Content type to use for the file, relevant for uploads only.\n        metadata: dict[str, str] | None\n            Additional metadata to attach to the file, relevant for uploads only.\n        autocommit: bool\n            Whether to process the file immediately instead of queueing it for transaction while in a transaction context.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Returns\n        -------\n        LakeFSIOBase\n            A local file-like object ready to hold data to be received from / sent to a lakeFS server.\n\n        Raises\n        ------\n        NotImplementedError\n            If ``mode`` is not supported.\n        \"\"\"\n        if mode.endswith(\"t\"):\n            # text modes {r,w,x}t are equivalent to {r,w,x} here respectively.\n            mode = mode[:-1]  # type: ignore\n\n        if mode not in {\"r\", \"rb\", \"w\", \"wb\", \"x\", \"xb\"}:\n            raise NotImplementedError(f\"unsupported mode {mode!r}\")\n\n        path = stringify_path(path)\n        repo, ref, resource = parse(path)\n\n        if mode.startswith(\"r\"):\n            reference = lakefs.Reference(repo, ref, client=self.client)\n            obj = reference.object(resource)\n\n            if not obj.exists():\n                raise FileNotFoundError(path)\n            handler = ObjectReader(obj, mode=mode, pre_sign=pre_sign, client=self.client)\n        else:\n            # for writing ops, ref must be a branch\n            branch = lakefs.Branch(repo, ref, client=self.client)\n            if self.create_branch_ok:\n                branch.create(self.source_branch, exist_ok=True)\n\n            obj = branch.object(resource)\n         
   handler = ObjectWriter(\n                obj,\n                mode=mode,\n                pre_sign=pre_sign,\n                content_type=content_type,\n                metadata=metadata,\n                client=self.client,\n            )\n\n        ac = kwargs.pop(\"autocommit\", not self._intrans)\n        if not ac and \"r\" not in mode:\n            self._transaction.files.append(handler)\n\n        return handler\n\n    def put_file(\n        self,\n        lpath: str | os.PathLike[str],\n        rpath: str | os.PathLike[str],\n        callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n        precheck: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"\n        Upload a local file to a remote location on a lakeFS server.\n\n        Note that depending on the block store type, additional configuration like credentials may need to be configured when ``use_blockstore=True`` and ``presign=False``.\n\n        Parameters\n        ----------\n        lpath: str | os.PathLike[str]\n            The local path on disk to upload to the lakeFS server.\n        rpath: str | os.PathLike[str]\n            The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.\n        callback: fsspec.callbacks.Callback\n            An fsspec callback to use during the operation. Can be used to report download progress.\n        precheck: bool\n            Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n        \"\"\"\n        lpath = stringify_path(lpath)\n        rpath = stringify_path(rpath)\n\n        if precheck and Path(lpath).is_file():\n            remote_checksum = self.checksum(rpath)\n            local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n            if local_checksum == remote_checksum:\n                logger.info(\n                    f\"Skipping upload of resource {lpath!r} to remote path {rpath!r}: \"\n                    f\"Resource {rpath!r} exists and checksums match.\"\n                )\n                return\n\n        with self.wrapped_api_call(rpath=rpath):\n            super().put_file(lpath, rpath, callback=callback, **kwargs)\n\n    def rm_file(self, path: str | os.PathLike[str]) -> None:  # pragma: no cover\n        \"\"\"\n        Stage a remote file for removal on a lakeFS server.\n\n        The file will not actually be removed from the requested branch until a commit is created.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote file to delete. 
Must be a fully qualified lakeFS URI.\n        \"\"\"\n        self.rm(path)\n\n    def rm(\n        self, path: str | os.PathLike[str], recursive: bool = False, maxdepth: int | None = None\n    ) -> None:\n        \"\"\"\n        Stage multiple remote files for removal on a lakeFS server.\n\n        The files will not actually be removed from the requested branch until a commit is created.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            File(s) to delete.\n        recursive: bool\n            If file(s) include nested directories, recursively delete their contents.\n        maxdepth: int | None\n            Depth to pass to walk for finding files to delete, if recursive.\n            If None, there will be no limit and infinite recursion may be\n            possible.\n        \"\"\"\n\n        path = stringify_path(path)\n        repository, ref, prefix = parse(path)\n\n        with self.wrapped_api_call(rpath=path):\n            branch = lakefs.Branch(repository, ref, client=self.client)\n            objgen = branch.objects(prefix=prefix, delimiter=\"\" if recursive else \"/\")\n            if maxdepth is None:\n                branch.delete_objects(obj.path for obj in objgen)\n            else:\n                # nesting level is just the amount of \"/\"s in the path, no leading \"/\".\n                branch.delete_objects(obj.path for obj in objgen if obj.path.count(\"/\") <= maxdepth)\n\n            # Directory listing cache for the containing folder must be invalidated\n            self.dircache.pop(self._parent(path), None)\n\n    def touch(self, path: str | os.PathLike[str], truncate: bool = True, **kwargs: Any) -> None:\n        \"\"\"\n        Create an empty file or update an existing file on a lakeFS server.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The file path to create or update. Must be a fully qualified lakeFS URI.\n        truncate: bool\n            Whether to set the file size to 0 (zero) bytes, even if the path already exists.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n\n        Raises\n        ------\n        NotImplementedError\n            If the targeted lakeFS server version does not support `touch()` operations.\n        \"\"\"\n\n        # empty buffer upload errors were fixed in https://github.com/treeverse/lakeFS/issues/7130,\n        # which was first released in lakeFS v1.3.1.\n        if self._lakefs_server_version < (1, 3, 1):\n            version_string = \".\".join(str(v) for v in self._lakefs_server_version)\n            raise NotImplementedError(\n                \"LakeFSFileSystem.touch() is not supported for your lakeFS server version. \"\n                f\"minimum required version: '1.3.1', actual version: {version_string!r}\"\n            )\n\n        super().touch(path=path, truncate=truncate, **kwargs)\n\n    def tail(self, path: str | os.PathLike[str], size: int = 1024) -> bytes:\n        \"\"\"\n        Get the last ``size`` bytes from a remote file.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The file path to read. 
Must be a fully qualified lakeFS URI.\n        size: int\n            The number of bytes to read from the end of the file.\n\n        Returns\n        -------\n        bytes\n            The bytes at the end of the requested file.\n        \"\"\"\n        f: ObjectReader\n        with self.open(path, \"rb\") as f:\n            f.seek(max(-size, -f._obj.stat().size_bytes), 2)\n            return f.read()\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.transaction","title":"transaction property","text":"
transaction: LakeFSTransaction\n

A context manager within which file uploads and versioning operations are deferred to a queue and carried out when exiting the context.

Requires the file class to implement .commit() and .discard() for the normal and exception cases.
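A minimal usage sketch of the transaction context manager (the repository name "repo", the base branch "main", and the file name are placeholders; client credentials are assumed to be configured already, e.g. via environment variables or ~/.lakectl.yaml):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Upload onto the ephemeral transaction branch, commit, and (by default) merge back into main on success.\nwith fs.transaction(\"repo\", \"main\") as tx:\n    fs.put_file(\"train.csv\", f\"repo/{tx.branch.id}/train.csv\")\n    tx.commit(message=\"Add training data\")\n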

"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.wrapped_api_call","title":"wrapped_api_call","text":"
wrapped_api_call(\n    rpath: str | None = None, message: str | None = None, set_cause: bool = True\n) -> Generator[None, None, None]\n

A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.

Meant for internal use.

PARAMETER DESCRIPTION rpath

The remote path involved in the requested API call.

TYPE: str | None DEFAULT: None

message

A custom error message to emit instead of parsing the API error response.

TYPE: str | None DEFAULT: None

set_cause

Whether to include the original lakeFS API error in the resulting traceback.

TYPE: bool DEFAULT: True

YIELDS DESCRIPTION None

An empty generator, to be used as a context manager.

RAISES DESCRIPTION OSError

Translated error from the lakeFS API call, if any.

Source code in src/lakefs_spec/spec.py
@contextmanager\ndef wrapped_api_call(\n    self, rpath: str | None = None, message: str | None = None, set_cause: bool = True\n) -> Generator[None, None, None]:\n    \"\"\"\n    A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.\n\n    Meant for internal use.\n\n    Parameters\n    ----------\n    rpath: str | None\n        The remote path involved in the requested API call.\n    message: str | None\n        A custom error message to emit instead of parsing the API error response.\n    set_cause: bool\n        Whether to include the original lakeFS API error in the resulting traceback.\n\n    Yields\n    ------\n    None\n        An empty generator, to be used as a context manager.\n\n    Raises\n    ------\n    OSError\n        Translated error from the lakeFS API call, if any.\n    \"\"\"\n    try:\n        yield\n    except ServerException as e:\n        raise translate_lakefs_error(e, rpath=rpath, message=message, set_cause=set_cause)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.checksum","title":"checksum","text":"
checksum(path: str | PathLike[str]) -> str | None\n

Get a remote lakeFS file object's checksum.

This is usually its MD5 hash, unless another hash function was used on upload.

PARAMETER DESCRIPTION path

The remote path to look up the lakeFS checksum for. Must point to a single file object.

TYPE: str | PathLike[str]

RETURNS DESCRIPTION str | None

The remote file's checksum, or None if path points to a directory or does not exist.

Source code in src/lakefs_spec/spec.py
def checksum(self, path: str | os.PathLike[str]) -> str | None:\n    \"\"\"\n    Get a remote lakeFS file object's checksum.\n\n    This is usually its MD5 hash, unless another hash function was used on upload.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path to look up the lakeFS checksum for. Must point to a single file object.\n\n    Returns\n    -------\n    str | None\n        The remote file's checksum, or ``None`` if ``path`` points to a directory or does not exist.\n    \"\"\"\n    path = stringify_path(path)\n    try:\n        return self.info(path).get(\"checksum\")\n    except FileNotFoundError:\n        return None\n
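A short checksum lookup sketch (repository, branch, and object path are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n# Usually an MD5 hex digest; None is returned for directories and non-existent paths.\nprint(fs.checksum(\"repo/main/data/train.csv\"))\n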
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.exists","title":"exists","text":"
exists(path: str | PathLike[str], **kwargs: Any) -> bool\n

Check existence of a remote path in a lakeFS repository.

Input paths can either be files or directories.

PARAMETER DESCRIPTION path

The remote path whose existence to check. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION bool

True if the requested path exists, False if it does not.

RAISES DESCRIPTION PermissionError

If the user does not have sufficient permissions to query object existence.

Source code in src/lakefs_spec/spec.py
def exists(self, path: str | os.PathLike[str], **kwargs: Any) -> bool:\n    \"\"\"\n    Check existence of a remote path in a lakeFS repository.\n\n    Input paths can either be files or directories.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path whose existence to check. Must be a fully qualified lakeFS URI.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Returns\n    -------\n    bool\n        ``True`` if the requested path exists, ``False`` if it does not.\n\n    Raises\n    ------\n    PermissionError\n        If the user does not have sufficient permissions to query object existence.\n    \"\"\"\n    path = stringify_path(path)\n    repository, ref, resource = parse(path)\n    try:\n        reference = lakefs.Reference(repository, ref, client=self.client)\n        return reference.object(resource).exists()\n    except ServerException as e:\n        # in case of an error other than \"not found\", existence cannot be\n        # decided, so raise the translated error.\n        raise translate_lakefs_error(e)\n
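For illustration, an existence check against a hypothetical object (all names below are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nif fs.exists(\"repo/main/data/train.csv\"):\n    print(\"object present on main\")\n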
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.cp_file","title":"cp_file","text":"
cp_file(path1: str | PathLike[str], path2: str | PathLike[str], **kwargs: Any) -> None\n

Copy a single file from one remote location to another in lakeFS.

PARAMETER DESCRIPTION path1

The remote file location to be copied.

TYPE: str | PathLike[str]

path2

The (remote) target location to which to copy the file.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RAISES DESCRIPTION ValueError

When attempting to copy objects between repositories.

Source code in src/lakefs_spec/spec.py
def cp_file(\n    self, path1: str | os.PathLike[str], path2: str | os.PathLike[str], **kwargs: Any\n) -> None:\n    \"\"\"\n    Copy a single file from one remote location to another in lakeFS.\n\n    Parameters\n    ----------\n    path1: str | os.PathLike[str]\n        The remote file location to be copied.\n    path2: str | os.PathLike[str]\n        The (remote) target location to which to copy the file.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Raises\n    ------\n    ValueError\n        When attempting to copy objects between repositories.\n    \"\"\"\n    path1 = stringify_path(path1)\n    path2 = stringify_path(path2)\n    if path1 == path2:\n        return\n\n    orig_repo, orig_ref, orig_path = parse(path1)\n    dest_repo, dest_ref, dest_path = parse(path2)\n\n    if orig_repo != dest_repo:\n        raise ValueError(\n            \"can only copy objects within a repository, but got source \"\n            f\"repository {orig_repo!r} and destination repository {dest_repo!r}\"\n        )\n\n    with self.wrapped_api_call():\n        reference = lakefs.Reference(orig_repo, orig_ref, client=self.client)\n        reference.object(orig_path).copy(dest_ref, dest_path)\n
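A sketch of a copy between two branches of the same repository (branch and path names are placeholders; cross-repository copies raise ValueError as noted above):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n# Copy within one repository, from the main branch to a dev branch.\nfs.cp_file(\"repo/main/data/train.csv\", \"repo/dev/data/train.csv\")\n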
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.get_file","title":"get_file","text":"
get_file(\n    rpath: str | PathLike[str],\n    lpath: str | PathLike[str],\n    callback: Callback = _DEFAULT_CALLBACK,\n    outfile: Any = None,\n    precheck: bool = True,\n    **kwargs: Any\n) -> None\n

Download a single file from a remote lakeFS server to local storage.

PARAMETER DESCRIPTION rpath

The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.

TYPE: str | PathLike[str]

lpath

The local path on disk to save the downloaded file to.

TYPE: str | PathLike[str]

callback

An fsspec callback to use during the operation. Can be used to report download progress.

TYPE: Callback DEFAULT: _DEFAULT_CALLBACK

outfile

A file-like object to save the downloaded content to. Can be used in place of lpath.

TYPE: Any DEFAULT: None

precheck

Check if lpath already exists and compare its checksum with that of rpath, skipping the download if they match.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments passed to AbstractFileSystem.open().

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
def get_file(\n    self,\n    rpath: str | os.PathLike[str],\n    lpath: str | os.PathLike[str],\n    callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n    outfile: Any = None,\n    precheck: bool = True,\n    **kwargs: Any,\n) -> None:\n    \"\"\"\n    Download a single file from a remote lakeFS server to local storage.\n\n    Parameters\n    ----------\n    rpath: str | os.PathLike[str]\n        The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.\n    lpath: str | os.PathLike[str]\n        The local path on disk to save the downloaded file to.\n    callback: fsspec.callbacks.Callback\n        An fsspec callback to use during the operation. Can be used to report download progress.\n    outfile: Any\n        A file-like object to save the downloaded content to. Can be used in place of ``lpath``.\n    precheck: bool\n        Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n    **kwargs: Any\n        Additional keyword arguments passed to ``AbstractFileSystem.open()``.\n    \"\"\"\n    rpath = stringify_path(rpath)\n    lpath = stringify_path(lpath)\n\n    if precheck and Path(lpath).is_file():\n        local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n        remote_checksum = self.info(rpath).get(\"checksum\")\n        if local_checksum == remote_checksum:\n            logger.info(\n                f\"Skipping download of resource {rpath!r} to local path {lpath!r}: \"\n                f\"Resource {lpath!r} exists and checksums match.\"\n            )\n            return\n\n    with self.wrapped_api_call(rpath=rpath):\n        super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs)\n
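A download sketch (paths are placeholders); with precheck=True the download is skipped when a local file with a matching checksum already exists:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nfs.get_file(\"repo/main/data/train.csv\", \"train.csv\", precheck=True)\n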
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.info","title":"info","text":"
info(path: str | PathLike[str], **kwargs: Any) -> dict[str, Any]\n

Query a remote lakeFS object's metadata.

PARAMETER DESCRIPTION path

The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.ls() if path points to a directory.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION dict[str, Any]

A dictionary containing metadata on the object, including its full remote path and object type (file or directory).

RAISES DESCRIPTION FileNotFoundError

If path neither points to an existing file nor to an existing directory prefix in the repository.

Source code in src/lakefs_spec/spec.py
def info(self, path: str | os.PathLike[str], **kwargs: Any) -> dict[str, Any]:\n    \"\"\"\n    Query a remote lakeFS object's metadata.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.ls()`` if ``path`` points to a directory.\n\n    Returns\n    -------\n    dict[str, Any]\n        A dictionary containing metadata on the object, including its full remote path and object type (file or directory).\n\n    Raises\n    ------\n    FileNotFoundError\n        If the ``path`` refers to a non-file path that does not exist in the repository.\n    \"\"\"\n    path = stringify_path(path)\n    repository, ref, resource = parse(path)\n    # first, try with `stat_object` in case of a file.\n    # the condition below checks edge cases of resources that cannot be files.\n    if resource and not resource.endswith(\"/\"):\n        try:\n            reference = lakefs.Reference(repository, ref, client=self.client)\n            res = reference.object(resource).stat()\n            return {\n                \"checksum\": res.checksum,\n                \"content-type\": res.content_type,\n                \"mtime\": res.mtime,\n                \"name\": f\"{repository}/{ref}/{res.path}\",\n                \"size\": res.size_bytes,\n                \"type\": \"file\",\n            }\n        except NotFoundException:\n            # fall through, retry with `ls` if it's a directory.\n            pass\n        except ServerException as e:\n            raise translate_lakefs_error(e, rpath=path)\n\n    out = self.ls(path, detail=True, recursive=True, **kwargs)\n    if not out:\n        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)\n\n    return {\n        \"name\": path.rstrip(\"/\"),\n        \"size\": sum(o.get(\"size\") or 0 for o in out),\n        \"type\": \"directory\",\n    }\n
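A metadata lookup sketch (paths are placeholders); files and directory prefixes yield different type entries:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nfile_info = fs.info(\"repo/main/data/train.csv\")  # {\"type\": \"file\", \"checksum\": ..., \"size\": ...}\ndir_info = fs.info(\"repo/main/data/\")  # {\"type\": \"directory\", \"size\": sum of contained object sizes}\n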
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.ls","title":"ls","text":"
ls(\n    path: str | PathLike[str], detail: bool = True, **kwargs: Any\n) -> list[str] | list[dict[str, Any]]\n

List all available objects under a given path in lakeFS.

PARAMETER DESCRIPTION path

The path under which to list objects. Must be a fully qualified lakeFS URI. Can also point to a file, in which case the file's metadata will be returned.

TYPE: str | PathLike[str]

detail

Whether to obtain all metadata on the requested objects or just their names.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments for fsspec compatibility.

In particular, refresh (bool): whether to skip the directory listing cache, and recursive (bool): whether to list subdirectory contents recursively.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION list[str] | list[dict[str, Any]]

A list of all objects' metadata under the given remote path if detail=True, or only their names if detail=False.

Source code in src/lakefs_spec/spec.py
def ls(\n    self,\n    path: str | os.PathLike[str],\n    detail: bool = True,\n    **kwargs: Any,\n) -> list[str] | list[dict[str, Any]]:\n    \"\"\"\n    List all available objects under a given path in lakeFS.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The path under which to list objects. Must be a fully qualified lakeFS URI.\n        Can also point to a file, in which case the file's metadata will be returned.\n    detail: bool\n        Whether to obtain all metadata on the requested objects or just their names.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility.\n\n        In particular:\n            `refresh: bool`: whether to skip the directory listing cache,\n            `recursive: bool`: whether to list subdirectory contents recursively\n\n    Returns\n    -------\n    list[str] | list[dict[str, Any]]\n        A list of all objects' metadata under the given remote path if ``detail=True``, or alternatively only their names if ``detail=False``.\n    \"\"\"\n    path = self._strip_protocol(path)\n    repository, ref, prefix = parse(path)\n\n    recursive = kwargs.pop(\"recursive\", False)\n\n    # Try lookup in dircache unless explicitly disabled by `refresh=True` kwarg\n    use_dircache = not kwargs.pop(\"refresh\", False)\n\n    if use_dircache:\n        cache_entry: list[Any] | None = None\n        try:\n            cache_entry = self._ls_from_cache(path, recursive=recursive)\n        except FileNotFoundError:\n            # we patch files missing from an ls call in the cache entry below,\n            # so this should not be an error.\n            pass\n\n        if cache_entry is not None:\n            if not detail:\n                return [e[\"name\"] for e in cache_entry]\n            return cache_entry[:]\n\n    kwargs[\"prefix\"] = prefix\n\n    info = []\n    # stat infos are either the path only (`detail=False`) or a dict full of metadata\n    delimiter = \"\" if recursive else \"/\"\n    reference = lakefs.Reference(repository, ref, client=self.client)\n\n    with self.wrapped_api_call(rpath=path):\n        for obj in reference.objects(prefix=prefix, delimiter=delimiter):\n            if isinstance(obj, CommonPrefix):\n                # prefixes are added below.\n                info.append(\n                    {\n                        \"name\": f\"{repository}/{ref}/{obj.path}\",\n                        \"size\": 0,\n                        \"type\": \"directory\",\n                    }\n                )\n            elif isinstance(obj, ObjectInfo):\n                info.append(\n                    {\n                        \"checksum\": obj.checksum,\n                        \"content-type\": obj.content_type,\n                        \"mtime\": obj.mtime,\n                        \"name\": f\"{repository}/{ref}/{obj.path}\",\n                        \"size\": obj.size_bytes,\n                        \"type\": \"object\",\n                    }\n                )\n\n    # Retry the API call with appended slash if the current result\n    # is just a single directory entry only (not its contents).\n    # This is useful to allow `ls(\"repo/branch/dir\")` calls without a trailing slash.\n    if len(info) == 1 and info[0][\"type\"] == \"directory\" and info[0][\"name\"] == path + \"/\":\n        return self.ls(\n            path + \"/\",\n            detail=detail,\n            **kwargs | {\"refresh\": not use_dircache, \"recursive\": recursive},\n        )\n\n    if recursive:\n        # To make 
recursive ls behave identical to the non-recursive case,\n        # add back virtual `directory` entries, which are only returned by\n        # the lakeFS API when querying non-recursively.\n        here = self._strip_protocol(path).rstrip(\"/\")\n        subdirs = {parent for o in info if (parent := self._parent(o[\"name\"])) != here}\n        for subdir in subdirs:\n            info.append(\n                {\n                    \"name\": subdir + \"/\",\n                    \"size\": 0,\n                    \"type\": \"directory\",\n                }\n            )\n\n    if info:\n        self._update_dircache(info[:])\n\n    if not detail:\n        info = [o[\"name\"] for o in info]  # type: ignore\n\n    return info\n
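A listing sketch (paths are placeholders) showing the refresh and recursive keyword arguments described above:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nnames = fs.ls(\"repo/main/data/\", detail=False)\n# Bypass the directory listing cache and include nested objects.\nentries = fs.ls(\"repo/main/data/\", refresh=True, recursive=True)\n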
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.open","title":"open","text":"
open(\n    path: str | PathLike[str],\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n    pre_sign: bool = False,\n    content_type: str | None = None,\n    metadata: dict[str, str] | None = None,\n    autocommit: bool = True,\n    **kwargs: Any\n) -> LakeFSIOBase\n

Dispatch a lakeFS file-like object (a local buffer on disk) for the given remote path, used for uploads or downloads depending on mode.

PARAMETER DESCRIPTION path

The remote path for which to open a local LakeFSFile. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

mode

The file mode indicating its purpose. Use r/rb for downloads from lakeFS, w/wb/x/xb for uploads to lakeFS.

TYPE: Literal['r', 'rb', 'rt', 'w', 'wb', 'wt', 'x', 'xb', 'xt'] DEFAULT: 'rb'

pre_sign

Whether to use a pre-signed URL for the file up-/download.

TYPE: bool DEFAULT: False

content_type

Content type to use for the file, relevant for uploads only.

TYPE: str | None DEFAULT: None

metadata

Additional metadata to attach to the file, relevant for uploads only.

TYPE: dict[str, str] | None DEFAULT: None

autocommit

Whether to process the file immediately instead of queueing it for the transaction when inside a transaction context.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION LakeFSIOBase

A local file-like object ready to hold data to be received from / sent to a lakeFS server.

RAISES DESCRIPTION NotImplementedError

If mode is not supported.

Source code in src/lakefs_spec/spec.py
def open(\n    self,\n    path: str | os.PathLike[str],\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n    pre_sign: bool = False,\n    content_type: str | None = None,\n    metadata: dict[str, str] | None = None,\n    autocommit: bool = True,\n    **kwargs: Any,\n) -> LakeFSIOBase:\n    \"\"\"\n    Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on ``mode``.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path for which to open a local ``LakeFSFile``. Must be a fully qualified lakeFS URI.\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"]\n        The file mode indicating its purpose. Use ``r/rb`` for downloads from lakeFS, ``w/wb/x/xb`` for uploads to lakeFS.\n    pre_sign: bool\n        Whether to use a pre-signed URL for the file up-/download.\n    content_type: str | None\n        Content type to use for the file, relevant for uploads only.\n    metadata: dict[str, str] | None\n        Additional metadata to attach to the file, relevant for uploads only.\n    autocommit: bool\n        Whether to process the file immediately instead of queueing it for transaction while in a transaction context.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Returns\n    -------\n    LakeFSIOBase\n        A local file-like object ready to hold data to be received from / sent to a lakeFS server.\n\n    Raises\n    ------\n    NotImplementedError\n        If ``mode`` is not supported.\n    \"\"\"\n    if mode.endswith(\"t\"):\n        # text modes {r,w,x}t are equivalent to {r,w,x} here respectively.\n        mode = mode[:-1]  # type: ignore\n\n    if mode not in {\"r\", \"rb\", \"w\", \"wb\", \"x\", \"xb\"}:\n        raise NotImplementedError(f\"unsupported mode {mode!r}\")\n\n    path = stringify_path(path)\n    repo, ref, resource = parse(path)\n\n    if mode.startswith(\"r\"):\n        reference = lakefs.Reference(repo, ref, client=self.client)\n        obj = reference.object(resource)\n\n        if not obj.exists():\n            raise FileNotFoundError(path)\n        handler = ObjectReader(obj, mode=mode, pre_sign=pre_sign, client=self.client)\n    else:\n        # for writing ops, ref must be a branch\n        branch = lakefs.Branch(repo, ref, client=self.client)\n        if self.create_branch_ok:\n            branch.create(self.source_branch, exist_ok=True)\n\n        obj = branch.object(resource)\n        handler = ObjectWriter(\n            obj,\n            mode=mode,\n            pre_sign=pre_sign,\n            content_type=content_type,\n            metadata=metadata,\n            client=self.client,\n        )\n\n    ac = kwargs.pop(\"autocommit\", not self._intrans)\n    if not ac and \"r\" not in mode:\n        self._transaction.files.append(handler)\n\n    return handler\n
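A sketch of read and write modes (paths are placeholders; writing requires the ref to be a branch):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nwith fs.open(\"repo/main/data/train.csv\", \"rb\") as f:\n    head = f.read(1024)\n\nwith fs.open(\"repo/dev/data/head.csv\", \"wb\") as f:\n    f.write(head)\n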
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.put_file","title":"put_file","text":"
put_file(\n    lpath: str | PathLike[str],\n    rpath: str | PathLike[str],\n    callback: Callback = _DEFAULT_CALLBACK,\n    precheck: bool = True,\n    **kwargs: Any\n) -> None\n

Upload a local file to a remote location on a lakeFS server.

Note that depending on the block store type, additional configuration such as credentials may be required when use_blockstore=True and presign=False.

PARAMETER DESCRIPTION lpath

The local path on disk to upload to the lakeFS server.

TYPE: str | PathLike[str]

rpath

The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

callback

An fsspec callback to use during the operation. Can be used to report upload progress.

TYPE: Callback DEFAULT: _DEFAULT_CALLBACK

precheck

Check if lpath already exists locally and compare its checksum with that of rpath, skipping the upload if they match.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.open().

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
def put_file(\n    self,\n    lpath: str | os.PathLike[str],\n    rpath: str | os.PathLike[str],\n    callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n    precheck: bool = True,\n    **kwargs: Any,\n) -> None:\n    \"\"\"\n    Upload a local file to a remote location on a lakeFS server.\n\n    Note that depending on the block store type, additional configuration like credentials may need to be configured when ``use_blockstore=True`` and ``presign=False``.\n\n    Parameters\n    ----------\n    lpath: str | os.PathLike[str]\n        The local path on disk to upload to the lakeFS server.\n    rpath: str | os.PathLike[str]\n        The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.\n    callback: fsspec.callbacks.Callback\n        An fsspec callback to use during the operation. Can be used to report download progress.\n    precheck: bool\n        Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n    \"\"\"\n    lpath = stringify_path(lpath)\n    rpath = stringify_path(rpath)\n\n    if precheck and Path(lpath).is_file():\n        remote_checksum = self.checksum(rpath)\n        local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n        if local_checksum == remote_checksum:\n            logger.info(\n                f\"Skipping upload of resource {lpath!r} to remote path {rpath!r}: \"\n                f\"Resource {rpath!r} exists and checksums match.\"\n            )\n            return\n\n    with self.wrapped_api_call(rpath=rpath):\n        super().put_file(lpath, rpath, callback=callback, **kwargs)\n
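An upload sketch (paths are placeholders); with precheck=True the upload is skipped when the remote object already exists with a matching checksum:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nfs.put_file(\"train.csv\", \"repo/main/data/train.csv\", precheck=True)\n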
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.rm_file","title":"rm_file","text":"
rm_file(path: str | PathLike[str]) -> None\n

Stage a remote file for removal on a lakeFS server.

The file will not actually be removed from the requested branch until a commit is created.

PARAMETER DESCRIPTION path

The remote file to delete. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

Source code in src/lakefs_spec/spec.py
def rm_file(self, path: str | os.PathLike[str]) -> None:  # pragma: no cover\n    \"\"\"\n    Stage a remote file for removal on a lakeFS server.\n\n    The file will not actually be removed from the requested branch until a commit is created.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote file to delete. Must be a fully qualified lakeFS URI.\n    \"\"\"\n    self.rm(path)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.rm","title":"rm","text":"
rm(path: str | PathLike[str], recursive: bool = False, maxdepth: int | None = None) -> None\n

Stage multiple remote files for removal on a lakeFS server.

The files will not actually be removed from the requested branch until a commit is created.

PARAMETER DESCRIPTION path

File(s) to delete.

TYPE: str | PathLike[str]

recursive

If file(s) include nested directories, recursively delete their contents.

TYPE: bool DEFAULT: False

maxdepth

Depth to pass to walk for finding files to delete, if recursive. If None, there will be no limit and infinite recursion may be possible.

TYPE: int | None DEFAULT: None

Source code in src/lakefs_spec/spec.py
def rm(\n    self, path: str | os.PathLike[str], recursive: bool = False, maxdepth: int | None = None\n) -> None:\n    \"\"\"\n    Stage multiple remote files for removal on a lakeFS server.\n\n    The files will not actually be removed from the requested branch until a commit is created.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        File(s) to delete.\n    recursive: bool\n        If file(s) include nested directories, recursively delete their contents.\n    maxdepth: int | None\n        Depth to pass to walk for finding files to delete, if recursive.\n        If None, there will be no limit and infinite recursion may be\n        possible.\n    \"\"\"\n\n    path = stringify_path(path)\n    repository, ref, prefix = parse(path)\n\n    with self.wrapped_api_call(rpath=path):\n        branch = lakefs.Branch(repository, ref, client=self.client)\n        objgen = branch.objects(prefix=prefix, delimiter=\"\" if recursive else \"/\")\n        if maxdepth is None:\n            branch.delete_objects(obj.path for obj in objgen)\n        else:\n            # nesting level is just the amount of \"/\"s in the path, no leading \"/\".\n            branch.delete_objects(obj.path for obj in objgen if obj.path.count(\"/\") <= maxdepth)\n\n        # Directory listing cache for the containing folder must be invalidated\n        self.dircache.pop(self._parent(path), None)\n
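A removal sketch (the prefix is a placeholder); the deletions only become permanent with a subsequent commit on the branch:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n# Stage every object under the prefix for removal.\nfs.rm(\"repo/main/data/\", recursive=True)\n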
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.touch","title":"touch","text":"
touch(path: str | PathLike[str], truncate: bool = True, **kwargs: Any) -> None\n

Create an empty file or update an existing file on a lakeFS server.

PARAMETER DESCRIPTION path

The file path to create or update. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

truncate

Whether to set the file size to 0 (zero) bytes, even if the path already exists.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.open().

TYPE: Any DEFAULT: {}

RAISES DESCRIPTION NotImplementedError

If the targeted lakeFS server version does not support touch() operations.

Source code in src/lakefs_spec/spec.py
def touch(self, path: str | os.PathLike[str], truncate: bool = True, **kwargs: Any) -> None:\n    \"\"\"\n    Create an empty file or update an existing file on a lakeFS server.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The file path to create or update. Must be a fully qualified lakeFS URI.\n    truncate: bool\n        Whether to set the file size to 0 (zero) bytes, even if the path already exists.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n\n    Raises\n    ------\n    NotImplementedError\n        If the targeted lakeFS server version does not support `touch()` operations.\n    \"\"\"\n\n    # empty buffer upload errors were fixed in https://github.com/treeverse/lakeFS/issues/7130,\n    # which was first released in lakeFS v1.3.1.\n    if self._lakefs_server_version < (1, 3, 1):\n        version_string = \".\".join(str(v) for v in self._lakefs_server_version)\n        raise NotImplementedError(\n            \"LakeFSFileSystem.touch() is not supported for your lakeFS server version. \"\n            f\"minimum required version: '1.3.1', actual version: {version_string!r}\"\n        )\n\n    super().touch(path=path, truncate=truncate, **kwargs)\n
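A sketch of creating an empty placeholder object (the path is a placeholder; requires a lakeFS server >= 1.3.1 as noted above):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nfs.touch(\"repo/main/data/.keep\")\n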
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.tail","title":"tail","text":"
tail(path: str | PathLike[str], size: int = 1024) -> bytes\n

Get the last size bytes from a remote file.

PARAMETER DESCRIPTION path

The file path to read. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

size

The number of bytes to read from the end of the file.

TYPE: int DEFAULT: 1024

RETURNS DESCRIPTION bytes

The bytes at the end of the requested file.

Source code in src/lakefs_spec/spec.py
def tail(self, path: str | os.PathLike[str], size: int = 1024) -> bytes:\n    \"\"\"\n    Get the last ``size`` bytes from a remote file.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The file path to read. Must be a fully qualified lakeFS URI.\n    size: int\n        The number of bytes to read from the end of the file.\n\n    Returns\n    -------\n    bytes\n        The bytes at the end of the requested file.\n    \"\"\"\n    f: ObjectReader\n    with self.open(path, \"rb\") as f:\n        f.seek(max(-size, -f._obj.stat().size_bytes), 2)\n        return f.read()\n
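A sketch of reading the trailing bytes of an object (the path is a placeholder):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nlast_kb = fs.tail(\"repo/main/logs/run.log\", size=1024)\n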
"},{"location":"reference/lakefs_spec/transaction/","title":"transaction","text":"

Functionality for extended lakeFS transactions to conduct versioning operations between file uploads.

"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction","title":"LakeFSTransaction","text":"

Bases: Transaction

A lakeFS transaction model capable of versioning operations in between file uploads.

PARAMETER DESCRIPTION fs

The lakeFS file system associated with the transaction.

TYPE: 'LakeFSFileSystem'

Source code in src/lakefs_spec/transaction.py
class LakeFSTransaction(Transaction):\n    \"\"\"\n    A lakeFS transaction model capable of versioning operations in between file uploads.\n\n    Parameters\n    ----------\n    fs: LakeFSFileSystem\n        The lakeFS file system associated with the transaction.\n    \"\"\"\n\n    def __init__(\n        self,\n        fs: \"LakeFSFileSystem\",\n    ):\n        super().__init__(fs=fs)\n        self.fs: \"LakeFSFileSystem\"\n        self.files: deque[ObjectWriter] = deque(self.files)\n\n        self.repository: str | None = None\n        self.base_branch: Branch | None = None\n        self.automerge: bool = False\n        self.delete: Literal[\"onsuccess\", \"always\", \"never\"] = \"onsuccess\"\n        self._ephemeral_branch: Branch | None = None\n\n    def __call__(\n        self,\n        repository: str | Repository,\n        base_branch: str | Branch = \"main\",\n        branch_name: str | None = None,\n        automerge: bool = True,\n        delete: Literal[\"onsuccess\", \"always\", \"never\"] = \"onsuccess\",\n    ) -> \"LakeFSTransaction\":\n        \"\"\"\n        Creates an ephemeral branch, conducts all uploads and operations on that branch,\n        and optionally merges it back into the source branch.\n\n        repository: str | Repository\n            The repository in which to conduct the transaction.\n        base_branch: str | Branch\n            The branch on which the transaction operations should be based.\n        automerge: bool\n            Automatically merge the ephemeral branch into the base branch after successful\n            transaction completion.\n        delete: Literal[\"onsuccess\", \"always\", \"never\"]\n            Cleanup policy / deletion handling for the ephemeral branch after the transaction.\n\n            If ``\"onsuccess\"``, the branch is deleted if the transaction succeeded,\n            or left over if an error occurred.\n\n            If ``\"always\"``, the ephemeral branch is always deleted after transaction regardless of success\n            or failure.\n\n            If ``\"never\"``, the transaction branch is always left in the repository.\n        \"\"\"\n\n        if isinstance(repository, str):\n            self.repository = repository\n        else:\n            self.repository = repository.id\n\n        repo = lakefs.Repository(self.repository, client=self.fs.client)\n        try:\n            _ = repo.metadata\n        except ServerException:\n            raise ValueError(f\"repository {self.repository!r} does not exist\") from None\n\n        # base branch needs to be a lakefs.Branch, since it is being diffed\n        # with the ephemeral branch in __exit__.\n        self.base_branch = _ensurebranch(base_branch, self.repository, self.fs.client)\n\n        self.automerge = automerge\n        self.delete = delete\n\n        ephem_name = branch_name or \"transaction-\" + \"\".join(random.choices(string.digits, k=6))  # nosec: B311\n        self._ephemeral_branch = Branch(self.repository, ephem_name, client=self.fs.client)\n        return self\n\n    def __enter__(self):\n        logger.debug(\n            f\"Creating ephemeral branch {self._ephemeral_branch.id!r} \"\n            f\"from branch {self.base_branch.id!r}.\"\n        )\n        self._ephemeral_branch.create(self.base_branch, exist_ok=False)\n        self.fs._intrans = True\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        success = exc_type is None\n        while self.files:\n            # fsspec base class calls `append` on the file, 
which means we\n            # have to pop from the left to preserve order.\n            f = self.files.popleft()\n            if not success:\n                f.discard()\n\n        self.fs._intrans = False\n        self.fs._transaction = None\n\n        if any(self._ephemeral_branch.uncommitted()):\n            msg = f\"Finished transaction on branch {self._ephemeral_branch.id!r} with uncommitted changes.\"\n            if self.delete != \"never\":\n                msg += \" Objects added but not committed are lost.\"\n            warnings.warn(msg)\n\n        if success and self.automerge:\n            if any(self.base_branch.diff(self._ephemeral_branch)):\n                self._ephemeral_branch.merge_into(self.base_branch)\n        if self.delete == \"always\" or (success and self.delete == \"onsuccess\"):\n            self._ephemeral_branch.delete()\n\n    @property\n    def branch(self):\n        return self._ephemeral_branch\n\n    def commit(self, message: str, metadata: dict[str, str] | None = None) -> Reference:\n        \"\"\"\n        Create a commit on this transaction's ephemeral branch with a commit message\n        and attached metadata.\n\n        Parameters\n        ----------\n        message: str\n            The commit message to attach to the newly created commit.\n        metadata: dict[str, str] | None\n            Optional metadata to enrich the created commit with (author, e-mail, ...).\n\n        Returns\n        -------\n        Reference\n            The created commit.\n        \"\"\"\n\n        diff = list(self.branch.uncommitted())\n\n        if not diff:\n            logger.warning(f\"No changes to commit on branch {self.branch.id!r}.\")\n            return self.branch.head\n\n        return self.branch.commit(message, metadata=metadata)\n\n    def merge(self, source_ref: str | Branch, into: str | Branch) -> Commit:\n        \"\"\"\n        Merge a branch into another branch in a repository.\n\n        In case the branch contains no changes relevant to the target branch,\n        no merge happens, and the tip of the target branch is returned instead.\n\n        Parameters\n        ----------\n        source_ref: str | Branch\n            Source reference containing the changes to merge.\n            Can be a branch name or partial commit SHA.\n        into: str | Branch\n            Target branch into which the changes will be merged.\n\n        Returns\n        -------\n        Commit\n            Either the created merge commit, or the head commit of the target branch.\n        \"\"\"\n        source = _ensurebranch(source_ref, self.repository, self.fs.client)\n        dest = _ensurebranch(into, self.repository, self.fs.client)\n\n        if any(dest.diff(source)):\n            source.merge_into(dest)\n        return dest.head.get_commit()\n\n    def revert(self, branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit:\n        \"\"\"\n        Revert a previous commit on a branch.\n\n        Parameters\n        ----------\n        branch: str | Branch\n            Branch on which the commit should be reverted.\n        ref: ReferenceType\n            The reference to revert.\n        parent_number: int\n            If there are multiple parents to a commit, specify to which parent\n            the commit should be reverted. 
``parent_number = 1`` (the default)\n            refers to the first parent commit of the current ``branch`` tip.\n\n        Returns\n        -------\n        Commit\n            The created revert commit.\n        \"\"\"\n\n        b = _ensurebranch(branch, self.repository, self.fs.client)\n\n        ref_id = ref if isinstance(ref, str) else ref.id\n        b.revert(ref_id, parent_number=parent_number)\n        return b.head.get_commit()\n\n    def rev_parse(self, ref: ReferenceType) -> Commit:\n        \"\"\"\n        Parse a given lakeFS reference expression and obtain its corresponding commit.\n\n        Parameters\n        ----------\n        ref: ReferenceType\n            Reference object to resolve, can be a branch, commit SHA, or tag.\n\n        Returns\n        -------\n        Commit\n            The commit referenced by the expression ``ref``.\n        \"\"\"\n\n        ref_id = ref.id if isinstance(ref, Reference) else ref\n        reference = lakefs.Reference(self.repository, ref_id, client=self.fs.client)\n        return reference.get_commit()\n\n    def tag(self, ref: ReferenceType, name: str) -> Tag:\n        \"\"\"\n        Create a tag referencing a commit in a repository.\n\n        Parameters\n        ----------\n        ref: ReferenceType\n            Commit SHA or placeholder for a reference or commit object\n            to which the new tag will point.\n        name: str\n            Name of the tag to be created.\n\n        Returns\n        -------\n        Tag\n            The requested tag.\n        \"\"\"\n\n        return lakefs.Tag(self.repository, name, client=self.fs.client).create(ref)\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.commit","title":"commit","text":"
commit(message: str, metadata: dict[str, str] | None = None) -> Reference\n

Create a commit on this transaction's ephemeral branch with a commit message and attached metadata.

PARAMETER DESCRIPTION message

The commit message to attach to the newly created commit.

TYPE: str

metadata

Optional metadata to enrich the created commit with (author, e-mail, ...).

TYPE: dict[str, str] | None DEFAULT: None

RETURNS DESCRIPTION Reference

The created commit.

Source code in src/lakefs_spec/transaction.py
def commit(self, message: str, metadata: dict[str, str] | None = None) -> Reference:\n    \"\"\"\n    Create a commit on this transaction's ephemeral branch with a commit message\n    and attached metadata.\n\n    Parameters\n    ----------\n    message: str\n        The commit message to attach to the newly created commit.\n    metadata: dict[str, str] | None\n        Optional metadata to enrich the created commit with (author, e-mail, ...).\n\n    Returns\n    -------\n    Reference\n        The created commit.\n    \"\"\"\n\n    diff = list(self.branch.uncommitted())\n\n    if not diff:\n        logger.warning(f\"No changes to commit on branch {self.branch.id!r}.\")\n        return self.branch.head\n\n    return self.branch.commit(message, metadata=metadata)\n
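A minimal usage sketch (repository, branch, and file names here are placeholders, and fs is assumed to be an existing LakeFSFileSystem instance):

with fs.transaction(\"my-repo\", \"main\") as tx:\n    fs.put(\"train.csv\", f\"my-repo/{tx.branch.id}/train.csv\")\n    tx.commit(message=\"Add training data\")\n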
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.merge","title":"merge","text":"
merge(source_ref: str | Branch, into: str | Branch) -> Commit\n

Merge a branch into another branch in a repository.

In case the branch contains no changes relevant to the target branch, no merge happens, and the tip of the target branch is returned instead.

PARAMETER DESCRIPTION source_ref

Source reference containing the changes to merge. Can be a branch name or partial commit SHA.

TYPE: str | Branch

into

Target branch into which the changes will be merged.

TYPE: str | Branch

RETURNS DESCRIPTION Commit

Either the created merge commit, or the head commit of the target branch.

Source code in src/lakefs_spec/transaction.py
def merge(self, source_ref: str | Branch, into: str | Branch) -> Commit:\n    \"\"\"\n    Merge a branch into another branch in a repository.\n\n    In case the branch contains no changes relevant to the target branch,\n    no merge happens, and the tip of the target branch is returned instead.\n\n    Parameters\n    ----------\n    source_ref: str | Branch\n        Source reference containing the changes to merge.\n        Can be a branch name or partial commit SHA.\n    into: str | Branch\n        Target branch into which the changes will be merged.\n\n    Returns\n    -------\n    Commit\n        Either the created merge commit, or the head commit of the target branch.\n    \"\"\"\n    source = _ensurebranch(source_ref, self.repository, self.fs.client)\n    dest = _ensurebranch(into, self.repository, self.fs.client)\n\n    if any(dest.diff(source)):\n        source.merge_into(dest)\n    return dest.head.get_commit()\n
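A usage sketch with automerge disabled, so the merge back into the base branch is triggered explicitly (repository and file names are placeholders):

with fs.transaction(\"my-repo\", \"main\", automerge=False) as tx:\n    fs.put(\"train.csv\", f\"my-repo/{tx.branch.id}/train.csv\")\n    tx.commit(message=\"Add training data\")\n    tx.merge(tx.branch, into=\"main\")\n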
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.revert","title":"revert","text":"
revert(branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit\n

Revert a previous commit on a branch.

PARAMETER DESCRIPTION branch

Branch on which the commit should be reverted.

TYPE: str | Branch

ref

The reference to revert.

TYPE: ReferenceType

parent_number

If there are multiple parents to a commit, specify to which parent the commit should be reverted. parent_number = 1 (the default) refers to the first parent commit of the current branch tip.

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION Commit

The created revert commit.

Source code in src/lakefs_spec/transaction.py
def revert(self, branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit:\n    \"\"\"\n    Revert a previous commit on a branch.\n\n    Parameters\n    ----------\n    branch: str | Branch\n        Branch on which the commit should be reverted.\n    ref: ReferenceType\n        The reference to revert.\n    parent_number: int\n        If there are multiple parents to a commit, specify to which parent\n        the commit should be reverted. ``parent_number = 1`` (the default)\n        refers to the first parent commit of the current ``branch`` tip.\n\n    Returns\n    -------\n    Commit\n        The created revert commit.\n    \"\"\"\n\n    b = _ensurebranch(branch, self.repository, self.fs.client)\n\n    ref_id = ref if isinstance(ref, str) else ref.id\n    b.revert(ref_id, parent_number=parent_number)\n    return b.head.get_commit()\n
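A usage sketch that undoes the commit it just created on the transaction branch (repository and file names are placeholders):

with fs.transaction(\"my-repo\", \"main\") as tx:\n    fs.rm(f\"my-repo/{tx.branch.id}/obsolete.csv\")\n    tx.commit(message=\"Remove obsolete file\")\n    tx.revert(tx.branch, ref=tx.branch.head)  # roll the removal back again\n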
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.rev_parse","title":"rev_parse","text":"
rev_parse(ref: ReferenceType) -> Commit\n

Parse a given lakeFS reference expression and obtain its corresponding commit.

PARAMETER DESCRIPTION ref

Reference object to resolve, can be a branch, commit SHA, or tag.

TYPE: ReferenceType

RETURNS DESCRIPTION Commit

The commit referenced by the expression ref.

Source code in src/lakefs_spec/transaction.py
def rev_parse(self, ref: ReferenceType) -> Commit:\n    \"\"\"\n    Parse a given lakeFS reference expression and obtain its corresponding commit.\n\n    Parameters\n    ----------\n    ref: ReferenceType\n        Reference object to resolve, can be a branch, commit SHA, or tag.\n\n    Returns\n    -------\n    Commit\n        The commit referenced by the expression ``ref``.\n    \"\"\"\n\n    ref_id = ref.id if isinstance(ref, Reference) else ref\n    reference = lakefs.Reference(self.repository, ref_id, client=self.fs.client)\n    return reference.get_commit()\n
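A usage sketch resolving a ref expression (the tilde suffix selects a parent commit, analogous to git; names are placeholders):

with fs.transaction(\"my-repo\", \"main\") as tx:\n    commit = tx.rev_parse(\"main~1\")  # parent of the current main tip\n    print(commit.id, commit.message)\n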
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.tag","title":"tag","text":"
tag(ref: ReferenceType, name: str) -> Tag\n

Create a tag referencing a commit in a repository.

PARAMETER DESCRIPTION ref

Commit SHA or placeholder for a reference or commit object to which the new tag will point.

TYPE: ReferenceType

name

Name of the tag to be created.

TYPE: str

RETURNS DESCRIPTION Tag

The requested tag.

Source code in src/lakefs_spec/transaction.py
def tag(self, ref: ReferenceType, name: str) -> Tag:\n    \"\"\"\n    Create a tag referencing a commit in a repository.\n\n    Parameters\n    ----------\n    ref: ReferenceType\n        Commit SHA or placeholder for a reference or commit object\n        to which the new tag will point.\n    name: str\n        Name of the tag to be created.\n\n    Returns\n    -------\n    Tag\n        The requested tag.\n    \"\"\"\n\n    return lakefs.Tag(self.repository, name, client=self.fs.client).create(ref)\n
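A usage sketch tagging the current head commit of the base branch (repository and tag names are placeholders):

with fs.transaction(\"my-repo\", \"main\") as tx:\n    commit = tx.rev_parse(\"main\")\n    tag = tx.tag(commit.id, name=\"v1.0.0\")\n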
"},{"location":"reference/lakefs_spec/util/","title":"util","text":"

Useful utilities for handling lakeFS URIs and results of lakeFS API calls.

"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.depaginate","title":"depaginate","text":"
depaginate(\n    api: Callable[..., PaginatedApiResponse], *args: Any, **kwargs: Any\n) -> Generator[Any, None, None]\n

Unwrap the responses from a paginated lakeFS API method into a generator.

PARAMETER DESCRIPTION api

The lakeFS client API to call. Must return a paginated response with the pagination and results fields set.

TYPE: Callable[..., PaginatedApiResponse]

*args

Positional arguments to pass to the API call.

TYPE: Any DEFAULT: ()

**kwargs

Keyword arguments to pass to the API call.

TYPE: Any DEFAULT: {}

YIELDS DESCRIPTION Any

The obtained API result objects.

Source code in src/lakefs_spec/util.py
def depaginate(\n    api: Callable[..., PaginatedApiResponse], *args: Any, **kwargs: Any\n) -> Generator[Any, None, None]:\n    \"\"\"\n    Unwrap the responses from a paginated lakeFS API method into a generator.\n\n    Parameters\n    ----------\n    api: Callable[..., PaginatedApiResponse]\n        The lakeFS client API to call. Must return a paginated response with the ``pagination`` and ``results`` fields set.\n    *args: Any\n        Positional arguments to pass to the API call.\n    **kwargs: Any\n        Keyword arguments to pass to the API call.\n\n    Yields\n    ------\n    Any\n        The obtained API result objects.\n    \"\"\"\n    while True:\n        resp = api(*args, **kwargs)\n        yield from resp.results\n        if not resp.pagination.has_more:\n            break\n        kwargs[\"after\"] = resp.pagination.next_offset\n
"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.md5_checksum","title":"md5_checksum","text":"
md5_checksum(lpath: str | PathLike[str], blocksize: int = 2 ** 22) -> str\n

Calculate a local file's MD5 hash.

PARAMETER DESCRIPTION lpath

The local path whose MD5 hash to calculate. Must be a file.

TYPE: str | PathLike[str]

blocksize

Block size (in bytes) to use while reading in the file.

TYPE: int DEFAULT: 2 ** 22

RETURNS DESCRIPTION str

The file's MD5 hash value, as a string.

Source code in src/lakefs_spec/util.py
def md5_checksum(lpath: str | os.PathLike[str], blocksize: int = 2**22) -> str:\n    \"\"\"\n    Calculate a local file's MD5 hash.\n\n    Parameters\n    ----------\n    lpath: str | os.PathLike[str]\n        The local path whose MD5 hash to calculate. Must be a file.\n    blocksize: int\n        Block size (in bytes) to use while reading in the file.\n\n    Returns\n    -------\n    str\n        The file's MD5 hash value, as a string.\n    \"\"\"\n    with open(lpath, \"rb\") as f:\n        file_hash = hashlib.md5(usedforsecurity=False)\n        chunk = f.read(blocksize)\n        while chunk:\n            file_hash.update(chunk)\n            chunk = f.read(blocksize)\n    return file_hash.hexdigest()\n
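A usage sketch (the file path is a placeholder):

from lakefs_spec.util import md5_checksum\n\nchecksum = md5_checksum(\"train.csv\", blocksize=2**22)\nprint(checksum)\n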
"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.parse","title":"parse","text":"
parse(path: str) -> tuple[str, str, str]\n

Parses a lakeFS URI in the form lakefs://<repo>/<ref>/<resource>.

PARAMETER DESCRIPTION path

String path, needs to conform to the lakeFS URI format described above. The <resource> part can be the empty string; the leading lakefs:// scheme may be omitted.

TYPE: str

RETURNS DESCRIPTION tuple[str, str, str]

A 3-tuple of repository name, reference, and resource name.

RAISES DESCRIPTION ValueError

If the path does not conform to the lakeFS URI format.

Source code in src/lakefs_spec/util.py
def parse(path: str) -> tuple[str, str, str]:\n    \"\"\"\n    Parses a lakeFS URI in the form ``lakefs://<repo>/<ref>/<resource>``.\n\n    Parameters\n    ----------\n    path: str\n        String path, needs to conform to the lakeFS URI format described above.\n        The ``<resource>`` part can be the empty string; the leading ``lakefs://`` scheme may be omitted.\n\n    Returns\n    -------\n    tuple[str, str, str]\n        A 3-tuple of repository name, reference, and resource name.\n\n    Raises\n    ------\n    ValueError\n        If the path does not conform to the lakeFS URI format.\n    \"\"\"\n\n    # First regex reflects the lakeFS repository naming rules:\n    # only lowercase letters, digits and dash, no leading dash, minimum 3, maximum 63 characters\n    # https://docs.lakefs.io/understand/model.html#repository\n    # Second regex is the branch: Only letters, digits, underscores and dash, no leading dash.\n    path_regex = re.compile(r\"(?:lakefs://)?([a-z0-9][a-z0-9\\-]{2,62})/(\\w[\\w\\-]*)/(.*)\")\n    results = path_regex.fullmatch(path)\n    if results is None:\n        raise ValueError(\n            f\"expected path with structure lakefs://<repo>/<ref>/<resource>, got {path!r}\"\n        )\n\n    repo, ref, resource = results.groups()\n    return repo, ref, resource\n
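A usage sketch:

from lakefs_spec.util import parse\n\nrepo, ref, resource = parse(\"lakefs://my-repo/main/data/weather.csv\")\n# repo == \"my-repo\", ref == \"main\", resource == \"data/weather.csv\"\n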
"},{"location":"tutorials/","title":"Tutorials","text":"

Info

We aim to provide additional tutorials in the future - contributions are welcome!

  • Quickstart example: Using lakeFS-spec as a file system
  • A fully-worked data science example: Using lakeFS-spec together with Pandas to train a classifier based on a public dataset and simulate additional data being collected
"},{"location":"tutorials/demo_data_science_project/","title":"Data Science with lakeFS-spec","text":"
%pip install numpy pandas scikit-learn\n
\nCollecting numpy\n\n
\n  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n\n
\nCollecting pandas\n  Downloading pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)\n\n
\nCollecting scikit-learn\n  Downloading scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n\n
\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from pandas) (2.8.2)\nRequirement already satisfied: pytz>=2020.1 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from pandas) (2024.1)\nCollecting tzdata>=2022.7 (from pandas)\n  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)\n\n
\nCollecting scipy>=1.6.0 (from scikit-learn)\n  Downloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n\n
\nCollecting joblib>=1.2.0 (from scikit-learn)\n  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\n\n
\nCollecting threadpoolctl>=2.0.0 (from scikit-learn)\n  Downloading threadpoolctl-3.3.0-py3-none-any.whl.metadata (13 kB)\nRequirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n\n
\nDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)\n\n
\nDownloading pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)\n\n
\nDownloading scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\nDownloading joblib-1.3.2-py3-none-any.whl (302 kB)\nDownloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)\nDownloading threadpoolctl-3.3.0-py3-none-any.whl (17 kB)\n\n
\nDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)\n\n
\nInstalling collected packages: tzdata, threadpoolctl, numpy, joblib, scipy, pandas, scikit-learn\n\n
\nSuccessfully installed joblib-1.3.2 numpy-1.26.4 pandas-2.2.0 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0 tzdata-2024.1\n\n
\nNote: you may need to restart the kernel to use updated packages.\n\n

Also install an appropriate lakeFS-spec version, which can be either the latest release from PyPI via pip install --upgrade lakefs-spec, or the development version from GitHub via pip install git+https://github.com/aai-institute/lakefs-spec.git.

import os\nimport tempfile\nimport urllib.request\nfrom pathlib import Path\n\nurllib.request.urlretrieve(\n    \"https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/docs/tutorials/.lakectl.yaml\",\n    os.path.expanduser(\"~/.lakectl.yaml\"),\n)\n
\n('/home/runner/.lakectl.yaml', <http.client.HTTPMessage at 0x7fd726d97590>)\n

We can now instantiate the LakeFSFileSystem with the credentials we just downloaded. Alternatively, we could have passed the credentials directly in the code. It is important that the credentials are available at the time of filesystem instantiation.
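For illustration, passing the credentials explicitly might look like the following sketch; host, username, and password are the constructor arguments assumed here, and the values are the lakeFS quickstart defaults rather than real secrets:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem(\n    host=\"http://127.0.0.1:8000\",\n    username=\"AKIAIOSFOLQUICKSTART\",\n    password=\"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\",\n)\n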

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nREPO_NAME = \"weather\"\n

We will create a repository through the lakefs SDK, using the client attached to the lakeFS file system. If you have already created one in the UI, make sure to set the REPO_NAME variable accordingly in the cell directly above.

import lakefs\n\nrepo = lakefs.Repository(REPO_NAME, fs.client).create(storage_namespace=f\"local://{REPO_NAME}\")\n
def _maybe_urlretrieve(url: str, filename: str) -&gt; str:\n    # Avoid API rate limit errors by downloading to a fixed local location\n    destination = Path(tempfile.gettempdir()) / \"lakefs-spec-tutorials\" / filename\n    destination.parent.mkdir(exist_ok=True, parents=True)\n    if destination.exists():\n        return str(destination)\n\n    outfile, _ = urllib.request.urlretrieve(url, str(destination))\n    return outfile\n\n\noutfile = _maybe_urlretrieve(\n    \"https://archive-api.open-meteo.com/v1/archive?latitude=52.52&amp;longitude=13.41&amp;start_date=2010-01-01&amp;end_date=2010-12-31&amp;hourly=temperature_2m,relativehumidity_2m,rain,pressure_msl,surface_pressure,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m\",\n    \"weather-2010.json\",\n)\n

The data is in JSON format. Therefore, we need to wrangle the data a bit to make it usable. But first, we will upload it to our lakeFS instance.

NEW_BRANCH = lakefs.Branch(REPO_NAME, \"transform-raw-data\", client=fs.client)\nNEW_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, NEW_BRANCH) as tx:\n    fs.put(outfile, f\"{REPO_NAME}/{tx.branch.id}/weather-2010.json\")\n    tx.commit(message=\"Add 2010 weather data\")\n

You can inspect this commit by selecting the transform-raw-data branch, and navigating to the Commits tab.
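If you prefer to stay in code, a sketch of checking the newest commit on the branch could use the lakefs SDK's log method (assuming it is available on the Branch object with a max_amount argument):

for commit in NEW_BRANCH.log(max_amount=1):\n    print(commit.message)  # \"Add 2010 weather data\"\n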

import json\n\nimport pandas as pd\n\n\ndef transform_json_weather_data(filepath):\n    if hasattr(filepath, \"close\") and hasattr(filepath, \"tell\"):\n        data = json.load(filepath)\n    else:\n        with open(filepath, \"r\") as f:\n            data = json.load(f)\n\n    df = pd.DataFrame.from_dict(data[\"hourly\"])\n    df.time = pd.to_datetime(df.time)\n    df[\"is_raining\"] = df.rain &gt; 0\n    df[\"is_raining_in_1_day\"] = df.is_raining.shift(24).astype(bool)\n    df = df.dropna()\n    return df\n\n\ndf = transform_json_weather_data(outfile)\ndf.head(5)\n
\n/tmp/ipykernel_2291/2823322696.py:3: DeprecationWarning: \nPyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\nbut was not found to be installed on your system.\nIf this would cause problems for you,\nplease provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n\n  import pandas as pd\n\n
time temperature_2m relativehumidity_2m rain pressure_msl surface_pressure cloudcover cloudcover_low cloudcover_mid cloudcover_high windspeed_10m windspeed_100m winddirection_10m winddirection_100m is_raining is_raining_in_1_day 0 2010-01-01 00:00:00 -2.6 88 0.0 996.9 992.1 100 100 97 75 16.0 27.2 54 58 False True 1 2010-01-01 01:00:00 -2.7 88 0.0 996.4 991.6 100 99 96 49 16.3 28.0 55 58 False True 2 2010-01-01 02:00:00 -2.7 88 0.0 996.2 991.4 100 96 94 60 16.3 27.5 55 58 False True 3 2010-01-01 03:00:00 -2.7 88 0.0 996.1 991.3 100 97 96 83 15.4 26.6 53 57 False True 4 2010-01-01 04:00:00 -2.7 88 0.0 996.0 991.2 100 92 98 82 14.8 25.6 47 52 False True

Next, we save this data as a CSV file to the main branch. When the transaction commit helper is called, the newly uploaded CSV file is committed. You can verify that the upload worked in the lakeFS UI in your browser by switching to the Commits tab of the main branch.

with fs.transaction(REPO_NAME, \"main\") as tx:\n    df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/weather_2010.csv\")\n    tx.commit(message=\"Update weather data\")\n
import sklearn.model_selection\n\nmodel_data = df.drop(\"time\", axis=1)\n\ntrain, test = sklearn.model_selection.train_test_split(model_data, random_state=7)\n

We save these train and test datasets to a new training branch. If the branch does not exist yet, as in this case, it is implicitly created by default. You can control this behaviour with the create_branch_ok flag when initializing the LakeFSFileSystem. Since create_branch_ok defaults to True, instantiating fs = LakeFSFileSystem() is all that is needed to enable implicit branch creation.
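As a sketch of the implicit behaviour with a plain write outside of a transaction (the branch name is a placeholder, and the source branch for implicitly created branches is assumed to default to main):

fs = LakeFSFileSystem(create_branch_ok=True)  # True is already the default\n# Writing to a branch that does not exist yet creates it on the fly:\ntrain.to_csv(f\"lakefs://{REPO_NAME}/scratch/train_weather.csv\")\n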

TRAINING_BRANCH = lakefs.Branch(REPO_NAME, \"training\", client=fs.client)\nTRAINING_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, TRAINING_BRANCH) as tx:\n    train.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/train_weather.csv\")\n    test.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/test_weather.csv\")\n    tx.commit(message=\"Add train-test split of 2010 weather data\")\n

Let's check the shape of train and test data. Later on, we will get back to this data version and reproduce the results of the experiment.

print(f\"Initial train data shape: {train.shape}\")\nprint(f\"Initial test data shape: {test.shape}\")\n
\nInitial train data shape: (6570, 15)\nInitial test data shape: (2190, 15)\n\n

We now proceed to train a decision tree classifier and evaluate it on the test set:

from sklearn.tree import DecisionTreeClassifier\n\ndependent_variable = \"is_raining_in_1_day\"\n\nmodel = DecisionTreeClassifier(random_state=7)\n\nx_train, y_train = train.drop(dependent_variable, axis=1), train[dependent_variable].astype(bool)\nx_test, y_test = test.drop(dependent_variable, axis=1), test[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 87.31%\n\n
outfile = _maybe_urlretrieve(\n    \"https://archive-api.open-meteo.com/v1/archive?latitude=52.52&amp;longitude=13.41&amp;start_date=2020-01-01&amp;end_date=2020-12-31&amp;hourly=temperature_2m,relativehumidity_2m,rain,pressure_msl,surface_pressure,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m\",\n    \"weather-2020.json\",\n)\n\nnew_data = transform_json_weather_data(outfile)\n\nwith fs.transaction(REPO_NAME, \"main\") as tx:\n    new_data.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/weather_2020.csv\")\n    tx.commit(message=\"Add 2020 weather data\")\n\n# Remove leftover temporary files from previous `urlretrieve` calls\nurllib.request.urlcleanup()\n

Let's concatenate the old data and the new data, create a new train-test split, and push the updated files to lakeFS:

new_data = new_data.drop(\"time\", axis=1)\nfull_data = pd.concat([new_data, train, test])\n\ntrain_df, test_df = sklearn.model_selection.train_test_split(full_data, random_state=7)\n\nprint(f\"Updated train data shape: {train_df.shape}\")\nprint(f\"Updated test data shape: {test_df.shape}\")\n\nwith fs.transaction(REPO_NAME, TRAINING_BRANCH) as tx:\n    train_df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/train_weather.csv\")\n    test_df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/test_weather.csv\")\n    tx.commit(message=\"Add train-test split of 2010 and 2020 data\")\n
\nUpdated train data shape: (13158, 15)\nUpdated test data shape: (4386, 15)\n\n

Now, we train the model on the new data and validate on the new test data.

x_train, y_train = (\n    train_df.drop(dependent_variable, axis=1),\n    train_df[dependent_variable].astype(bool),\n)\nx_test, y_test = test_df.drop(dependent_variable, axis=1), test_df[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\n\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 82.67%\n\n
# access the data of the previous commit with a lakefs ref expression, in this case the same as in git.\nprevious_commit = repo.ref(f\"{TRAINING_BRANCH.id}~\").get_commit()\nfixed_commit_id = previous_commit.id\nprint(fixed_commit_id)\n
\n240be1477daa4fd3df6b6621be4398d480424683a09e65a8e0664c0a9e79f496\n\n

Let's check whether this commit SHA indeed gives us the initial train and test data by comparing the downloaded files to the data we still have in memory:

orig_train = pd.read_csv(f\"lakefs://{REPO_NAME}/{fixed_commit_id}/train_weather.csv\", index_col=0)\norig_test = pd.read_csv(f\"lakefs://{REPO_NAME}/{fixed_commit_id}/test_weather.csv\", index_col=0)\n\nprint(f\"Is the pulled training data equal to the local training data? {train.equals(orig_train)}\")\nprint(f\"Is the pulled test data equal to the local test data? {test.equals(orig_test)}\")\n
\nIs the pulled training data equal to the local training data? True\nIs the pulled test data equal to the local test data? True\n\n

Let's train and validate the model again based on the redownloaded data and see if we manage to reproduce the initial accuracy.

x_train, y_train = train.drop(dependent_variable, axis=1), train[dependent_variable].astype(bool)\nx_test, y_test = test.drop(dependent_variable, axis=1), test[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\n\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 87.31%\n\n
with fs.transaction(REPO_NAME, \"main\") as tx:\n    # returns the tag as a lakeFS object.\n    tag = tx.tag(fixed_commit_id, name=\"train-test-split-2010\")\n

Now we can access the specific files with the semantic tag. Both the fixed_commit_id and tag reference the same version ref in lakeFS, whereas a branch name always points to the latest version on that respective branch.

train_from_commit = pd.read_csv(\n    f\"lakefs://{REPO_NAME}/{fixed_commit_id}/train_weather.csv\", index_col=0\n)\ntrain_from_tag = pd.read_csv(f\"lakefs://{REPO_NAME}/{tag.id}/train_weather.csv\", index_col=0)\n

We can verify this by comparing the DataFrames: train_from_commit and train_from_tag are indeed equal.

print(\n    f\"Is the data tagged {tag!r} equal to the data in commit {fixed_commit_id[:8]}? {train_from_commit.equals(train_from_tag)}\"\n)\n
\nIs the data tagged Tag(repository=\"weather\", id=\"train-test-split-2010\") equal to the data in commit 240be147? True\n\n
"},{"location":"tutorials/demo_data_science_project/#data-science-with-lakefs-spec","title":"Data Science with lakeFS-spec","text":"

In this notebook, we will complete a small end-to-end data science tutorial that employs lakeFS-spec for data versioning. We will use versioned weather data to train a decision tree classifier to predict whether it is raining tomorrow given the current weather.

We will do the following:

  • Environment setup
  • LakeFS setup
  • Authenticating with the lakeFS server
  • Data ingestion via transactions
  • Model training
  • Updating data and retraining a model
  • Accessing data versions and reproducing experiments
  • Using tags for semantic versioning

Local Execution

If you want to execute the code in this tutorial as a Jupyter notebook yourself, download the demo_data_science_project.py file from the lakeFS-spec repository.

You can then convert the Python file to a notebook with Jupytext by running the following command: jupytext --to notebook demo_data_science_project.py.

This tutorial assumes that you have installed lakeFS-spec in a virtual environment, and that you have followed the quickstart guide to set up a local lakeFS instance.

"},{"location":"tutorials/demo_data_science_project/#environment-setup","title":"Environment setup","text":"

Install the necessary libraries for this notebook on the environment you have just created:

"},{"location":"tutorials/demo_data_science_project/#lakefs-setup","title":"lakeFS Setup","text":"

With Docker Desktop or a similar container runtime running, set up lakeFS by executing the following docker run command (from the lakeFS quickstart) in your console:

docker run --name lakefs --pull always --rm --publish 8000:8000 treeverse/lakefs:latest run --quickstart\n

You will find the authentication credentials in the terminal output. The default address for the local lakeFS web UI is http://localhost:8000/.

"},{"location":"tutorials/demo_data_science_project/#authenticating-with-the-lakefs-server","title":"Authenticating with the lakeFS server","text":"

There are multiple ways to authenticate with lakeFS from Python code. In this tutorial, we choose the YAML file configuration. By executing the cell below, you will download a YAML file containing the default lakeFS quickstart credentials and server URL to your user directory.

"},{"location":"tutorials/demo_data_science_project/#data-ingestion","title":"Data Ingestion","text":"

Now it's time to get some data. We will use the Open-Meteo API, which lets us pull historical weather data for free (for non-commercial use) and without an API token. In order to prevent hitting rate limits when repeatedly querying the API (and out of courtesy towards the operators of the API), the _maybe_urlretrieve function provides a simple local cache for the downloaded data.

For training our toy model, we download the full weather data of Munich for the year 2010:

"},{"location":"tutorials/demo_data_science_project/#upload-a-file-using-transactions","title":"Upload a file using transactions","text":"

lakeFS works similarly to git as a versioning system. You can create commits that contain specific changes to the data. You can also work with branches to create your own isolated view of the data, independently of your colleagues. Every commit (on any branch) is identified by a commit SHA. This SHA can be used to programmatically interact with specific states of your data and enables logging of the specific data versions used to create a certain model.

To easily carry out versioning operations while uploading files, you can use transactions. A transaction is a context manager that keeps track of all files that were uploaded in its scope, as well as all versioning operations happening between file uploads. All operations are deferred to the end of the transaction, and are executed sequentially on completion.

To create a commit after a file upload, you can run the following transaction:
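For reference, this is the same transaction as in the notebook cell shown earlier in this tutorial:

NEW_BRANCH = lakefs.Branch(REPO_NAME, \"transform-raw-data\", client=fs.client)\nNEW_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, NEW_BRANCH) as tx:\n    fs.put(outfile, f\"{REPO_NAME}/{tx.branch.id}/weather-2010.json\")\n    tx.commit(message=\"Add 2010 weather data\")\n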

"},{"location":"tutorials/demo_data_science_project/#data-transformation","title":"Data Transformation","text":"

Now let's transform the data for our use case. We put the transformation into a function to be able to reuse it later.

In this notebook, we use a simple toy model to predict whether it is raining at the same time tomorrow given weather data from right now.

We will skip a lot of possible feature engineering and other data science aspects in order to focus more on the application of the LakeFSFileSystem.

"},{"location":"tutorials/demo_data_science_project/#training-the-initial-weather-model","title":"Training the initial weather model","text":"

First we will do a train-test split:

"},{"location":"tutorials/demo_data_science_project/#updating-data-and-retraining-the-model","title":"Updating data and retraining the model","text":"

Until now, we only have used data from 2010. Let's download additional 2020 data, transform it, and save it to lakeFS.

"},{"location":"tutorials/demo_data_science_project/#accessing-data-versions-through-commits-and-reproducing-experiments","title":"Accessing data versions through commits and reproducing experiments","text":"

If we want to return to our initial data and reproduce the first experiment (the model trained on the 2010 data with its initial accuracy), we can go back in the commit history of the training branch and select the data snapshot of the appropriate commit. Since we have already created multiple commits on the same branch, we will address different data versions by their commit SHAs.

To obtain the actual commit SHA from a branch, we have multiple options. Manually, we could go into the lakeFS UI, select the training branch, and navigate to the Commits tab. There, we take the parent of the previous commit, titled Add train-test split of 2010 weather data, and copy its revision SHA (also called ID).

In code, we can obtain commit SHAs for different revisions on the training branch by using lakefs.Reference objects.
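For example, as in the notebook cell above, the parent of the current training branch tip can be resolved like this:

previous_commit = repo.ref(f\"{TRAINING_BRANCH.id}~\").get_commit()\nfixed_commit_id = previous_commit.id\n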

"},{"location":"tutorials/demo_data_science_project/#using-tags-instead-of-commit-shas-for-semantic-versioning","title":"Using tags instead of commit SHAs for semantic versioning","text":"

The above method for data versioning works great when you have experiment tracking tools to store and retrieve the commit SHA in automated pipelines, but retrieving SHAs by hand during manual prototyping can be tedious. We can make selected versions of the dataset more accessible with semantic versioning by attaching a human-interpretable tag to a specific commit SHA.

Creating a tag is easiest inside a transaction, just like the file uploads we carried out earlier. To do this, simply call tx.tag on the transaction and supply the commit SHA (or ref) to tag and the intended tag name. Tags are immutable once created, so attempting to tag two different commits with the same name will result in an error.
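For reference, the tag creation from the notebook cell above:

with fs.transaction(REPO_NAME, \"main\") as tx:\n    tag = tx.tag(fixed_commit_id, name=\"train-test-split-2010\")\n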

"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"

Welcome to lakeFS-spec, a filesystem-spec backend implementation for the lakeFS data lake. Our primary goal is to streamline versioned data operations in lakeFS, enabling seamless integration with popular data science tools such as Pandas, Polars, and DuckDB directly from Python.

Highlights:

  • Simple repository operations in lakeFS
  • Easy access to underlying storage and versioning operations
  • Seamless integration with the fsspec ecosystem
  • Directly access lakeFS objects from popular data science libraries (including Pandas, Polars, DuckDB, PyArrow) with minimal code
  • Transaction support for reliable data version control
  • Smart data transfers through client-side caching (up-/download)
  • Auto-discovery configuration

Early Adopters

We are seeking early adopters who would like to actively participate in our feedback process and shape the future of the library. If you are interested in using the library and want to get in touch with us, please reach out via GitHub Discussions.

Quickstart

Step-by-step installation and first operations

Tutorials

In-depth tutorials on using lakeFS-spec

API Reference

Full documentation of the Python API

User Guide

Solving specific tasks with lakeFS-spec

Contributing

How to contribute to the project

"},{"location":"CONTRIBUTING/","title":"Contributing to lakeFS-spec","text":"

Thank you for your interest in contributing to this project!

We appreciate issue reports, pull requests for code and documentation, as well as any project-related communication through GitHub Discussions.

"},{"location":"CONTRIBUTING/#getting-started","title":"Getting Started","text":"

To get started with development, you can follow these steps:

  1. Clone this repository:

    git clone https://github.com/aai-institute/lakefs-spec.git\n
  2. Navigate to the directory and install the development dependencies into a virtual environment:

    cd lakefs-spec\npython3 -m venv venv --system-site-packages\nsource venv/bin/activate\npython -m pip install -r requirements-dev.txt\npython -m pip install -e . --no-deps\n
  3. After making your changes, verify they adhere to our Python code style by running pre-commit:

    pre-commit run --all-files\n

    You can also set up Git hooks through pre-commit to perform these checks automatically:

    pre-commit install\n
  4. To run the tests against an ephemeral lakeFS instance, you just run pytest:

    pytest\n

    To spin up a local lakeFS instance quickly for testing, you can use the Docker Compose file bundled with this repository:

    docker-compose -f hack/docker-compose.yml up\n
"},{"location":"CONTRIBUTING/#updating-dependencies","title":"Updating dependencies","text":"

Dependencies should stay locked for as long as possible, ideally for a whole release. If you have to update a dependency during development, you should do the following:

  1. If it is a core dependency needed for the package, add it to the dependencies section in the pyproject.toml.
  2. In case of a development dependency, add it to the dev section of the project.optional-dependencies table instead.
  3. Dependencies needed for documentation generation are found in the docs sections of project.optional-dependencies.

After adding the dependency in either of these sections, run the helper script hack/lock-deps.sh (which in turn uses pip-compile) to pin all dependencies again:

python -m pip install --upgrade pip-tools\nhack/lock-deps.sh\n

In addition to these manual steps, we also provide pre-commit hooks that automatically lock the dependencies whenever pyproject.toml is changed.

Selective package upgrades for existing dependencies are also handled by the helper script above. If you want to update the lakefs-sdk dependency, for example, simply run:

hack/lock-deps.sh lakefs-sdk\n

Tip

Since the official development version is Python 3.11, please run the above commands in a virtual environment with Python 3.11.

"},{"location":"CONTRIBUTING/#working-on-documentation","title":"Working on Documentation","text":"

Improvements or additions to the project's documentation are highly appreciated.

The documentation is based on the MkDocs and Material for MkDocs (mkdocs-material) projects, see their homepages for in-depth guides on their features and usage. We use the Numpy documentation style for Python docstrings.

To build the documentation locally, you need to first install the optional docs dependencies from requirements-docs.txt, e.g., with pip install -r requirements-docs.txt. You can then start a local documentation server with mkdocs serve, or build the documentation into its output folder in public/.

In order to maintain documentation for multiple versions of this library, we use the mike tool, which automatically maintains individual documentation builds per version and publishes them to the gh-pages branch.

The GitHub CI pipeline automatically invokes mike as part of the release process with the correct version and updates the GitHub pages branch for the project.

"},{"location":"quickstart/","title":"Quickstart","text":"

Welcome! This quickstart guide will get you up and running with lakeFS-spec by showing you how to

  1. install the lakefs-spec package,
  2. spin up a local lakeFS server,
  3. create a lakeFS repository for experimentation, and
  4. perform basic file system operations in a lakeFS repository using lakeFS-spec.
Prerequisites

To follow along with this guide, you will need a few prerequisites ready on your machine:

  • lakeFS-spec supports Windows, macOS, or Linux
  • Docker, with Docker Compose
  • Python 3.9 or later
  • optionally, lakectl, the lakeFS command line tool

Please take a moment to make sure you have these tools available before proceeding with the next steps.

"},{"location":"quickstart/#installing-lakefs-spec","title":"Installing lakeFS-spec","text":"A note on virtual environments

We generally recommend installing the library in a virtual environment to ensure proper isolation, especially when following this quickstart guide.

If you are using Poetry, virtual environments can automatically be created by the tool.

If you prefer the venv functionality built into Python, see the official docs (tl;dr: python -m venv venv; source venv/bin/activate).

To install the package directly from PyPI, run:

pippoetry
pip install lakefs-spec\n
poetry add lakefs-spec\n

Or, if you want to try the latest pre-release version directly from GitHub:

pippoetry
pip install git+https://github.com/aai-institute/lakefs-spec.git\n
poetry add git+https://github.com/aai-institute/lakefs-spec.git\n
"},{"location":"quickstart/#first-steps","title":"First Steps","text":""},{"location":"quickstart/#spinning-up-a-local-lakefs-instance","title":"Spinning up a local lakeFS instance","text":"

Warning

This setup is not recommended for production use, since it does not store the data persistently.

Please check out the lakeFS docs for production-ready deployment options.

If you don't already have access to a lakeFS server, you can quickly start a local instance using Docker Compose. Before continuing, please make sure Docker is installed and running on your machine.

The lakeFS quickstart deployment can be launched directly with a configuration file provided in the lakeFS-spec repository:

$ curl https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/hack/docker-compose.yml | docker-compose -f - up\n

If you do not have curl installed on your machine or would like to examine and/or customize the container configuration, you can also create a docker-compose.yml file locally and use it with docker-compose up:

docker-compose.yml
version: \"3\"\n\nservices:\n  lakefs:\n    image: treeverse/lakefs:1.7.0\n    ports:\n      - 8000:8000\n    environment:\n      LAKEFS_INSTALLATION_USER_NAME: \"quickstart\"\n      LAKEFS_INSTALLATION_ACCESS_KEY_ID: \"AKIAIOSFOLQUICKSTART\"\n      LAKEFS_INSTALLATION_SECRET_ACCESS_KEY: \"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\"\n      LAKEFS_DATABASE_TYPE: \"local\"\n      LAKEFS_AUTH_ENCRYPT_SECRET_KEY: \"THIS_MUST_BE_CHANGED_IN_PRODUCTION\"\n      LAKEFS_BLOCKSTORE_TYPE: \"local\"\n

In order to allow lakeFS-spec to automatically discover credentials to access this lakeFS instance, create a .lakectl.yaml in your home directory containing the credentials for the quickstart environment (you can also use lakectl config to create this file interactively if you have the lakectl tool installed on your machine):

~/.lakectl.yaml
credentials: # (1)!\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n
  1. These must match the credentials set in the environment section of the Docker Compose file above

After the container has finished initializing, you can access the web UI of your local lakeFS deployment in your browser. Fill out the setup form, where you can optionally share your email address with the developers of lakeFS to receive updates on their product. Next, you can log into your fresh lakeFS instance with the credentials listed above.

Success

Your fresh local lakeFS instance is a playground for you to explore lakeFS functionality.

In the next step, we will create your first repository on this server.

"},{"location":"quickstart/#create-a-lakefs-repository","title":"Create a lakeFS repository","text":"

Once you have logged into the web UI of the lakeFS server for the first time, you can create an empty repository on the next page. Click the small Click here link at the bottom of the page to proceed and create a repository named repo (we don't want to add the sample data for this guide):

Tip: Creating a repository later

If you have inadvertently skipped over the quickstart repository creation page, you can always create a new repository on the Repositories tab in the lakeFS web UI (and optionally choose to add the sample data):

Success

You have successfully created a lakeFS repository named repo, ready to be used with lakeFS-spec.

"},{"location":"quickstart/#using-the-lakefs-file-system","title":"Using the lakeFS file system","text":"

We will now use the lakeFS-spec file system interface to perform some basic operations on the repository created in the previous step:

  • Upload a local file to the repository
  • Read data from a file in the repository
  • Make a commit
  • Fetch metadata about repository contents
  • Delete a file from the repository

To get started, create a file called quickstart.py with the following contents:

quickstart.py
from pathlib import Path\n\nfrom lakefs_spec import LakeFSFileSystem\n\nREPO, BRANCH = \"repo\", \"main\"\n\n# Prepare example local data\nlocal_path = Path(\"demo.txt\")\nlocal_path.write_text(\"Hello, lakeFS!\")\n

Tip

We will keep adding more code to this file as we progress through the next steps. Feel free to execute the script after each step and observe the effects as noted in the guide.

This code snippet prepares a file demo.txt on your machine, ready to be added to the lakeFS repository, so let's do just that:

fs = LakeFSFileSystem()  # will auto-discover credentials from ~/.lakectl.yaml\nrepo_path = f\"{REPO}/{BRANCH}/{local_path.name}\"\n\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.put(str(local_path), f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Add demo data\")\n

If you execute the quickstart.py script at this point, you can already see the committed file in the lakeFS web UI:

While examining the file contents in the browser is nice, we want to access the committed file programmatically. Add the following lines at the end of your script and observe the output:

f = fs.open(repo_path, \"rt\")\nprint(f.readline())  # prints \"Hello, lakeFS!\"\n

Note that executing the same code multiple times will only result in a single commit in the repository since the contents of the file on disk and in the repository are identical.

In addition to simple read and write operations, the fsspec file system interface also allows us to list the files in a repository folder using ls, and query the metadata of objects in the repository through info (akin to the POSIX stat system call). Let's add the following code to our script and observe the output:

# Compare the sizes of local file and repo\nfile_info = fs.info(repo_path)\nprint(\n    f\"{local_path.name}: local size: {local_path.stat().st_size}, remote size: {file_info['size']}\"\n)\n\n# Get information about all files in the repo root\nprint(fs.ls(f\"{REPO}/{BRANCH}/\"))\n

As the last order of business, let's restore the repository to its original state by removing the file with the rm operation and creating another commit (we also delete the local file, since we don't need it anymore):

with fs.transaction(REPO, BRANCH) as tx:\n    fs.rm(f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Delete demo data\")\n\nlocal_path.unlink()\n

Success

You now have all the basic tools available to version data from your Python code using the file system interface provided by lakeFS-spec.

Full example code quickstart.py
from pathlib import Path\n\nfrom lakefs_spec import LakeFSFileSystem\n\nREPO, BRANCH = \"repo\", \"main\"\n\n# Prepare example local data\nlocal_path = Path(\"demo.txt\")\nlocal_path.write_text(\"Hello, lakeFS!\")\n\n# Upload the local file to the repo and commit\nfs = LakeFSFileSystem()  # will auto-discover credentials from ~/.lakectl.yaml\nrepo_path = f\"{REPO}/{BRANCH}/{local_path.name}\"\n\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.put(str(local_path), f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Add demo data\")\n\n# Read back the file contents\nf = fs.open(repo_path, \"rt\")\nprint(f.readline())  # prints \"Hello, lakeFS!\"\n\n# Compare the sizes of local file and repo\nfile_info = fs.info(repo_path)\nprint(\n    f\"{local_path.name}: local size: {local_path.stat().st_size}, remote size: {file_info['size']}\"\n)\n\n# Get information about all files in the repo root\nprint(fs.ls(f\"{REPO}/{BRANCH}/\"))\n\n# Delete uploaded file from the repository (and commit)\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.rm(f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Delete demo data\")\n\nlocal_path.unlink()\n
"},{"location":"quickstart/#next-steps","title":"Next Steps","text":"

After this walkthrough of the installation and an introduction to basic file system operations using lakeFS-spec, you might want to consider more advanced topics:

  • API Reference
  • User Guide, in particular
    • How to use the lakeFS file system
    • How to use lakeFS-spec with third-party data science libraries
  • Tutorial: Using lakeFS-spec in a data science project
"},{"location":"guides/","title":"User Guide","text":"

The lakeFS-spec user guide provides documentation for users of the library looking to solve specific tasks. See the Quickstart guide for an introductory tutorial.

  • How to use the lakeFS file system
  • Passing configuration to the file system
  • Using transactions on the lakeFS file system
  • How to use lakeFS-spec with third-party data science libraries
"},{"location":"guides/configuration/","title":"Passing configuration to the file system","text":"

There are multiple ways to configure the LakeFSFileSystem for use with a deployed lakeFS instance. This guide introduces them in the order of least to most in-Python configuration - the preferred way to use the file system is with as little Python code as possible.

Info

The configuration methods are introduced in reverse order of precedence - config file arguments have the lowest priority and are overwritten by environment variables (if specified).

"},{"location":"guides/configuration/#the-lakectlyaml-configuration-file","title":"The .lakectl.yaml configuration file","text":"

The easiest way of configuring the lakeFS file system is with a lakectl YAML configuration file. To address a lakeFS server, the following minimum configuration is required:

~/.lakectl.yaml
credentials:\n  access_key_id: <ID>\n  secret_access_key: <KEY>\nserver:\n  endpoint_url: <LAKEFS-HOST>\n

For a local instance produced by the quickstart, the following values will work:

~/.lakectl.yaml
credentials:\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n

To work without any more arguments \"out of the box\", the configuration file has to be placed in your home directory with the name .lakectl.yaml (this is where lakeFS expects it). If you set all values correctly, you can instantiate the lakeFS file system without any arguments:

from lakefs_spec import LakeFSFileSystem\n\n# zero config necessary.\nfs = LakeFSFileSystem()\n

If you cannot use the default location ($HOME/.lakectl.yaml), you can read a file from any other location by passing the configfile argument:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem(configfile=\"/path/to/my/configfile.yaml\")\n
"},{"location":"guides/configuration/#setting-environment-variables","title":"Setting environment variables","text":"

It is also possible to specify certain configuration values used for authentication with the lakeFS server via environment variables. For these values, the variable name is the constructor argument name prefixed with LAKEFS_, e.g. the host argument can be set via the LAKEFS_HOST environment variable.

import os\nfrom lakefs_spec import LakeFSFileSystem\n\nos.environ[\"LAKEFS_HOST\"] = \"http://my-lakefs.host\"\nos.environ[\"LAKEFS_USERNAME\"] = \"my-username\"\nos.environ[\"LAKEFS_PASSWORD\"] = \"my-password\"\n\n# also zero-config.\nfs = LakeFSFileSystem()\n

Info

Not all initialization values can be set via environment variables - the proxy, create_branch_ok, and source_branch arguments can only be supplied in Python.
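
These Python-only options are passed directly to the constructor; in the following sketch, the proxy address is a placeholder, and connection credentials are still discovered from the configuration file or environment variables:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem(\n    proxy=\"http://proxy.internal:8080\",  # placeholder address\n    create_branch_ok=False,\n    source_branch=\"main\",\n)\n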

"},{"location":"guides/configuration/#appendix-mixing-zero-config-methods","title":"Appendix: Mixing zero-config methods","text":"

Two of the introduced methods allow for \"zero-config\" (i.e. no arguments given to the constructor) initialization of the file system. However, care must be taken when creating multiple file systems that are meant to use different configurations (for example, one configured through the config file and another through environment variables).

The reason for this is the instance caching mechanism built into fsspec. While this allows for efficient reuse of file systems e.g. by third-party libraries (pandas, DuckDB, ...), it can lead to silent misconfigurations. Consider this example, with an existent .lakectl.yaml file:

~/.lakectl.yaml
credentials:\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n

Now, mixing config file and environment variable initializations leads to the wrong result:

import os\nfrom lakefs_spec import LakeFSFileSystem\n\n# first file system, initialized from the config file\nconfig_fs = LakeFSFileSystem()\n\nos.environ[\"LAKEFS_HOST\"] = \"http://my-other-lakefs.host\"\nos.environ[\"LAKEFS_USERNAME\"] = \"my-username\"\nos.environ[\"LAKEFS_PASSWORD\"] = \"my-password\"\n\nenvvar_fs = LakeFSFileSystem()\n\nprint(config_fs is envvar_fs) # <- prints True! \n

The reason why the above code does not work as desired is that the cached config-file-initialized file system is simply reused on the second assignment. To clear the file system instance cache, you can run the following:

from lakefs_spec import LakeFSFileSystem\n\nLakeFSFileSystem.clear_instance_cache()\n
"},{"location":"guides/filesystem-usage/","title":"How to use the lakeFS file system","text":"

This guide contains instructions and code snippets on how to use the lakeFS file system.

"},{"location":"guides/filesystem-usage/#the-lakefs-uri-structure","title":"The lakeFS URI structure","text":"

In the following subsections, we frequently make use of lakeFS URIs in the example code. lakeFS URIs identify resources in a lakeFS deployment through a unique path consisting of repository name, lakeFS revision/ref name, and file name relative to the repository root. Optionally, they may be prefixed with the lakefs:// URI scheme (this is required when using third-party libraries).

As an example, a URI like repo/main/file.txt addresses the file.txt file on the main branch in the repository named repo.

In some lakeFS file system operations, directories are also allowed as resource names. For example, the URI repo/main/data/ (note the optional trailing slash) refers to the data directory on the main branch in the repo repository.

"},{"location":"guides/filesystem-usage/#on-staged-versus-committed-changes","title":"On staged versus committed changes","text":"

When uploading, copying, or removing files or directories on a branch, these operations result in staged changes in the repository until a commit is created. lakeFS-spec does not create these commits automatically, since it rigorously separates file operations from versioning operations. If you want to conduct versioning operations, like creating commits, between file transfers, the best way to do so is by using file system transactions.
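
A minimal sketch of this pattern (repository and branch names are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# this upload only stages the object on the branch...\nfs.put_file(\"data.csv\", \"my-repo/main/data.csv\")\n\n# ...whereas inside a transaction, a commit can be created right after the upload:\nwith fs.transaction(\"my-repo\", \"main\") as tx:\n    fs.put_file(\"data.csv\", f\"my-repo/{tx.branch.id}/data.csv\")\n    tx.commit(message=\"Add data.csv\")\n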

"},{"location":"guides/filesystem-usage/#how-to-use-lakefs-file-system-apis","title":"How to use lakeFS file system APIs","text":"

The following sections explain in more depth how to use the LakeFSFileSystem APIs, covering the operations that lakeFS-spec implements explicitly. In addition, a number of file system APIs are inherited from the AbstractFileSystem interface in fsspec.

More information on file system usage can be found in the fsspec documentation.

"},{"location":"guides/filesystem-usage/#uploading-and-downloading-files","title":"Uploading and downloading files","text":"

Arguably the most important feature of the file system is file transfer.

"},{"location":"guides/filesystem-usage/#file-uploads","title":"File uploads","text":"

To upload a file, you can use the fs.put() and fs.put_file() methods. While fs.put_file() operates on single files only, the fs.put() API can be used for directory uploads.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# local path first, then remote target path.\nfs.put_file(\"file.txt\", \"my-repo/my-ref/file.txt\")\n

If you want to upload an entire directory to lakeFS, you can use the fs.put() API together with the recursive=True switch:

# structure:\n#   dir/\n#   \u251c\u2500\u2500 a.txt\n#   \u251c\u2500\u2500 b.yaml\n#   \u251c\u2500\u2500 c.csv\n#   \u2514\u2500\u2500 ...\n\nfs.put(\"dir\", \"my-repo/my-ref/dir\", recursive=True)\n

Info

The above method of file uploading results in two transfers: Once from the client to the lakeFS server, and once from the lakeFS server to the object storage. This can impact performance if the uploaded files are very large. To avoid this performance issue, you can also decide to write the file directly to the underlying object storage:

fs = LakeFSFileSystem()\n\nfs.put_file(\"file.txt\", \"my-repo/my-ref/file.txt\", use_blockstore=True)\n

Direct lakeFS blockstore uploads require the installation of the corresponding fsspec file system implementation through pip. For an S3-based lakeFS deployment, install the s3fs package. For Google Cloud Storage (GCS), install the gcsfs package. For Azure blob storage, install the adlfs package.

"},{"location":"guides/filesystem-usage/#file-downloads","title":"File downloads","text":"

To download a file, you can use the fs.get() or fs.get_file() methods. While fs.get_file() downloads single files only, the fs.get() API can be used for recursive directory downloads.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# remote path, then local target path.\nfs.get_file(\"my-repo/my-ref/file.txt\", \"file.txt\")\n

In the case of a directory in lakeFS, use the fs.get() API together with the recursive=True switch:

# structure:\n#   dir/\n#   \u251c\u2500\u2500 a.txt\n#   \u251c\u2500\u2500 b.yaml\n#   \u251c\u2500\u2500 c.csv\n#   \u2514\u2500\u2500 ...\n\n# downloads the entire `dir` directory (and subdirectories) into the current directory.\nfs.get(\"my-repo/my-ref/dir\", \"dir\", recursive=True)\n
"},{"location":"guides/filesystem-usage/#checking-the-existence-of-lakefs-objects","title":"Checking the existence of lakeFS objects","text":"

To check the existence of a file in a given revision of a repository, you can use the fs.exists() API:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_file_exists = fs.exists(\"my-repo/my-ref/my-file.txt\")\n

This function returns True if the file exists on that revision, and False if it does not. Errors other than \"not found\" (e.g. permission errors) are raised, since object existence cannot be decided in those cases.

Warning

fs.exists() only works on file objects, and will return False if called on directories.
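
If you need an existence check for directories, a small workaround sketch (not a dedicated lakeFS-spec API) is to fall back to fs.info(), which also works on directories and raises a FileNotFoundError for missing paths:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\ndef dir_exists(path: str) -> bool:\n    try:\n        fs.info(path)\n        return True\n    except FileNotFoundError:\n        return False\n\nprint(dir_exists(\"my-repo/my-ref/my-dir/\"))\n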

"},{"location":"guides/filesystem-usage/#obtaining-info-on-stored-objects","title":"Obtaining info on stored objects","text":"

To query the metadata of a single object in a lakeFS repository, use the fs.info() API:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_file_info = fs.info(\"my-repo/my-ref/my-file.txt\")\n

The resulting my_file_info object is a dictionary containing useful information about the file, such as its full path, checksum, last modification timestamp, and size (in bytes).

You can also call fs.info() on directories:

dir_info = fs.info(\"my-repo/my-ref/dir/\")\n

In this case, the resulting dir_info object only contains the directory name and the cumulative size of the files it contains.
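
A short sketch printing a few of the returned fields (the field names match those documented for fs.info() in the API reference):

file_info = fs.info(\"my-repo/my-ref/my-file.txt\")\nprint(file_info[\"size\"], file_info[\"checksum\"], file_info[\"mtime\"])\n\ndir_info = fs.info(\"my-repo/my-ref/dir/\")\nprint(dir_info[\"name\"], dir_info[\"size\"])  # directories report their name and total size\n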

"},{"location":"guides/filesystem-usage/#listing-directories-in-lakefs","title":"Listing directories in lakeFS","text":"

To list the files in a directory in lakeFS, use the fs.ls() method:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_dir_listing = fs.ls(\"my-repo/my-ref/my-dir/\")\n

This returns a list of Python dictionaries containing information on the objects contained in the requested directory. The returned objects have the same fields set as those returned by a normal fs.info() call on a file object.
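
For example, to obtain only the object names instead of full metadata, or to bypass the directory listing cache (the detail parameter and refresh keyword are documented in the API reference):

names = fs.ls(\"my-repo/my-ref/my-dir/\", detail=False)\n\nfresh_listing = fs.ls(\"my-repo/my-ref/my-dir/\", refresh=True)\n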

"},{"location":"guides/filesystem-usage/#deleting-objects-from-a-lakefs-branch","title":"Deleting objects from a lakeFS branch","text":"

To delete objects from a lakeFS branch, use the fs.rm_file() or fs.rm() APIs. As before, while the former works only for single files, the latter can be used to remove entire directories with the recursive=True option.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nfs.rm_file(\"my-repo/my-branch/my-file.txt\")\n\n# removes the entire `my-dir` directory.\nfs.rm(\"my-repo/my-branch/my-dir/\", recursive=True)\n
"},{"location":"guides/filesystem-usage/#copying-files-in-a-repository","title":"Copying files in a repository","text":"

To copy files on a branch or from one branch to another, use the fs.cp_file() or fs.copy() methods:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# copies a single file on the same branch to a new location.\nfs.cp_file(\"my-repo/branch-a/file.txt\", \"my-repo/branch-a/file.txt.bak\")\n\n# copies a single file from branch A to branch B.\nfs.cp_file(\"my-repo/branch-a/file.txt\", \"my-repo/branch-b/file.txt\")\n\n# copies the entire `my-dir` directory from branch A to branch B (which must exist).\nfs.copy(\"my-repo/branch-a/my-dir/\", \"my-repo/branch-b/my-dir/\", recursive=True)\n

Info

Files and directories can only be copied between branches in the same repository, not between different repositories.

Trying to copy to a non-existent branch will not create the branch.

"},{"location":"guides/integrations/","title":"How to use lakeFS-spec with third-party data science libraries","text":"

lakeFS-spec is built on top of the fsspec library, which allows third-party libraries to make use of its file system abstraction to offer high-level features. The fsspec documentation lists examples of its users, mostly data science libraries.

This user guide page adds more detail on how lakeFS-spec can be used with four prominent data science libraries.

Code Examples

The code examples assume access to an existing lakeFS server with a quickstart repository containing the sample data already set up.

Please see the Quickstart guide or lakeFS quickstart guide if you need guidance in getting started.

The relevant lines for the lakeFS-spec integration in the following code snippets are highlighted.

"},{"location":"guides/integrations/#pandas","title":"Pandas","text":"

Pandas can read and write data from remote locations, and uses fsspec for all URLs that are not local or HTTP(S).

This means that (almost) all pd.read_* and pd.DataFrame.to_* operations can benefit from the lakeFS integration offered by our library without any additional configuration. See the Pandas documentation on reading/writing remote files for additional details.

The following code snippet illustrates how to read and write Pandas data frames in various formats from/to a lakeFS repository in the context of a transaction:

import pandas as pd\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = pd.read_parquet(f\"lakefs://quickstart/{tx.branch.id}/lakes.parquet\")\n    german_lakes = lakes.query('Country == \"Germany\"')\n    german_lakes.to_csv(f\"lakefs://quickstart/{tx.branch.id}/german_lakes.csv\")\n\n    tx.commit(message=\"Add German lakes\")\n
"},{"location":"guides/integrations/#duckdb","title":"DuckDB","text":"

The DuckDB in-memory database management system includes support for fsspec file systems as part of its Python API (see the official documentation on using fsspec filesystems for details). This allows DuckDB to transparently query and store data located in lakeFS repositories through lakeFS-spec.

Similar to the example above, the following code snippet illustrates how to read and write data from/to a lakeFS repository in the context of a transaction through the DuckDB Python API:

import duckdb\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nduckdb.register_filesystem(fs)\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = duckdb.read_parquet(\"lakefs://quickstart/main/lakes.parquet\")\n    italian_lakes = duckdb.sql(\"SELECT * FROM lakes where Country='Italy'\")\n    italian_lakes.to_csv(f\"lakefs://quickstart/{tx.branch.id}/italian_lakes.csv\")\n\n    tx.commit(message=\"Add Italian lakes\")\n
Info

duckdb.register_filesystem(fs) makes the lakeFS-spec file system known to DuckDB. Alternatively, duckdb.register_filesystem(fsspec.filesystem(\"lakefs\")) can be used to avoid the direct import of LakeFSFileSystem.
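
Spelled out as code, that alternative registration looks like this:

import duckdb\nimport fsspec\n\nduckdb.register_filesystem(fsspec.filesystem(\"lakefs\"))\n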
"},{"location":"guides/integrations/#polars","title":"Polars","text":"

Warning

There is an ongoing discussion in the Polars development team whether to remove support for fsspec file systems, with no clear outcome as of the time this page was written. Please refer to the discussion on the relevant GitHub issue in case you encounter any problems.

The Python API wrapper for the Rust-based Polars DataFrame library can access remote storage through fsspec, similar to Pandas (see the official documentation on cloud storage).

Again, the following code example demonstrates how to read a Parquet file and save a modified version back in CSV format to a lakeFS repository from Polars in the context of a transaction:

import polars as pl\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = pl.read_parquet(f\"lakefs://quickstart/{tx.branch.id}/lakes.parquet\")\n    us_lakes = lakes.filter(pl.col(\"Country\") == \"United States of America\")\n\n    with fs.open(f\"lakefs://quickstart/{tx.branch.id}/us_lakes.csv\", \"wb\") as f:\n        us_lakes.write_csv(f)\n\n    tx.commit(message=\"Add US lakes\")\n
Info

Polars does not support writing directly to remote storage through the pl.DataFrame.write_* API (see the Polars documentation), which is why the example above writes the CSV file through fs.open() instead.
"},{"location":"guides/integrations/#pyarrow","title":"PyArrow","text":"

Apache Arrow and its Python API, PyArrow, can also use fsspec file systems to perform I/O operations on data objects. The documentation has additional details on using fsspec-compatible file systems with Arrow.

PyArrow read_* and write_* functions take an explicit filesystem parameter, which accepts any fsspec file system, such as the LakeFSFileSystem provided by this library.

The following example code illustrates the use of lakeFS-spec with PyArrow, reading a Parquet file and writing it back to a lakeFS repository as a partitioned CSV dataset in the context of a transaction:

import pyarrow as pa\nimport pyarrow.dataset as ds\nimport pyarrow.parquet as pq\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes_table = pq.read_table(f\"quickstart/{tx.branch.id}/lakes.parquet\", filesystem=fs)\n\n    ds.write_dataset(\n        lakes_table,\n        f\"quickstart/{tx.branch.id}/lakes\",\n        filesystem=fs,\n        format=\"csv\",\n        partitioning=ds.partitioning(pa.schema([lakes_table.schema.field(\"Country\")])),\n    )\n\n    tx.commit(\"Add partitioned lakes data set\")\n
"},{"location":"guides/transactions/","title":"Using transactions on the lakeFS file system","text":"

In addition to file operations, you can carry out versioning operations in your Python code using file system transactions.

Transactions in lakeFS-spec behave similarly to the transactions in the high-level lakeFS SDK: Both approaches create an ephemeral branch for a transaction, perform the operations in the context block on that ephemeral branch, and optionally merge it back into the source branch upon exiting the context manager.

They are an \"all or nothing\" proposition: If an error occurs during the transaction, the base branch is left unchanged.

The lakeFS-spec transaction inherits from fsspec transactions. For more information on fsspec transactions, see the official documentation.

"},{"location":"guides/transactions/#versioning-operations","title":"Versioning operations","text":"

The lakeFS file system's transaction is the intended place for conducting versioning operations between file transfers. The following is an example of file uploads with commit creations, with a tag being applied at the end.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\") as tx:\n    fs.put_file(\"train-data.txt\", f\"repo/{tx.branch.id}/train-data.txt\")\n    tx.commit(message=\"Add training data\")\n    fs.put_file(\"test-data.txt\", f\"repo/{tx.branch.id}/test-data.txt\")\n    sha = tx.commit(message=\"Add test data\")\n    tx.tag(sha, name=\"My train-test split\")\n

The full list of supported lakeFS versioning operations (by default, these operations target the transaction branch), with a short sketch of the remaining operations following the list:

  • commit, for creating a commit, optionally with attached metadata.
  • merge, for merging a given branch.
  • revert, for reverting a previous commit.
  • rev_parse, for parsing revisions like branch/tag names and SHA fragments into full commit SHAs.
  • tag, for creating a tag pointing to a commit.
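
A short sketch of two operations not shown above; the exact argument names used here are assumptions, so consult the API reference for the precise signatures:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\") as tx:\n    fs.put_file(\"new-data.txt\", f\"repo/{tx.branch.id}/new-data.txt\")\n    sha = tx.commit(message=\"Add new data\")\n\n    # resolve a ref (branch/tag name or SHA fragment) into a full commit SHA\n    full_sha = tx.rev_parse(sha)\n\n    # undo that commit again on the transaction branch\n    tx.revert(tx.branch, full_sha)\n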
"},{"location":"guides/transactions/#lifecycle-of-ephemeral-transaction-branches","title":"Lifecycle of ephemeral transaction branches","text":"

You can control the lifecycle for a transaction branch with the delete argument:

  • By default (delete=\"onsuccess), the branch is deleted after successful completion, and left over in case of failure for debugging purposes.
  • If delete=\"always\", the branch is unconditionally deleted after the transaction regardless of its status.
  • Similarly, if delete=\"never\", the branch is unconditionally left in place after the transaction.

Additionally, the automerge keyword controls whether the transaction branch is merged after successful completion of the transaction. It has no effect if an error occurs over the course of the transaction.
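
A sketch combining both options, assuming they are passed to fs.transaction() alongside the repository and branch (as the delete argument is in the error handling example below):

with fs.transaction(\"repo\", \"main\", automerge=False, delete=\"never\") as tx:\n    fs.put_file(\"my-file.txt\", f\"repo/{tx.branch.id}/my-file.txt\")\n    tx.commit(message=\"Add my-file.txt\")\n\n# the transaction branch is kept and not merged, e.g. for manual review\n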

"},{"location":"guides/transactions/#error-handling","title":"Error handling","text":"

Since all files are uploaded to a short-lived transaction branch, no commit on the target branch happens in case of an exception:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\", delete=\"onsuccess\") as tx:\n    fs.put_file(\"my-file.txt\", f\"repo/{tx.branch.id}/my-file.txt\")\n    tx.commit(message=\"Add my-file.txt\")\n    raise ValueError(\"oops!\")\n

The above code will not modify the main branch, since the ValueError prevents the merge of the transaction branch. Note that you can examine the contents of the transaction branch due to delete=\"onsuccess\" (the default behavior), which prevents deletion of the branch in case of failure for debugging purposes.

"},{"location":"reference/SUMMARY/","title":"SUMMARY","text":"
  • lakefs_spec
    • errors
    • spec
    • transaction
    • util
"},{"location":"reference/lakefs_spec/","title":"lakefs_spec","text":"

lakefs-spec is an fsspec file system integration for the lakeFS data lake.

"},{"location":"reference/lakefs_spec/errors/","title":"errors","text":"

Error translation facilities to map lakeFS API errors to Python-native OS errors in the lakeFS file system.

This is important to honor the fsspec API contract, where users only need to expect builtin Python exceptions to avoid complicated error handling setups.

"},{"location":"reference/lakefs_spec/errors/#lakefs_spec.errors.translate_lakefs_error","title":"translate_lakefs_error","text":"
translate_lakefs_error(\n    error: ServerException,\n    rpath: str | None = None,\n    message: str | None = None,\n    set_cause: bool = True,\n) -> OSError\n

Convert a lakeFS server exception to a Python builtin exception.

For some subclasses of lakefs.exceptions.ServerException, a direct Python builtin equivalent exists. In these cases, the suitable equivalent is returned. All other classes are converted to a standard IOError.
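
A short usage sketch of the translation pattern (the file system's wrapped_api_call helper, shown later in this reference, uses the same approach):

from lakefs.exceptions import ServerException\n\nfrom lakefs_spec.errors import translate_lakefs_error\n\ntry:\n    ...  # some lakeFS SDK call\nexcept ServerException as e:\n    raise translate_lakefs_error(e, rpath=\"my-repo/main/file.txt\")\n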

PARAMETER DESCRIPTION error

The exception returned by the lakeFS SDK wrapper.

TYPE: ServerException

rpath

The remote resource path involved in the error.

TYPE: str | None DEFAULT: None

message

An error message to use for the returned exception. If not given, the error message returned by the lakeFS server is used instead.

TYPE: str | None DEFAULT: None

set_cause

Whether to set the __cause__ attribute to the previous exception if the exception is translated.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION OSError

A builtin Python exception ready to be thrown.

Source code in src/lakefs_spec/errors.py
def translate_lakefs_error(\n    error: ServerException,\n    rpath: str | None = None,\n    message: str | None = None,\n    set_cause: bool = True,\n) -> OSError:\n    \"\"\"\n    Convert a lakeFS server exception to a Python builtin exception.\n\n    For some subclasses of ``lakefs.exceptions.ServerException``, a direct Python builtin equivalent exists.\n    In these cases, the suitable equivalent is returned. All other classes are converted to a standard ``IOError``.\n\n    Parameters\n    ----------\n    error: ServerException\n        The exception returned by the lakeFS SDK wrapper.\n    rpath: str | None\n        The remote resource path involved in the error.\n    message: str | None\n        An error message to use for the returned exception.\n         If not given, the error message returned by the lakeFS server is used instead.\n    set_cause: bool\n        Whether to set the ``__cause__`` attribute to the previous exception if the exception is translated.\n\n    Returns\n    -------\n    OSError\n        A builtin Python exception ready to be thrown.\n    \"\"\"\n    status = error.status_code\n\n    if hasattr(error, \"body\"):\n        # error has a JSON response body attached\n        reason = error.body[\"message\"]\n    else:\n        reason = error.reason\n\n    emsg = f\"{status} {reason}\"\n    if rpath:\n        emsg += f\": {rpath!r}\"\n\n    constructor = HTTP_CODE_TO_ERROR.get(status, partial(IOError, errno.EIO))\n    custom_exc = constructor(message or emsg)\n\n    if set_cause:\n        custom_exc.__cause__ = error\n    return custom_exc\n
"},{"location":"reference/lakefs_spec/spec/","title":"spec","text":"

Core interface definitions for file system interaction with lakeFS from Python, namely the LakeFSFileSystem and LakeFSFile classes.

"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem","title":"LakeFSFileSystem","text":"

Bases: AbstractFileSystem

lakeFS file system implementation.

Instances of this class are cached based on their constructor arguments.

For more information, see the fsspec documentation https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching.

PARAMETER DESCRIPTION host

The address of your lakeFS instance.

TYPE: str | None DEFAULT: None

username

The access key name to use in case of access key authentication.

TYPE: str | None DEFAULT: None

password

The access key secret to use in case of access key authentication.

TYPE: str | None DEFAULT: None

api_key

The API key to use in case of authentication with an API key.

TYPE: str | None DEFAULT: None

api_key_prefix

A string prefix to use for the API key in authentication.

TYPE: str | None DEFAULT: None

access_token

An access token to use in case of access token authentication.

TYPE: str | None DEFAULT: None

verify_ssl

Whether to verify SSL certificates in API interactions. Do not disable in production.

TYPE: bool DEFAULT: True

ssl_ca_cert

A custom certificate PEM file to use to verify the peer in SSL connections.

TYPE: str | None DEFAULT: None

proxy

Proxy address to use when connecting to a lakeFS server.

TYPE: str | None DEFAULT: None

create_branch_ok

Whether to create branches implicitly when not-existing branches are referenced on file uploads.

TYPE: bool DEFAULT: True

source_branch

Source branch set as origin when a new branch is implicitly created.

TYPE: str DEFAULT: 'main'

**storage_options

Configuration options to pass to the file system's directory cache.

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
class LakeFSFileSystem(AbstractFileSystem):\n    \"\"\"\n    lakeFS file system implementation.\n\n    Instances of this class are cached based on their constructor arguments.\n\n    For more information, see the fsspec documentation <https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching>.\n\n    Parameters\n    ----------\n    host: str | None\n        The address of your lakeFS instance.\n    username: str | None\n        The access key name to use in case of access key authentication.\n    password: str | None\n        The access key secret to use in case of access key authentication.\n    api_key: str | None\n        The API key to use in case of authentication with an API key.\n    api_key_prefix: str | None\n        A string prefix to use for the API key in authentication.\n    access_token: str | None\n        An access token to use in case of access token authentication.\n    verify_ssl: bool\n        Whether to verify SSL certificates in API interactions. Do not disable in production.\n    ssl_ca_cert: str | None\n        A custom certificate PEM file to use to verify the peer in SSL connections.\n    proxy: str | None\n        Proxy address to use when connecting to a lakeFS server.\n    create_branch_ok: bool\n        Whether to create branches implicitly when not-existing branches are referenced on file uploads.\n    source_branch: str\n        Source branch set as origin when a new branch is implicitly created.\n    **storage_options: Any\n        Configuration options to pass to the file system's directory cache.\n    \"\"\"\n\n    protocol = \"lakefs\"\n\n    def __init__(\n        self,\n        host: str | None = None,\n        username: str | None = None,\n        password: str | None = None,\n        api_key: str | None = None,\n        api_key_prefix: str | None = None,\n        access_token: str | None = None,\n        verify_ssl: bool = True,\n        ssl_ca_cert: str | None = None,\n        proxy: str | None = None,\n        create_branch_ok: bool = True,\n        source_branch: str = \"main\",\n        **storage_options: Any,\n    ):\n        super().__init__(**storage_options)\n\n        # lakeFS client arguments\n        cargs = [host, username, password, api_key, api_key_prefix, access_token, ssl_ca_cert]\n\n        if all(arg is None for arg in cargs):\n            # empty kwargs means envvar and configfile autodiscovery\n            self.client = Client()\n        else:\n            self.client = Client(\n                host=host,\n                username=username,\n                password=password,\n                api_key=api_key,\n                api_key_prefix=api_key_prefix,\n                access_token=access_token,\n                ssl_ca_cert=ssl_ca_cert,\n            )\n\n        # proxy address, not part of the constructor\n        self.client.config.proxy = proxy\n        # whether to verify SSL certs, not part of the constructor\n        self.client.config.verify_ssl = verify_ssl\n\n        self.create_branch_ok = create_branch_ok\n        self.source_branch = source_branch\n\n    @cached_property\n    def _lakefs_server_version(self):\n        with self.wrapped_api_call():\n            return tuple(int(t) for t in self.client.version.split(\".\"))\n\n    @classmethod\n    @overload\n    def _strip_protocol(cls, path: str | os.PathLike[str] | Path) -> str:\n        ...\n\n    @classmethod\n    @overload\n    def _strip_protocol(cls, path: list[str | os.PathLike[str] | Path]) -> list[str]:\n        ...\n\n    
@classmethod\n    def _strip_protocol(cls, path):\n        \"\"\"Copied verbatim from the base class, save for the slash rstrip.\"\"\"\n        if isinstance(path, list):\n            return [cls._strip_protocol(p) for p in path]\n        spath = super()._strip_protocol(path)\n        if stringify_path(path).endswith(\"/\"):\n            return spath + \"/\"\n        return spath\n\n    @property\n    def transaction(self) -> LakeFSTransaction:\n        \"\"\"\n        A context manager within which file uploads and versioning operations are deferred to a\n        queue, and carried out during when exiting the context.\n\n        Requires the file class to implement ``.commit()`` and ``.discard()`` for the normal and exception cases.\n        \"\"\"\n        self._transaction: LakeFSTransaction | None\n        if self._transaction is None:\n            self._transaction = LakeFSTransaction(self)\n        return self._transaction\n\n    def start_transaction(self):\n        raise NotImplementedError(\n            \"lakeFS transactions should only be used as a context manager via\"\n            \" `with LakeFSFileSystem.transaction as tx:`\"\n        )\n\n    @contextmanager\n    def wrapped_api_call(\n        self, rpath: str | None = None, message: str | None = None, set_cause: bool = True\n    ) -> Generator[None, None, None]:\n        \"\"\"\n        A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.\n\n        Meant for internal use.\n\n        Parameters\n        ----------\n        rpath: str | None\n            The remote path involved in the requested API call.\n        message: str | None\n            A custom error message to emit instead of parsing the API error response.\n        set_cause: bool\n            Whether to include the original lakeFS API error in the resulting traceback.\n\n        Yields\n        ------\n        None\n            An empty generator, to be used as a context manager.\n\n        Raises\n        ------\n        OSError\n            Translated error from the lakeFS API call, if any.\n        \"\"\"\n        try:\n            yield\n        except ServerException as e:\n            raise translate_lakefs_error(e, rpath=rpath, message=message, set_cause=set_cause)\n\n    def checksum(self, path: str | os.PathLike[str]) -> str | None:\n        \"\"\"\n        Get a remote lakeFS file object's checksum.\n\n        This is usually its MD5 hash, unless another hash function was used on upload.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path to look up the lakeFS checksum for. Must point to a single file object.\n\n        Returns\n        -------\n        str | None\n            The remote file's checksum, or ``None`` if ``path`` points to a directory or does not exist.\n        \"\"\"\n        path = stringify_path(path)\n        try:\n            return self.info(path).get(\"checksum\")\n        except FileNotFoundError:\n            return None\n\n    def exists(self, path: str | os.PathLike[str], **kwargs: Any) -> bool:\n        \"\"\"\n        Check existence of a remote path in a lakeFS repository.\n\n        Input paths can either be files or directories.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path whose existence to check. 
Must be a fully qualified lakeFS URI.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Returns\n        -------\n        bool\n            ``True`` if the requested path exists, ``False`` if it does not.\n\n        Raises\n        ------\n        PermissionError\n            If the user does not have sufficient permissions to query object existence.\n        \"\"\"\n        path = stringify_path(path)\n        repository, ref, resource = parse(path)\n        try:\n            reference = lakefs.Reference(repository, ref, client=self.client)\n            return reference.object(resource).exists()\n        except ServerException as e:\n            # in case of an error other than \"not found\", existence cannot be\n            # decided, so raise the translated error.\n            raise translate_lakefs_error(e)\n\n    def cp_file(\n        self, path1: str | os.PathLike[str], path2: str | os.PathLike[str], **kwargs: Any\n    ) -> None:\n        \"\"\"\n        Copy a single file from one remote location to another in lakeFS.\n\n        Parameters\n        ----------\n        path1: str | os.PathLike[str]\n            The remote file location to be copied.\n        path2: str | os.PathLike[str]\n            The (remote) target location to which to copy the file.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Raises\n        ------\n        ValueError\n            When attempting to copy objects between repositories.\n        \"\"\"\n        path1 = stringify_path(path1)\n        path2 = stringify_path(path2)\n        if path1 == path2:\n            return\n\n        orig_repo, orig_ref, orig_path = parse(path1)\n        dest_repo, dest_ref, dest_path = parse(path2)\n\n        if orig_repo != dest_repo:\n            raise ValueError(\n                \"can only copy objects within a repository, but got source \"\n                f\"repository {orig_repo!r} and destination repository {dest_repo!r}\"\n            )\n\n        with self.wrapped_api_call():\n            reference = lakefs.Reference(orig_repo, orig_ref, client=self.client)\n            reference.object(orig_path).copy(dest_ref, dest_path)\n\n    def get_file(\n        self,\n        rpath: str | os.PathLike[str],\n        lpath: str | os.PathLike[str],\n        callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n        outfile: Any = None,\n        precheck: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"\n        Download a single file from a remote lakeFS server to local storage.\n\n        Parameters\n        ----------\n        rpath: str | os.PathLike[str]\n            The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.\n        lpath: str | os.PathLike[str]\n            The local path on disk to save the downloaded file to.\n        callback: fsspec.callbacks.Callback\n            An fsspec callback to use during the operation. Can be used to report download progress.\n        outfile: Any\n            A file-like object to save the downloaded content to. 
Can be used in place of ``lpath``.\n        precheck: bool\n            Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n        **kwargs: Any\n            Additional keyword arguments passed to ``AbstractFileSystem.open()``.\n        \"\"\"\n        rpath = stringify_path(rpath)\n        lpath = stringify_path(lpath)\n\n        if precheck and Path(lpath).is_file():\n            local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n            remote_checksum = self.info(rpath).get(\"checksum\")\n            if local_checksum == remote_checksum:\n                logger.info(\n                    f\"Skipping download of resource {rpath!r} to local path {lpath!r}: \"\n                    f\"Resource {lpath!r} exists and checksums match.\"\n                )\n                return\n\n        with self.wrapped_api_call(rpath=rpath):\n            super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs)\n\n    def info(self, path: str | os.PathLike[str], **kwargs: Any) -> dict[str, Any]:\n        \"\"\"\n        Query a remote lakeFS object's metadata.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.ls()`` if ``path`` points to a directory.\n\n        Returns\n        -------\n        dict[str, Any]\n            A dictionary containing metadata on the object, including its full remote path and object type (file or directory).\n\n        Raises\n        ------\n        FileNotFoundError\n            If the ``path`` refers to a non-file path that does not exist in the repository.\n        \"\"\"\n        path = stringify_path(path)\n        repository, ref, resource = parse(path)\n        # first, try with `stat_object` in case of a file.\n        # the condition below checks edge cases of resources that cannot be files.\n        if resource and not resource.endswith(\"/\"):\n            try:\n                reference = lakefs.Reference(repository, ref, client=self.client)\n                res = reference.object(resource).stat()\n                return {\n                    \"checksum\": res.checksum,\n                    \"content-type\": res.content_type,\n                    \"mtime\": res.mtime,\n                    \"name\": f\"{repository}/{ref}/{res.path}\",\n                    \"size\": res.size_bytes,\n                    \"type\": \"file\",\n                }\n            except NotFoundException:\n                # fall through, retry with `ls` if it's a directory.\n                pass\n            except ServerException as e:\n                raise translate_lakefs_error(e, rpath=path)\n\n        out = self.ls(path, detail=True, recursive=True, **kwargs)\n        if not out:\n            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)\n\n        return {\n            \"name\": path.rstrip(\"/\"),\n            \"size\": sum(o.get(\"size\") or 0 for o in out),\n            \"type\": \"directory\",\n        }\n\n    def _update_dircache(self, info: list) -> None:\n        \"\"\"Update logic for dircache (optionally recursive) based on lakeFS API response\"\"\"\n        parents = {self._parent(i[\"name\"].rstrip(\"/\")) for i in info}\n        for pp in parents:\n            # subset of info entries 
which are direct descendants of `parent`\n            dir_info = [i for i in info if self._parent(i[\"name\"].rstrip(\"/\")) == pp]\n            if pp not in self.dircache:\n                self.dircache[pp] = dir_info\n                continue\n\n            # Merge existing dircache entry with updated listing, which contains either:\n            # - files not present in the cache yet\n            # - a fresh listing (if `refresh=True`)\n\n            cache_entry = self.dircache[pp][:]\n\n            old_names = {e[\"name\"] for e in cache_entry}\n            new_names = {e[\"name\"] for e in dir_info}\n\n            to_remove = old_names - new_names\n            to_update = old_names.intersection(new_names)\n\n            # Remove all entries no longer present in the current listing\n            cache_entry = [e for e in cache_entry if e[\"name\"] not in to_remove]\n\n            # Overwrite existing entries in the cache with its updated values\n            for name in to_update:\n                old_idx = next(idx for idx, e in enumerate(cache_entry) if e[\"name\"] == name)\n                new_entry = next(e for e in info if e[\"name\"] == name)\n\n                cache_entry[old_idx] = new_entry\n                dir_info.remove(new_entry)\n\n            # Add the remaining (new) entries to the cache\n            cache_entry.extend(dir_info)\n            self.dircache[pp] = sorted(cache_entry, key=operator.itemgetter(\"name\"))\n\n    def _ls_from_cache(self, path: str, recursive: bool = False) -> list[dict[str, Any]] | None:\n        \"\"\"Override of ``AbstractFileSystem._ls_from_cache`` with support for recursive listings.\"\"\"\n        if not recursive:\n            return super()._ls_from_cache(path)\n\n        result = None\n        for key, files in self.dircache.items():\n            if not (key.startswith(path) or path == key + \"/\"):\n                continue\n            if result is None:\n                result = []\n            result.extend(files)\n        if not result:\n            return result\n        return sorted(result, key=operator.itemgetter(\"name\"))\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: Literal[True] = ...,\n        **kwargs: Any,\n    ) -> list[dict[str, Any]]:\n        ...\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: Literal[False],\n        **kwargs: Any,\n    ) -> list[str]:\n        ...\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: bool = True,\n        **kwargs: Any,\n    ) -> list[str] | list[dict[str, Any]]:\n        ...\n\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: bool = True,\n        **kwargs: Any,\n    ) -> list[str] | list[dict[str, Any]]:\n        \"\"\"\n        List all available objects under a given path in lakeFS.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The path under which to list objects. 
Must be a fully qualified lakeFS URI.\n            Can also point to a file, in which case the file's metadata will be returned.\n        detail: bool\n            Whether to obtain all metadata on the requested objects or just their names.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility.\n\n            In particular:\n                `refresh: bool`: whether to skip the directory listing cache,\n                `recursive: bool`: whether to list subdirectory contents recursively\n\n        Returns\n        -------\n        list[str] | list[dict[str, Any]]\n            A list of all objects' metadata under the given remote path if ``detail=True``, or alternatively only their names if ``detail=False``.\n        \"\"\"\n        path = self._strip_protocol(path)\n        repository, ref, prefix = parse(path)\n\n        recursive = kwargs.pop(\"recursive\", False)\n\n        # Try lookup in dircache unless explicitly disabled by `refresh=True` kwarg\n        use_dircache = not kwargs.pop(\"refresh\", False)\n\n        if use_dircache:\n            cache_entry: list[Any] | None = None\n            try:\n                cache_entry = self._ls_from_cache(path, recursive=recursive)\n            except FileNotFoundError:\n                # we patch files missing from an ls call in the cache entry below,\n                # so this should not be an error.\n                pass\n\n            if cache_entry is not None:\n                if not detail:\n                    return [e[\"name\"] for e in cache_entry]\n                return cache_entry[:]\n\n        kwargs[\"prefix\"] = prefix\n\n        info = []\n        # stat infos are either the path only (`detail=False`) or a dict full of metadata\n        delimiter = \"\" if recursive else \"/\"\n        reference = lakefs.Reference(repository, ref, client=self.client)\n\n        with self.wrapped_api_call(rpath=path):\n            for obj in reference.objects(prefix=prefix, delimiter=delimiter):\n                if isinstance(obj, CommonPrefix):\n                    # prefixes are added below.\n                    info.append(\n                        {\n                            \"name\": f\"{repository}/{ref}/{obj.path}\",\n                            \"size\": 0,\n                            \"type\": \"directory\",\n                        }\n                    )\n                elif isinstance(obj, ObjectInfo):\n                    info.append(\n                        {\n                            \"checksum\": obj.checksum,\n                            \"content-type\": obj.content_type,\n                            \"mtime\": obj.mtime,\n                            \"name\": f\"{repository}/{ref}/{obj.path}\",\n                            \"size\": obj.size_bytes,\n                            \"type\": \"object\",\n                        }\n                    )\n\n        # Retry the API call with appended slash if the current result\n        # is just a single directory entry only (not its contents).\n        # This is useful to allow `ls(\"repo/branch/dir\")` calls without a trailing slash.\n        if len(info) == 1 and info[0][\"type\"] == \"directory\" and info[0][\"name\"] == path + \"/\":\n            return self.ls(\n                path + \"/\",\n                detail=detail,\n                **kwargs | {\"refresh\": not use_dircache, \"recursive\": recursive},\n            )\n\n        if recursive:\n            # To make recursive ls behave identical to the non-recursive case,\n  
          # add back virtual `directory` entries, which are only returned by\n            # the lakeFS API when querying non-recursively.\n            here = self._strip_protocol(path).rstrip(\"/\")\n            subdirs = {parent for o in info if (parent := self._parent(o[\"name\"])) != here}\n            for subdir in subdirs:\n                info.append(\n                    {\n                        \"name\": subdir + \"/\",\n                        \"size\": 0,\n                        \"type\": \"directory\",\n                    }\n                )\n\n        if info:\n            self._update_dircache(info[:])\n\n        if not detail:\n            info = [o[\"name\"] for o in info]  # type: ignore\n\n        return info\n\n    def open(\n        self,\n        path: str | os.PathLike[str],\n        mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n        pre_sign: bool = False,\n        content_type: str | None = None,\n        metadata: dict[str, str] | None = None,\n        autocommit: bool = True,\n        **kwargs: Any,\n    ) -> LakeFSIOBase:\n        \"\"\"\n        Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on ``mode``.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path for which to open a local ``LakeFSFile``. Must be a fully qualified lakeFS URI.\n        mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"]\n            The file mode indicating its purpose. Use ``r/rb`` for downloads from lakeFS, ``w/wb/x/xb`` for uploads to lakeFS.\n        pre_sign: bool\n            Whether to use a pre-signed URL for the file up-/download.\n        content_type: str | None\n            Content type to use for the file, relevant for uploads only.\n        metadata: dict[str, str] | None\n            Additional metadata to attach to the file, relevant for uploads only.\n        autocommit: bool\n            Whether to process the file immediately instead of queueing it for transaction while in a transaction context.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Returns\n        -------\n        LakeFSIOBase\n            A local file-like object ready to hold data to be received from / sent to a lakeFS server.\n\n        Raises\n        ------\n        NotImplementedError\n            If ``mode`` is not supported.\n        \"\"\"\n        if mode.endswith(\"t\"):\n            # text modes {r,w,x}t are equivalent to {r,w,x} here respectively.\n            mode = mode[:-1]  # type: ignore\n\n        if mode not in {\"r\", \"rb\", \"w\", \"wb\", \"x\", \"xb\"}:\n            raise NotImplementedError(f\"unsupported mode {mode!r}\")\n\n        path = stringify_path(path)\n        repo, ref, resource = parse(path)\n\n        if mode.startswith(\"r\"):\n            reference = lakefs.Reference(repo, ref, client=self.client)\n            obj = reference.object(resource)\n\n            if not obj.exists():\n                raise FileNotFoundError(path)\n            handler = ObjectReader(obj, mode=mode, pre_sign=pre_sign, client=self.client)\n        else:\n            # for writing ops, ref must be a branch\n            branch = lakefs.Branch(repo, ref, client=self.client)\n            if self.create_branch_ok:\n                branch.create(self.source_branch, exist_ok=True)\n\n            obj = branch.object(resource)\n         
   handler = ObjectWriter(\n                obj,\n                mode=mode,\n                pre_sign=pre_sign,\n                content_type=content_type,\n                metadata=metadata,\n                client=self.client,\n            )\n\n        ac = kwargs.pop(\"autocommit\", not self._intrans)\n        if not ac and \"r\" not in mode:\n            self._transaction.files.append(handler)\n\n        return handler\n\n    def put_file(\n        self,\n        lpath: str | os.PathLike[str],\n        rpath: str | os.PathLike[str],\n        callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n        precheck: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"\n        Upload a local file to a remote location on a lakeFS server.\n\n        Note that depending on the block store type, additional configuration like credentials may need to be configured when ``use_blockstore=True`` and ``presign=False``.\n\n        Parameters\n        ----------\n        lpath: str | os.PathLike[str]\n            The local path on disk to upload to the lakeFS server.\n        rpath: str | os.PathLike[str]\n            The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.\n        callback: fsspec.callbacks.Callback\n            An fsspec callback to use during the operation. Can be used to report download progress.\n        precheck: bool\n            Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n        \"\"\"\n        lpath = stringify_path(lpath)\n        rpath = stringify_path(rpath)\n\n        if precheck and Path(lpath).is_file():\n            remote_checksum = self.checksum(rpath)\n            local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n            if local_checksum == remote_checksum:\n                logger.info(\n                    f\"Skipping upload of resource {lpath!r} to remote path {rpath!r}: \"\n                    f\"Resource {rpath!r} exists and checksums match.\"\n                )\n                return\n\n        with self.wrapped_api_call(rpath=rpath):\n            super().put_file(lpath, rpath, callback=callback, **kwargs)\n\n    def rm_file(self, path: str | os.PathLike[str]) -> None:  # pragma: no cover\n        \"\"\"\n        Stage a remote file for removal on a lakeFS server.\n\n        The file will not actually be removed from the requested branch until a commit is created.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote file to delete. 
Must be a fully qualified lakeFS URI.\n        \"\"\"\n        self.rm(path)\n\n    def rm(\n        self, path: str | os.PathLike[str], recursive: bool = False, maxdepth: int | None = None\n    ) -> None:\n        \"\"\"\n        Stage multiple remote files for removal on a lakeFS server.\n\n        The files will not actually be removed from the requested branch until a commit is created.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            File(s) to delete.\n        recursive: bool\n            If file(s) include nested directories, recursively delete their contents.\n        maxdepth: int | None\n            Depth to pass to walk for finding files to delete, if recursive.\n            If None, there will be no limit and infinite recursion may be\n            possible.\n        \"\"\"\n\n        path = stringify_path(path)\n        repository, ref, prefix = parse(path)\n\n        with self.wrapped_api_call(rpath=path):\n            branch = lakefs.Branch(repository, ref, client=self.client)\n            objgen = branch.objects(prefix=prefix, delimiter=\"\" if recursive else \"/\")\n            if maxdepth is None:\n                branch.delete_objects(obj.path for obj in objgen)\n            else:\n                # nesting level is just the amount of \"/\"s in the path, no leading \"/\".\n                branch.delete_objects(obj.path for obj in objgen if obj.path.count(\"/\") <= maxdepth)\n\n            # Directory listing cache for the containing folder must be invalidated\n            self.dircache.pop(self._parent(path), None)\n\n    def touch(self, path: str | os.PathLike[str], truncate: bool = True, **kwargs: Any) -> None:\n        \"\"\"\n        Create an empty file or update an existing file on a lakeFS server.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The file path to create or update. Must be a fully qualified lakeFS URI.\n        truncate: bool\n            Whether to set the file size to 0 (zero) bytes, even if the path already exists.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n\n        Raises\n        ------\n        NotImplementedError\n            If the targeted lakeFS server version does not support `touch()` operations.\n        \"\"\"\n\n        # empty buffer upload errors were fixed in https://github.com/treeverse/lakeFS/issues/7130,\n        # which was first released in lakeFS v1.3.1.\n        if self._lakefs_server_version < (1, 3, 1):\n            version_string = \".\".join(str(v) for v in self._lakefs_server_version)\n            raise NotImplementedError(\n                \"LakeFSFileSystem.touch() is not supported for your lakeFS server version. \"\n                f\"minimum required version: '1.3.1', actual version: {version_string!r}\"\n            )\n\n        super().touch(path=path, truncate=truncate, **kwargs)\n\n    def tail(self, path: str | os.PathLike[str], size: int = 1024) -> bytes:\n        \"\"\"\n        Get the last ``size`` bytes from a remote file.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The file path to read. 
Must be a fully qualified lakeFS URI.\n        size: int\n            The amount of bytes to get.\n\n        Returns\n        -------\n        bytes\n            The bytes at the end of the requested file.\n        \"\"\"\n        f: ObjectReader\n        with self.open(path, \"rb\") as f:\n            f.seek(max(-size, -f._obj.stat().size_bytes), 2)\n            return f.read()\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.transaction","title":"transaction property","text":"
transaction: LakeFSTransaction\n

A context manager within which file uploads and versioning operations are deferred to a queue, and carried out when exiting the context.

Requires the file class to implement .commit() and .discard() for the normal and exception cases.
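A minimal usage sketch of the transaction context manager; the repository, branch, and file names below are placeholders:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Uploads and versioning operations inside the block run on an ephemeral transaction branch.\nwith fs.transaction(\"my-repo\", \"main\") as tx:\n    fs.put(\"data.csv\", f\"my-repo/{tx.branch.id}/data.csv\")\n    tx.commit(message=\"Add data.csv\")\n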

"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.wrapped_api_call","title":"wrapped_api_call","text":"
wrapped_api_call(\n    rpath: str | None = None, message: str | None = None, set_cause: bool = True\n) -> Generator[None, None, None]\n

A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.

Meant for internal use.

PARAMETER DESCRIPTION rpath

The remote path involved in the requested API call.

TYPE: str | None DEFAULT: None

message

A custom error message to emit instead of parsing the API error response.

TYPE: str | None DEFAULT: None

set_cause

Whether to include the original lakeFS API error in the resulting traceback.

TYPE: bool DEFAULT: True

YIELDS DESCRIPTION None

An empty generator, to be used as a context manager.

RAISES DESCRIPTION OSError

Translated error from the lakeFS API call, if any.

Source code in src/lakefs_spec/spec.py
@contextmanager\ndef wrapped_api_call(\n    self, rpath: str | None = None, message: str | None = None, set_cause: bool = True\n) -> Generator[None, None, None]:\n    \"\"\"\n    A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.\n\n    Meant for internal use.\n\n    Parameters\n    ----------\n    rpath: str | None\n        The remote path involved in the requested API call.\n    message: str | None\n        A custom error message to emit instead of parsing the API error response.\n    set_cause: bool\n        Whether to include the original lakeFS API error in the resulting traceback.\n\n    Yields\n    ------\n    None\n        An empty generator, to be used as a context manager.\n\n    Raises\n    ------\n    OSError\n        Translated error from the lakeFS API call, if any.\n    \"\"\"\n    try:\n        yield\n    except ServerException as e:\n        raise translate_lakefs_error(e, rpath=rpath, message=message, set_cause=set_cause)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.checksum","title":"checksum","text":"
checksum(path: str | PathLike[str]) -> str | None\n

Get a remote lakeFS file object's checksum.

This is usually its MD5 hash, unless another hash function was used on upload.

PARAMETER DESCRIPTION path

The remote path to look up the lakeFS checksum for. Must point to a single file object.

TYPE: str | PathLike[str]

RETURNS DESCRIPTION str | None

The remote file's checksum, or None if path points to a directory or does not exist.

Source code in src/lakefs_spec/spec.py
def checksum(self, path: str | os.PathLike[str]) -> str | None:\n    \"\"\"\n    Get a remote lakeFS file object's checksum.\n\n    This is usually its MD5 hash, unless another hash function was used on upload.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path to look up the lakeFS checksum for. Must point to a single file object.\n\n    Returns\n    -------\n    str | None\n        The remote file's checksum, or ``None`` if ``path`` points to a directory or does not exist.\n    \"\"\"\n    path = stringify_path(path)\n    try:\n        return self.info(path).get(\"checksum\")\n    except FileNotFoundError:\n        return None\n
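For example (repository and object names are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# An MD5 hex digest for existing objects, or None if the path is missing or a directory.\nprint(fs.checksum(\"my-repo/main/data/weather-2010.json\"))\n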
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.exists","title":"exists","text":"
exists(path: str | PathLike[str], **kwargs: Any) -> bool\n

Check existence of a remote path in a lakeFS repository.

Input paths can either be files or directories.

PARAMETER DESCRIPTION path

The remote path whose existence to check. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION bool

True if the requested path exists, False if it does not.

RAISES DESCRIPTION PermissionError

If the user does not have sufficient permissions to query object existence.

Source code in src/lakefs_spec/spec.py
def exists(self, path: str | os.PathLike[str], **kwargs: Any) -> bool:\n    \"\"\"\n    Check existence of a remote path in a lakeFS repository.\n\n    Input paths can either be files or directories.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path whose existence to check. Must be a fully qualified lakeFS URI.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Returns\n    -------\n    bool\n        ``True`` if the requested path exists, ``False`` if it does not.\n\n    Raises\n    ------\n    PermissionError\n        If the user does not have sufficient permissions to query object existence.\n    \"\"\"\n    path = stringify_path(path)\n    repository, ref, resource = parse(path)\n    try:\n        reference = lakefs.Reference(repository, ref, client=self.client)\n        return reference.object(resource).exists()\n    except ServerException as e:\n        # in case of an error other than \"not found\", existence cannot be\n        # decided, so raise the translated error.\n        raise translate_lakefs_error(e)\n
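A short sketch, using placeholder names:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Works for both single objects and directory-like prefixes.\nif fs.exists(\"my-repo/main/data/weather-2010.json\"):\n    print(\"object present on main\")\n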
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.cp_file","title":"cp_file","text":"
cp_file(path1: str | PathLike[str], path2: str | PathLike[str], **kwargs: Any) -> None\n

Copy a single file from one remote location to another in lakeFS.

PARAMETER DESCRIPTION path1

The remote file location to be copied.

TYPE: str | PathLike[str]

path2

The (remote) target location to which to copy the file.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RAISES DESCRIPTION ValueError

When attempting to copy objects between repositories.

Source code in src/lakefs_spec/spec.py
def cp_file(\n    self, path1: str | os.PathLike[str], path2: str | os.PathLike[str], **kwargs: Any\n) -> None:\n    \"\"\"\n    Copy a single file from one remote location to another in lakeFS.\n\n    Parameters\n    ----------\n    path1: str | os.PathLike[str]\n        The remote file location to be copied.\n    path2: str | os.PathLike[str]\n        The (remote) target location to which to copy the file.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Raises\n    ------\n    ValueError\n        When attempting to copy objects between repositories.\n    \"\"\"\n    path1 = stringify_path(path1)\n    path2 = stringify_path(path2)\n    if path1 == path2:\n        return\n\n    orig_repo, orig_ref, orig_path = parse(path1)\n    dest_repo, dest_ref, dest_path = parse(path2)\n\n    if orig_repo != dest_repo:\n        raise ValueError(\n            \"can only copy objects within a repository, but got source \"\n            f\"repository {orig_repo!r} and destination repository {dest_repo!r}\"\n        )\n\n    with self.wrapped_api_call():\n        reference = lakefs.Reference(orig_repo, orig_ref, client=self.client)\n        reference.object(orig_path).copy(dest_ref, dest_path)\n
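For example, copying an object between two branches of the same repository (all names are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Source and target must live in the same repository; only the ref and path may differ.\nfs.cp_file(\"my-repo/main/data.csv\", \"my-repo/dev/data.csv\")\n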
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.get_file","title":"get_file","text":"
get_file(\n    rpath: str | PathLike[str],\n    lpath: str | PathLike[str],\n    callback: Callback = _DEFAULT_CALLBACK,\n    outfile: Any = None,\n    precheck: bool = True,\n    **kwargs: Any\n) -> None\n

Download a single file from a remote lakeFS server to local storage.

PARAMETER DESCRIPTION rpath

The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.

TYPE: str | PathLike[str]

lpath

The local path on disk to save the downloaded file to.

TYPE: str | PathLike[str]

callback

An fsspec callback to use during the operation. Can be used to report download progress.

TYPE: Callback DEFAULT: _DEFAULT_CALLBACK

outfile

A file-like object to save the downloaded content to. Can be used in place of lpath.

TYPE: Any DEFAULT: None

precheck

Check if lpath already exists and compare its checksum with that of rpath, skipping the download if they match.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments passed to AbstractFileSystem.open().

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
def get_file(\n    self,\n    rpath: str | os.PathLike[str],\n    lpath: str | os.PathLike[str],\n    callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n    outfile: Any = None,\n    precheck: bool = True,\n    **kwargs: Any,\n) -> None:\n    \"\"\"\n    Download a single file from a remote lakeFS server to local storage.\n\n    Parameters\n    ----------\n    rpath: str | os.PathLike[str]\n        The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.\n    lpath: str | os.PathLike[str]\n        The local path on disk to save the downloaded file to.\n    callback: fsspec.callbacks.Callback\n        An fsspec callback to use during the operation. Can be used to report download progress.\n    outfile: Any\n        A file-like object to save the downloaded content to. Can be used in place of ``lpath``.\n    precheck: bool\n        Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n    **kwargs: Any\n        Additional keyword arguments passed to ``AbstractFileSystem.open()``.\n    \"\"\"\n    rpath = stringify_path(rpath)\n    lpath = stringify_path(lpath)\n\n    if precheck and Path(lpath).is_file():\n        local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n        remote_checksum = self.info(rpath).get(\"checksum\")\n        if local_checksum == remote_checksum:\n            logger.info(\n                f\"Skipping download of resource {rpath!r} to local path {lpath!r}: \"\n                f\"Resource {lpath!r} exists and checksums match.\"\n            )\n            return\n\n    with self.wrapped_api_call(rpath=rpath):\n        super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs)\n
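A minimal sketch with placeholder paths:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# With precheck=True (the default), the transfer is skipped if a local copy with a matching checksum exists.\nfs.get_file(\"my-repo/main/data.csv\", \"data.csv\")\n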
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.info","title":"info","text":"
info(path: str | PathLike[str], **kwargs: Any) -> dict[str, Any]\n

Query a remote lakeFS object's metadata.

PARAMETER DESCRIPTION path

The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.ls() if path points to a directory.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION dict[str, Any]

A dictionary containing metadata on the object, including its full remote path and object type (file or directory).

RAISES DESCRIPTION FileNotFoundError

If the path refers to a non-file path that does not exist in the repository.

Source code in src/lakefs_spec/spec.py
def info(self, path: str | os.PathLike[str], **kwargs: Any) -> dict[str, Any]:\n    \"\"\"\n    Query a remote lakeFS object's metadata.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.ls()`` if ``path`` points to a directory.\n\n    Returns\n    -------\n    dict[str, Any]\n        A dictionary containing metadata on the object, including its full remote path and object type (file or directory).\n\n    Raises\n    ------\n    FileNotFoundError\n        If the ``path`` refers to a non-file path that does not exist in the repository.\n    \"\"\"\n    path = stringify_path(path)\n    repository, ref, resource = parse(path)\n    # first, try with `stat_object` in case of a file.\n    # the condition below checks edge cases of resources that cannot be files.\n    if resource and not resource.endswith(\"/\"):\n        try:\n            reference = lakefs.Reference(repository, ref, client=self.client)\n            res = reference.object(resource).stat()\n            return {\n                \"checksum\": res.checksum,\n                \"content-type\": res.content_type,\n                \"mtime\": res.mtime,\n                \"name\": f\"{repository}/{ref}/{res.path}\",\n                \"size\": res.size_bytes,\n                \"type\": \"file\",\n            }\n        except NotFoundException:\n            # fall through, retry with `ls` if it's a directory.\n            pass\n        except ServerException as e:\n            raise translate_lakefs_error(e, rpath=path)\n\n    out = self.ls(path, detail=True, recursive=True, **kwargs)\n    if not out:\n        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)\n\n    return {\n        \"name\": path.rstrip(\"/\"),\n        \"size\": sum(o.get(\"size\") or 0 for o in out),\n        \"type\": \"directory\",\n    }\n
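For example (placeholder names):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmeta = fs.info(\"my-repo/main/data.csv\")\nprint(meta[\"type\"], meta[\"size\"])  # e.g. 'file' and the object size in bytes\n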
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.ls","title":"ls","text":"
ls(\n    path: str | PathLike[str], detail: bool = True, **kwargs: Any\n) -> list[str] | list[dict[str, Any]]\n

List all available objects under a given path in lakeFS.

PARAMETER DESCRIPTION path

The path under which to list objects. Must be a fully qualified lakeFS URI. Can also point to a file, in which case the file's metadata will be returned.

TYPE: str | PathLike[str]

detail

Whether to obtain all metadata on the requested objects or just their names.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments for fsspec compatibility.

In particular: refresh (bool): whether to skip the directory listing cache; recursive (bool): whether to list subdirectory contents recursively.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION list[str] | list[dict[str, Any]]

A list of all objects' metadata under the given remote path if detail=True, or alternatively only their names if detail=False.

Source code in src/lakefs_spec/spec.py
def ls(\n    self,\n    path: str | os.PathLike[str],\n    detail: bool = True,\n    **kwargs: Any,\n) -> list[str] | list[dict[str, Any]]:\n    \"\"\"\n    List all available objects under a given path in lakeFS.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The path under which to list objects. Must be a fully qualified lakeFS URI.\n        Can also point to a file, in which case the file's metadata will be returned.\n    detail: bool\n        Whether to obtain all metadata on the requested objects or just their names.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility.\n\n        In particular:\n            `refresh: bool`: whether to skip the directory listing cache,\n            `recursive: bool`: whether to list subdirectory contents recursively\n\n    Returns\n    -------\n    list[str] | list[dict[str, Any]]\n        A list of all objects' metadata under the given remote path if ``detail=True``, or alternatively only their names if ``detail=False``.\n    \"\"\"\n    path = self._strip_protocol(path)\n    repository, ref, prefix = parse(path)\n\n    recursive = kwargs.pop(\"recursive\", False)\n\n    # Try lookup in dircache unless explicitly disabled by `refresh=True` kwarg\n    use_dircache = not kwargs.pop(\"refresh\", False)\n\n    if use_dircache:\n        cache_entry: list[Any] | None = None\n        try:\n            cache_entry = self._ls_from_cache(path, recursive=recursive)\n        except FileNotFoundError:\n            # we patch files missing from an ls call in the cache entry below,\n            # so this should not be an error.\n            pass\n\n        if cache_entry is not None:\n            if not detail:\n                return [e[\"name\"] for e in cache_entry]\n            return cache_entry[:]\n\n    kwargs[\"prefix\"] = prefix\n\n    info = []\n    # stat infos are either the path only (`detail=False`) or a dict full of metadata\n    delimiter = \"\" if recursive else \"/\"\n    reference = lakefs.Reference(repository, ref, client=self.client)\n\n    with self.wrapped_api_call(rpath=path):\n        for obj in reference.objects(prefix=prefix, delimiter=delimiter):\n            if isinstance(obj, CommonPrefix):\n                # prefixes are added below.\n                info.append(\n                    {\n                        \"name\": f\"{repository}/{ref}/{obj.path}\",\n                        \"size\": 0,\n                        \"type\": \"directory\",\n                    }\n                )\n            elif isinstance(obj, ObjectInfo):\n                info.append(\n                    {\n                        \"checksum\": obj.checksum,\n                        \"content-type\": obj.content_type,\n                        \"mtime\": obj.mtime,\n                        \"name\": f\"{repository}/{ref}/{obj.path}\",\n                        \"size\": obj.size_bytes,\n                        \"type\": \"object\",\n                    }\n                )\n\n    # Retry the API call with appended slash if the current result\n    # is just a single directory entry only (not its contents).\n    # This is useful to allow `ls(\"repo/branch/dir\")` calls without a trailing slash.\n    if len(info) == 1 and info[0][\"type\"] == \"directory\" and info[0][\"name\"] == path + \"/\":\n        return self.ls(\n            path + \"/\",\n            detail=detail,\n            **kwargs | {\"refresh\": not use_dircache, \"recursive\": recursive},\n        )\n\n    if recursive:\n        # To make 
recursive ls behave identical to the non-recursive case,\n        # add back virtual `directory` entries, which are only returned by\n        # the lakeFS API when querying non-recursively.\n        here = self._strip_protocol(path).rstrip(\"/\")\n        subdirs = {parent for o in info if (parent := self._parent(o[\"name\"])) != here}\n        for subdir in subdirs:\n            info.append(\n                {\n                    \"name\": subdir + \"/\",\n                    \"size\": 0,\n                    \"type\": \"directory\",\n                }\n            )\n\n    if info:\n        self._update_dircache(info[:])\n\n    if not detail:\n        info = [o[\"name\"] for o in info]  # type: ignore\n\n    return info\n
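For example, listing a placeholder prefix:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# detail=False yields names only; the default detail=True yields metadata dictionaries.\nfor name in fs.ls(\"my-repo/main/data/\", detail=False):\n    print(name)\n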
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.open","title":"open","text":"
open(\n    path: str | PathLike[str],\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n    pre_sign: bool = False,\n    content_type: str | None = None,\n    metadata: dict[str, str] | None = None,\n    autocommit: bool = True,\n    **kwargs: Any\n) -> LakeFSIOBase\n

Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on mode.

PARAMETER DESCRIPTION path

The remote path for which to open a local LakeFSFile. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

mode

The file mode indicating its purpose. Use r/rb for downloads from lakeFS, w/wb/x/xb for uploads to lakeFS.

TYPE: Literal['r', 'rb', 'rt', 'w', 'wb', 'wt', 'x', 'xb', 'xt'] DEFAULT: 'rb'

pre_sign

Whether to use a pre-signed URL for the file up-/download.

TYPE: bool DEFAULT: False

content_type

Content type to use for the file, relevant for uploads only.

TYPE: str | None DEFAULT: None

metadata

Additional metadata to attach to the file, relevant for uploads only.

TYPE: dict[str, str] | None DEFAULT: None

autocommit

Whether to process the file immediately instead of queueing it for the transaction when inside a transaction context.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION LakeFSIOBase

A local file-like object ready to hold data to be received from / sent to a lakeFS server.

RAISES DESCRIPTION NotImplementedError

If mode is not supported.

Source code in src/lakefs_spec/spec.py
def open(\n    self,\n    path: str | os.PathLike[str],\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n    pre_sign: bool = False,\n    content_type: str | None = None,\n    metadata: dict[str, str] | None = None,\n    autocommit: bool = True,\n    **kwargs: Any,\n) -> LakeFSIOBase:\n    \"\"\"\n    Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on ``mode``.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path for which to open a local ``LakeFSFile``. Must be a fully qualified lakeFS URI.\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"]\n        The file mode indicating its purpose. Use ``r/rb`` for downloads from lakeFS, ``w/wb/x/xb`` for uploads to lakeFS.\n    pre_sign: bool\n        Whether to use a pre-signed URL for the file up-/download.\n    content_type: str | None\n        Content type to use for the file, relevant for uploads only.\n    metadata: dict[str, str] | None\n        Additional metadata to attach to the file, relevant for uploads only.\n    autocommit: bool\n        Whether to process the file immediately instead of queueing it for transaction while in a transaction context.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Returns\n    -------\n    LakeFSIOBase\n        A local file-like object ready to hold data to be received from / sent to a lakeFS server.\n\n    Raises\n    ------\n    NotImplementedError\n        If ``mode`` is not supported.\n    \"\"\"\n    if mode.endswith(\"t\"):\n        # text modes {r,w,x}t are equivalent to {r,w,x} here respectively.\n        mode = mode[:-1]  # type: ignore\n\n    if mode not in {\"r\", \"rb\", \"w\", \"wb\", \"x\", \"xb\"}:\n        raise NotImplementedError(f\"unsupported mode {mode!r}\")\n\n    path = stringify_path(path)\n    repo, ref, resource = parse(path)\n\n    if mode.startswith(\"r\"):\n        reference = lakefs.Reference(repo, ref, client=self.client)\n        obj = reference.object(resource)\n\n        if not obj.exists():\n            raise FileNotFoundError(path)\n        handler = ObjectReader(obj, mode=mode, pre_sign=pre_sign, client=self.client)\n    else:\n        # for writing ops, ref must be a branch\n        branch = lakefs.Branch(repo, ref, client=self.client)\n        if self.create_branch_ok:\n            branch.create(self.source_branch, exist_ok=True)\n\n        obj = branch.object(resource)\n        handler = ObjectWriter(\n            obj,\n            mode=mode,\n            pre_sign=pre_sign,\n            content_type=content_type,\n            metadata=metadata,\n            client=self.client,\n        )\n\n    ac = kwargs.pop(\"autocommit\", not self._intrans)\n    if not ac and \"r\" not in mode:\n        self._transaction.files.append(handler)\n\n    return handler\n
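A usage sketch for reading and writing (placeholder paths; write modes require the ref to be a branch):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.open(\"my-repo/main/data.csv\", \"rb\") as f:\n    header = f.read(100)\n\n# The written object is uploaded when the file handle is closed.\nwith fs.open(\"my-repo/main/hello.txt\", \"wb\") as f:\n    f.write(b\"Hello, lakeFS!\")\n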
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.put_file","title":"put_file","text":"
put_file(\n    lpath: str | PathLike[str],\n    rpath: str | PathLike[str],\n    callback: Callback = _DEFAULT_CALLBACK,\n    precheck: bool = True,\n    **kwargs: Any\n) -> None\n

Upload a local file to a remote location on a lakeFS server.

Note that depending on the block store type, additional settings such as credentials may need to be configured when use_blockstore=True and presign=False.

PARAMETER DESCRIPTION lpath

The local path on disk to upload to the lakeFS server.

TYPE: str | PathLike[str]

rpath

The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

callback

An fsspec callback to use during the operation. Can be used to report upload progress.

TYPE: Callback DEFAULT: _DEFAULT_CALLBACK

precheck

Check whether rpath already exists on the server and compare its checksum with that of the local file lpath, skipping the upload if they match.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.open().

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
def put_file(\n    self,\n    lpath: str | os.PathLike[str],\n    rpath: str | os.PathLike[str],\n    callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n    precheck: bool = True,\n    **kwargs: Any,\n) -> None:\n    \"\"\"\n    Upload a local file to a remote location on a lakeFS server.\n\n    Note that depending on the block store type, additional configuration like credentials may need to be configured when ``use_blockstore=True`` and ``presign=False``.\n\n    Parameters\n    ----------\n    lpath: str | os.PathLike[str]\n        The local path on disk to upload to the lakeFS server.\n    rpath: str | os.PathLike[str]\n        The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.\n    callback: fsspec.callbacks.Callback\n        An fsspec callback to use during the operation. Can be used to report download progress.\n    precheck: bool\n        Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n    \"\"\"\n    lpath = stringify_path(lpath)\n    rpath = stringify_path(rpath)\n\n    if precheck and Path(lpath).is_file():\n        remote_checksum = self.checksum(rpath)\n        local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n        if local_checksum == remote_checksum:\n            logger.info(\n                f\"Skipping upload of resource {lpath!r} to remote path {rpath!r}: \"\n                f\"Resource {rpath!r} exists and checksums match.\"\n            )\n            return\n\n    with self.wrapped_api_call(rpath=rpath):\n        super().put_file(lpath, rpath, callback=callback, **kwargs)\n
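For example, with placeholder paths:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# With precheck=True (the default), the upload is skipped if the remote checksum already matches.\nfs.put_file(\"data.csv\", \"my-repo/main/data.csv\")\n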
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.rm_file","title":"rm_file","text":"
rm_file(path: str | PathLike[str]) -> None\n

Stage a remote file for removal on a lakeFS server.

The file will not actually be removed from the requested branch until a commit is created.

PARAMETER DESCRIPTION path

The remote file to delete. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

Source code in src/lakefs_spec/spec.py
def rm_file(self, path: str | os.PathLike[str]) -> None:  # pragma: no cover\n    \"\"\"\n    Stage a remote file for removal on a lakeFS server.\n\n    The file will not actually be removed from the requested branch until a commit is created.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote file to delete. Must be a fully qualified lakeFS URI.\n    \"\"\"\n    self.rm(path)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.rm","title":"rm","text":"
rm(path: str | PathLike[str], recursive: bool = False, maxdepth: int | None = None) -> None\n

Stage multiple remote files for removal on a lakeFS server.

The files will not actually be removed from the requested branch until a commit is created.

PARAMETER DESCRIPTION path

File(s) to delete.

TYPE: str | PathLike[str]

recursive

If file(s) include nested directories, recursively delete their contents.

TYPE: bool DEFAULT: False

maxdepth

Depth to pass to walk for finding files to delete, if recursive. If None, there will be no limit and infinite recursion may be possible.

TYPE: int | None DEFAULT: None

Source code in src/lakefs_spec/spec.py
def rm(\n    self, path: str | os.PathLike[str], recursive: bool = False, maxdepth: int | None = None\n) -> None:\n    \"\"\"\n    Stage multiple remote files for removal on a lakeFS server.\n\n    The files will not actually be removed from the requested branch until a commit is created.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        File(s) to delete.\n    recursive: bool\n        If file(s) include nested directories, recursively delete their contents.\n    maxdepth: int | None\n        Depth to pass to walk for finding files to delete, if recursive.\n        If None, there will be no limit and infinite recursion may be\n        possible.\n    \"\"\"\n\n    path = stringify_path(path)\n    repository, ref, prefix = parse(path)\n\n    with self.wrapped_api_call(rpath=path):\n        branch = lakefs.Branch(repository, ref, client=self.client)\n        objgen = branch.objects(prefix=prefix, delimiter=\"\" if recursive else \"/\")\n        if maxdepth is None:\n            branch.delete_objects(obj.path for obj in objgen)\n        else:\n            # nesting level is just the amount of \"/\"s in the path, no leading \"/\".\n            branch.delete_objects(obj.path for obj in objgen if obj.path.count(\"/\") <= maxdepth)\n\n        # Directory listing cache for the containing folder must be invalidated\n        self.dircache.pop(self._parent(path), None)\n
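For example, with a placeholder prefix:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Stages every object under the prefix for deletion; a commit is still needed to persist it.\nfs.rm(\"my-repo/main/data/\", recursive=True)\n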
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.touch","title":"touch","text":"
touch(path: str | PathLike[str], truncate: bool = True, **kwargs: Any) -> None\n

Create an empty file or update an existing file on a lakeFS server.

PARAMETER DESCRIPTION path

The file path to create or update. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

truncate

Whether to set the file size to 0 (zero) bytes, even if the path already exists.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.open().

TYPE: Any DEFAULT: {}

RAISES DESCRIPTION NotImplementedError

If the targeted lakeFS server version does not support touch() operations.

Source code in src/lakefs_spec/spec.py
def touch(self, path: str | os.PathLike[str], truncate: bool = True, **kwargs: Any) -> None:\n    \"\"\"\n    Create an empty file or update an existing file on a lakeFS server.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The file path to create or update. Must be a fully qualified lakeFS URI.\n    truncate: bool\n        Whether to set the file size to 0 (zero) bytes, even if the path already exists.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n\n    Raises\n    ------\n    NotImplementedError\n        If the targeted lakeFS server version does not support `touch()` operations.\n    \"\"\"\n\n    # empty buffer upload errors were fixed in https://github.com/treeverse/lakeFS/issues/7130,\n    # which was first released in lakeFS v1.3.1.\n    if self._lakefs_server_version < (1, 3, 1):\n        version_string = \".\".join(str(v) for v in self._lakefs_server_version)\n        raise NotImplementedError(\n            \"LakeFSFileSystem.touch() is not supported for your lakeFS server version. \"\n            f\"minimum required version: '1.3.1', actual version: {version_string!r}\"\n        )\n\n    super().touch(path=path, truncate=truncate, **kwargs)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.tail","title":"tail","text":"
tail(path: str | PathLike[str], size: int = 1024) -> bytes\n

Get the last size bytes from a remote file.

PARAMETER DESCRIPTION path

The file path to read. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

size

The number of bytes to read from the end of the file.

TYPE: int DEFAULT: 1024

RETURNS DESCRIPTION bytes

The bytes at the end of the requested file.

Source code in src/lakefs_spec/spec.py
def tail(self, path: str | os.PathLike[str], size: int = 1024) -> bytes:\n    \"\"\"\n    Get the last ``size`` bytes from a remote file.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The file path to read. Must be a fully qualified lakeFS URI.\n    size: int\n        The amount of bytes to get.\n\n    Returns\n    -------\n    bytes\n        The bytes at the end of the requested file.\n    \"\"\"\n    f: ObjectReader\n    with self.open(path, \"rb\") as f:\n        f.seek(max(-size, -f._obj.stat().size_bytes), 2)\n        return f.read()\n
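For example, with a placeholder path:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Fetch only the last kilobyte of a potentially large object.\nprint(fs.tail(\"my-repo/main/logs/app.log\", size=1024))\n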
"},{"location":"reference/lakefs_spec/transaction/","title":"transaction","text":"

Functionality for extended lakeFS transactions to conduct versioning operations between file uploads.

"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction","title":"LakeFSTransaction","text":"

Bases: Transaction

A lakeFS transaction model capable of versioning operations in between file uploads.

PARAMETER DESCRIPTION fs

The lakeFS file system associated with the transaction.

TYPE: 'LakeFSFileSystem'

Source code in src/lakefs_spec/transaction.py
class LakeFSTransaction(Transaction):\n    \"\"\"\n    A lakeFS transaction model capable of versioning operations in between file uploads.\n\n    Parameters\n    ----------\n    fs: LakeFSFileSystem\n        The lakeFS file system associated with the transaction.\n    \"\"\"\n\n    def __init__(\n        self,\n        fs: \"LakeFSFileSystem\",\n    ):\n        super().__init__(fs=fs)\n        self.fs: \"LakeFSFileSystem\"\n        self.files: deque[ObjectWriter] = deque(self.files)\n\n        self.repository: str | None = None\n        self.base_branch: Branch | None = None\n        self.automerge: bool = False\n        self.delete: Literal[\"onsuccess\", \"always\", \"never\"] = \"onsuccess\"\n        self._ephemeral_branch: Branch | None = None\n\n    def __call__(\n        self,\n        repository: str | Repository,\n        base_branch: str | Branch = \"main\",\n        branch_name: str | None = None,\n        automerge: bool = True,\n        delete: Literal[\"onsuccess\", \"always\", \"never\"] = \"onsuccess\",\n    ) -> \"LakeFSTransaction\":\n        \"\"\"\n        Creates an ephemeral branch, conducts all uploads and operations on that branch,\n        and optionally merges it back into the source branch.\n\n        repository: str | Repository\n            The repository in which to conduct the transaction.\n        base_branch: str | Branch\n            The branch on which the transaction operations should be based.\n        automerge: bool\n            Automatically merge the ephemeral branch into the base branch after successful\n            transaction completion.\n        delete: Literal[\"onsuccess\", \"always\", \"never\"]\n            Cleanup policy / deletion handling for the ephemeral branch after the transaction.\n\n            If ``\"onsuccess\"``, the branch is deleted if the transaction succeeded,\n            or left over if an error occurred.\n\n            If ``\"always\"``, the ephemeral branch is always deleted after transaction regardless of success\n            or failure.\n\n            If ``\"never\"``, the transaction branch is always left in the repository.\n        \"\"\"\n\n        if isinstance(repository, str):\n            self.repository = repository\n        else:\n            self.repository = repository.id\n\n        repo = lakefs.Repository(self.repository, client=self.fs.client)\n        try:\n            _ = repo.metadata\n        except ServerException:\n            raise ValueError(f\"repository {self.repository!r} does not exist\") from None\n\n        # base branch needs to be a lakefs.Branch, since it is being diffed\n        # with the ephemeral branch in __exit__.\n        self.base_branch = _ensurebranch(base_branch, self.repository, self.fs.client)\n\n        self.automerge = automerge\n        self.delete = delete\n\n        ephem_name = branch_name or \"transaction-\" + \"\".join(random.choices(string.digits, k=6))  # nosec: B311\n        self._ephemeral_branch = Branch(self.repository, ephem_name, client=self.fs.client)\n        return self\n\n    def __enter__(self):\n        logger.debug(\n            f\"Creating ephemeral branch {self._ephemeral_branch.id!r} \"\n            f\"from branch {self.base_branch.id!r}.\"\n        )\n        self._ephemeral_branch.create(self.base_branch, exist_ok=False)\n        self.fs._intrans = True\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        success = exc_type is None\n        while self.files:\n            # fsspec base class calls `append` on the file, 
which means we\n            # have to pop from the left to preserve order.\n            f = self.files.popleft()\n            if not success:\n                f.discard()\n\n        self.fs._intrans = False\n        self.fs._transaction = None\n\n        if any(self._ephemeral_branch.uncommitted()):\n            msg = f\"Finished transaction on branch {self._ephemeral_branch.id!r} with uncommitted changes.\"\n            if self.delete != \"never\":\n                msg += \" Objects added but not committed are lost.\"\n            warnings.warn(msg)\n\n        if success and self.automerge:\n            if any(self.base_branch.diff(self._ephemeral_branch)):\n                self._ephemeral_branch.merge_into(self.base_branch)\n        if self.delete == \"always\" or (success and self.delete == \"onsuccess\"):\n            self._ephemeral_branch.delete()\n\n    @property\n    def branch(self):\n        return self._ephemeral_branch\n\n    def commit(self, message: str, metadata: dict[str, str] | None = None) -> Reference:\n        \"\"\"\n        Create a commit on this transaction's ephemeral branch with a commit message\n        and attached metadata.\n\n        Parameters\n        ----------\n        message: str\n            The commit message to attach to the newly created commit.\n        metadata: dict[str, str] | None\n            Optional metadata to enrich the created commit with (author, e-mail, ...).\n\n        Returns\n        -------\n        Reference\n            The created commit.\n        \"\"\"\n\n        diff = list(self.branch.uncommitted())\n\n        if not diff:\n            logger.warning(f\"No changes to commit on branch {self.branch.id!r}.\")\n            return self.branch.head\n\n        return self.branch.commit(message, metadata=metadata)\n\n    def merge(self, source_ref: str | Branch, into: str | Branch) -> Commit:\n        \"\"\"\n        Merge a branch into another branch in a repository.\n\n        In case the branch contains no changes relevant to the target branch,\n        no merge happens, and the tip of the target branch is returned instead.\n\n        Parameters\n        ----------\n        source_ref: str | Branch\n            Source reference containing the changes to merge.\n            Can be a branch name or partial commit SHA.\n        into: str | Branch\n            Target branch into which the changes will be merged.\n\n        Returns\n        -------\n        Commit\n            Either the created merge commit, or the head commit of the target branch.\n        \"\"\"\n        source = _ensurebranch(source_ref, self.repository, self.fs.client)\n        dest = _ensurebranch(into, self.repository, self.fs.client)\n\n        if any(dest.diff(source)):\n            source.merge_into(dest)\n        return dest.head.get_commit()\n\n    def revert(self, branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit:\n        \"\"\"\n        Revert a previous commit on a branch.\n\n        Parameters\n        ----------\n        branch: str | Branch\n            Branch on which the commit should be reverted.\n        ref: ReferenceType\n            The reference to revert.\n        parent_number: int\n            If there are multiple parents to a commit, specify to which parent\n            the commit should be reverted. 
``parent_number = 1`` (the default)\n            refers to the first parent commit of the current ``branch`` tip.\n\n        Returns\n        -------\n        Commit\n            The created revert commit.\n        \"\"\"\n\n        b = _ensurebranch(branch, self.repository, self.fs.client)\n\n        ref_id = ref if isinstance(ref, str) else ref.id\n        b.revert(ref_id, parent_number=parent_number)\n        return b.head.get_commit()\n\n    def rev_parse(self, ref: ReferenceType) -> Commit:\n        \"\"\"\n        Parse a given lakeFS reference expression and obtain its corresponding commit.\n\n        Parameters\n        ----------\n        ref: ReferenceType\n            Reference object to resolve, can be a branch, commit SHA, or tag.\n\n        Returns\n        -------\n        Commit\n            The commit referenced by the expression ``ref``.\n        \"\"\"\n\n        ref_id = ref.id if isinstance(ref, Reference) else ref\n        reference = lakefs.Reference(self.repository, ref_id, client=self.fs.client)\n        return reference.get_commit()\n\n    def tag(self, ref: ReferenceType, name: str) -> Tag:\n        \"\"\"\n        Create a tag referencing a commit in a repository.\n\n        Parameters\n        ----------\n        ref: ReferenceType\n            Commit SHA or placeholder for a reference or commit object\n            to which the new tag will point.\n        name: str\n            Name of the tag to be created.\n\n        Returns\n        -------\n        Tag\n            The requested tag.\n        \"\"\"\n\n        return lakefs.Tag(self.repository, name, client=self.fs.client).create(ref)\n
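A sketch combining a file upload with versioning operations inside a transaction; all repository, branch, file, and tag names are placeholders:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"my-repo\", \"main\", automerge=True) as tx:\n    fs.put_file(\"data.csv\", f\"my-repo/{tx.branch.id}/data.csv\")\n    commit = tx.commit(message=\"Add data.csv\")\n    tx.tag(commit, name=\"v1.0\")  # tag the freshly created commit\n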
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.commit","title":"commit","text":"
commit(message: str, metadata: dict[str, str] | None = None) -> Reference\n

Create a commit on this transaction's ephemeral branch with a commit message and attached metadata.

PARAMETER DESCRIPTION message

The commit message to attach to the newly created commit.

TYPE: str

metadata

Optional metadata to enrich the created commit with (author, e-mail, ...).

TYPE: dict[str, str] | None DEFAULT: None

RETURNS DESCRIPTION Reference

The created commit.

Source code in src/lakefs_spec/transaction.py
def commit(self, message: str, metadata: dict[str, str] | None = None) -> Reference:\n    \"\"\"\n    Create a commit on this transaction's ephemeral branch with a commit message\n    and attached metadata.\n\n    Parameters\n    ----------\n    message: str\n        The commit message to attach to the newly created commit.\n    metadata: dict[str, str] | None\n        Optional metadata to enrich the created commit with (author, e-mail, ...).\n\n    Returns\n    -------\n    Reference\n        The created commit.\n    \"\"\"\n\n    diff = list(self.branch.uncommitted())\n\n    if not diff:\n        logger.warning(f\"No changes to commit on branch {self.branch.id!r}.\")\n        return self.branch.head\n\n    return self.branch.commit(message, metadata=metadata)\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.merge","title":"merge","text":"
merge(source_ref: str | Branch, into: str | Branch) -> Commit\n

Merge a branch into another branch in a repository.

In case the branch contains no changes relevant to the target branch, no merge happens, and the tip of the target branch is returned instead.

PARAMETER DESCRIPTION source_ref

Source reference containing the changes to merge. Can be a branch name or partial commit SHA.

TYPE: str | Branch

into

Target branch into which the changes will be merged.

TYPE: str | Branch

RETURNS DESCRIPTION Commit

Either the created merge commit, or the head commit of the target branch.

Source code in src/lakefs_spec/transaction.py
def merge(self, source_ref: str | Branch, into: str | Branch) -> Commit:\n    \"\"\"\n    Merge a branch into another branch in a repository.\n\n    In case the branch contains no changes relevant to the target branch,\n    no merge happens, and the tip of the target branch is returned instead.\n\n    Parameters\n    ----------\n    source_ref: str | Branch\n        Source reference containing the changes to merge.\n        Can be a branch name or partial commit SHA.\n    into: str | Branch\n        Target branch into which the changes will be merged.\n\n    Returns\n    -------\n    Commit\n        Either the created merge commit, or the head commit of the target branch.\n    \"\"\"\n    source = _ensurebranch(source_ref, self.repository, self.fs.client)\n    dest = _ensurebranch(into, self.repository, self.fs.client)\n\n    if any(dest.diff(source)):\n        source.merge_into(dest)\n    return dest.head.get_commit()\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.revert","title":"revert","text":"
revert(branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit\n

Revert a previous commit on a branch.

PARAMETER DESCRIPTION branch

Branch on which the commit should be reverted.

TYPE: str | Branch

ref

The reference to revert.

TYPE: ReferenceType

parent_number

If there are multiple parents to a commit, specify to which parent the commit should be reverted. parent_number = 1 (the default) refers to the first parent commit of the current branch tip.

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION Commit

The created revert commit.

Source code in src/lakefs_spec/transaction.py
def revert(self, branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit:\n    \"\"\"\n    Revert a previous commit on a branch.\n\n    Parameters\n    ----------\n    branch: str | Branch\n        Branch on which the commit should be reverted.\n    ref: ReferenceType\n        The reference to revert.\n    parent_number: int\n        If there are multiple parents to a commit, specify to which parent\n        the commit should be reverted. ``parent_number = 1`` (the default)\n        refers to the first parent commit of the current ``branch`` tip.\n\n    Returns\n    -------\n    Commit\n        The created revert commit.\n    \"\"\"\n\n    b = _ensurebranch(branch, self.repository, self.fs.client)\n\n    ref_id = ref if isinstance(ref, str) else ref.id\n    b.revert(ref_id, parent_number=parent_number)\n    return b.head.get_commit()\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.rev_parse","title":"rev_parse","text":"
rev_parse(ref: ReferenceType) -> Commit\n

Parse a given lakeFS reference expression and obtain its corresponding commit.

PARAMETER DESCRIPTION ref

Reference object to resolve, can be a branch, commit SHA, or tag.

TYPE: ReferenceType

RETURNS DESCRIPTION Commit

The commit referenced by the expression ref.

Source code in src/lakefs_spec/transaction.py
def rev_parse(self, ref: ReferenceType) -> Commit:\n    \"\"\"\n    Parse a given lakeFS reference expression and obtain its corresponding commit.\n\n    Parameters\n    ----------\n    ref: ReferenceType\n        Reference object to resolve, can be a branch, commit SHA, or tag.\n\n    Returns\n    -------\n    Commit\n        The commit referenced by the expression ``ref``.\n    \"\"\"\n\n    ref_id = ref.id if isinstance(ref, Reference) else ref\n    reference = lakefs.Reference(self.repository, ref_id, client=self.fs.client)\n    return reference.get_commit()\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.tag","title":"tag","text":"
tag(ref: ReferenceType, name: str) -> Tag\n

Create a tag referencing a commit in a repository.

PARAMETER DESCRIPTION ref

Commit SHA or placeholder for a reference or commit object to which the new tag will point.

TYPE: ReferenceType

name

Name of the tag to be created.

TYPE: str

RETURNS DESCRIPTION Tag

The requested tag.

Source code in src/lakefs_spec/transaction.py
def tag(self, ref: ReferenceType, name: str) -> Tag:\n    \"\"\"\n    Create a tag referencing a commit in a repository.\n\n    Parameters\n    ----------\n    ref: ReferenceType\n        Commit SHA or placeholder for a reference or commit object\n        to which the new tag will point.\n    name: str\n        Name of the tag to be created.\n\n    Returns\n    -------\n    Tag\n        The requested tag.\n    \"\"\"\n\n    return lakefs.Tag(self.repository, name, client=self.fs.client).create(ref)\n
"},{"location":"reference/lakefs_spec/util/","title":"util","text":"

Useful utilities for handling lakeFS URIs and results of lakeFS API calls.

"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.depaginate","title":"depaginate","text":"
depaginate(\n    api: Callable[..., PaginatedApiResponse], *args: Any, **kwargs: Any\n) -> Generator[Any, None, None]\n

Unwrap the responses from a paginated lakeFS API method into a generator.

PARAMETER DESCRIPTION api

The lakeFS client API to call. Must return a paginated response with the pagination and results fields set.

TYPE: Callable[..., PaginatedApiResponse]

*args

Positional arguments to pass to the API call.

TYPE: Any DEFAULT: ()

**kwargs

Keyword arguments to pass to the API call.

TYPE: Any DEFAULT: {}

YIELDS DESCRIPTION Any

The obtained API result objects.

Source code in src/lakefs_spec/util.py
def depaginate(\n    api: Callable[..., PaginatedApiResponse], *args: Any, **kwargs: Any\n) -> Generator[Any, None, None]:\n    \"\"\"\n    Unwrap the responses from a paginated lakeFS API method into a generator.\n\n    Parameters\n    ----------\n    api: Callable[..., PaginatedApiResponse]\n        The lakeFS client API to call. Must return a paginated response with the ``pagination`` and ``results`` fields set.\n    *args: Any\n        Positional arguments to pass to the API call.\n    **kwargs: Any\n        Keyword arguments to pass to the API call.\n\n    Yields\n    ------\n    Any\n        The obtained API result objects.\n    \"\"\"\n    while True:\n        resp = api(*args, **kwargs)\n        yield from resp.results\n        if not resp.pagination.has_more:\n            break\n        kwargs[\"after\"] = resp.pagination.next_offset\n
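A runnable sketch of the expected contract; fake_list_objects below is a stand-in for a real paginated lakeFS client API method and exists only for illustration:

from types import SimpleNamespace\n\nfrom lakefs_spec.util import depaginate\n\n\ndef fake_list_objects(after=\"\"):\n    # Mimics a paginated response: a single page with no further results.\n    return SimpleNamespace(\n        results=[\"a.csv\", \"b.csv\"],\n        pagination=SimpleNamespace(has_more=False, next_offset=\"\"),\n    )\n\n\nprint(list(depaginate(fake_list_objects)))  # ['a.csv', 'b.csv']\n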
"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.md5_checksum","title":"md5_checksum","text":"
md5_checksum(lpath: str | PathLike[str], blocksize: int = 2 ** 22) -> str\n

Calculate a local file's MD5 hash.

PARAMETER DESCRIPTION lpath

The local path whose MD5 hash to calculate. Must be a file.

TYPE: str | PathLike[str]

blocksize

Block size (in bytes) to use while reading in the file.

TYPE: int DEFAULT: 2 ** 22

RETURNS DESCRIPTION str

The file's MD5 hash value, as a string.

Source code in src/lakefs_spec/util.py
def md5_checksum(lpath: str | os.PathLike[str], blocksize: int = 2**22) -> str:\n    \"\"\"\n    Calculate a local file's MD5 hash.\n\n    Parameters\n    ----------\n    lpath: str | os.PathLike[str]\n        The local path whose MD5 hash to calculate. Must be a file.\n    blocksize: int\n        Block size (in bytes) to use while reading in the file.\n\n    Returns\n    -------\n    str\n        The file's MD5 hash value, as a string.\n    \"\"\"\n    with open(lpath, \"rb\") as f:\n        file_hash = hashlib.md5(usedforsecurity=False)\n        chunk = f.read(blocksize)\n        while chunk:\n            file_hash.update(chunk)\n            chunk = f.read(blocksize)\n    return file_hash.hexdigest()\n
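For example (the file name is a placeholder):

from lakefs_spec.util import md5_checksum\n\n# Hex digest of a local file, e.g. for comparison against a remote object's checksum.\nprint(md5_checksum(\"data.csv\"))\n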
"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.parse","title":"parse","text":"
parse(path: str) -> tuple[str, str, str]\n

Parses a lakeFS URI in the form lakefs://<repo>/<ref>/<resource>.

PARAMETER DESCRIPTION path

String path, needs to conform to the lakeFS URI format described above. The <resource> part can be the empty string; the leading lakefs:// scheme may be omitted.

TYPE: str

RETURNS DESCRIPTION tuple[str, str, str]

A 3-tuple of repository name, reference, and resource name.

RAISES DESCRIPTION ValueError

If the path does not conform to the lakeFS URI format.

Source code in src/lakefs_spec/util.py
def parse(path: str) -> tuple[str, str, str]:\n    \"\"\"\n    Parses a lakeFS URI in the form ``lakefs://<repo>/<ref>/<resource>``.\n\n    Parameters\n    ----------\n    path: str\n        String path, needs to conform to the lakeFS URI format described above.\n        The ``<resource>`` part can be the empty string; the leading ``lakefs://`` scheme may be omitted.\n\n    Returns\n    -------\n    tuple[str, str, str]\n        A 3-tuple of repository name, reference, and resource name.\n\n    Raises\n    ------\n    ValueError\n        If the path does not conform to the lakeFS URI format.\n    \"\"\"\n\n    # First regex reflects the lakeFS repository naming rules:\n    # only lowercase letters, digits and dash, no leading dash, minimum 3, maximum 63 characters\n    # https://docs.lakefs.io/understand/model.html#repository\n    # Second regex is the branch: Only letters, digits, underscores and dash, no leading dash.\n    path_regex = re.compile(r\"(?:lakefs://)?([a-z0-9][a-z0-9\\-]{2,62})/(\\w[\\w\\-]*)/(.*)\")\n    results = path_regex.fullmatch(path)\n    if results is None:\n        raise ValueError(\n            f\"expected path with structure lakefs://<repo>/<ref>/<resource>, got {path!r}\"\n        )\n\n    repo, ref, resource = results.groups()\n    return repo, ref, resource\n
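For example:

from lakefs_spec.util import parse\n\nrepo, ref, resource = parse(\"lakefs://my-repo/main/data/weather-2010.json\")\nprint(repo, ref, resource)  # my-repo main data/weather-2010.json\n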
"},{"location":"tutorials/","title":"Tutorials","text":"

Info

We aim to provide additional tutorials in the future - contributions are welcome!

  • Quickstart example: Using lakeFS-spec as a file system
  • A fully-worked data science example: Using lakeFS-spec together with Pandas to train a classifier based on a public dataset and simulate additional data being collected
"},{"location":"tutorials/demo_data_science_project/","title":"Data Science with lakeFS-spec","text":"
%pip install numpy pandas scikit-learn\n
\nCollecting numpy\n  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n\n
\nCollecting pandas\n\n
\n  Downloading pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)\n\n
\nCollecting scikit-learn\n  Using cached scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n\n
\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from pandas) (2.8.2)\nRequirement already satisfied: pytz>=2020.1 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from pandas) (2024.1)\nCollecting tzdata>=2022.7 (from pandas)\n  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)\n\n
\nCollecting scipy>=1.6.0 (from scikit-learn)\n  Using cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\nCollecting joblib>=1.2.0 (from scikit-learn)\n\n
\n  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\nCollecting threadpoolctl>=2.0.0 (from scikit-learn)\n  Using cached threadpoolctl-3.3.0-py3-none-any.whl.metadata (13 kB)\nRequirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n\n
\nUsing cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)\nDownloading pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 0.0/13.0 MB ? eta -:--:--\n
\n\n   \u2501\u2501\u2501\u2501\u2578\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 1.6/13.0 MB 76.4 MB/s eta 0:00:01\n
\n\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u257a\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 6.2/13.0 MB 90.8 MB/s eta 0:00:01\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u257a\u2501\u2501\u2501 11.8/13.0 MB 123.4 MB/s eta 0:00:01\n
\n\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2578 13.0/13.0 MB 164.6 MB/s eta 0:00:01\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 13.0/13.0 MB 107.9 MB/s eta 0:00:00\nUsing cached scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n\n
\nUsing cached joblib-1.3.2-py3-none-any.whl (302 kB)\nUsing cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)\n\n
\nUsing cached threadpoolctl-3.3.0-py3-none-any.whl (17 kB)\nUsing cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)\n\n
\nInstalling collected packages: tzdata, threadpoolctl, numpy, joblib, scipy, pandas, scikit-learn\n\n
\nSuccessfully installed joblib-1.3.2 numpy-1.26.4 pandas-2.2.1 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0 tzdata-2024.1\n\n
\nNote: you may need to restart the kernel to use updated packages.\n\n

Also install an appropriate lakeFS-spec version, which can be either the latest release from PyPI via pip install --upgrade lakefs-spec, or the development version from GitHub via pip install git+https://github.com/aai-institute/lakefs-spec.git.
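For convenience, the corresponding notebook cell might simply read as follows (shown for the PyPI release, with the development install left as a comment):

%pip install --upgrade lakefs-spec
# or, for the development version:
# %pip install git+https://github.com/aai-institute/lakefs-spec.git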

import os\nimport tempfile\nimport urllib.request\nfrom pathlib import Path\n\nurllib.request.urlretrieve(\n    \"https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/docs/tutorials/.lakectl.yaml\",\n    os.path.expanduser(\"~/.lakectl.yaml\"),\n)\n
\n('/home/runner/.lakectl.yaml', <http.client.HTTPMessage at 0x7f781c8cc090>)\n

We can now instantiate the LakeFSFileSystem with the credentials we just downloaded. Alternatively, we could pass the credentials directly in code (a sketch of this follows the next cell). It is important that the credentials are available at the time of filesystem instantiation.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nREPO_NAME = \"weather\"\n
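As mentioned above, the credentials can also be passed directly when constructing the filesystem instead of being read from ~/.lakectl.yaml. A minimal sketch, assuming the host, username, and password keyword arguments; the values below are placeholders to replace with your own quickstart credentials:

# sketch: passing credentials explicitly instead of reading ~/.lakectl.yaml
fs = LakeFSFileSystem(
    host="http://localhost:8000",     # assumed local quickstart address
    username="<access-key-id>",       # placeholder, substitute your access key
    password="<secret-access-key>",   # placeholder, substitute your secret key
)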

We will create a repository using the lakefs SDK, reusing the client from our filesystem instance. If you have already created a repository in the UI, make sure to set the REPO_NAME variable in the cell directly above accordingly.

import lakefs\n\nrepo = lakefs.Repository(REPO_NAME, fs.client).create(storage_namespace=f\"local://{REPO_NAME}\")\n
def _maybe_urlretrieve(url: str, filename: str) -&gt; str:\n    # Avoid API rate limit errors by downloading to a fixed local location\n    destination = Path(tempfile.gettempdir()) / \"lakefs-spec-tutorials\" / filename\n    destination.parent.mkdir(exist_ok=True, parents=True)\n    if destination.exists():\n        return str(destination)\n\n    outfile, _ = urllib.request.urlretrieve(url, str(destination))\n    return outfile\n\n\noutfile = _maybe_urlretrieve(\n    \"https://archive-api.open-meteo.com/v1/archive?latitude=52.52&amp;longitude=13.41&amp;start_date=2010-01-01&amp;end_date=2010-12-31&amp;hourly=temperature_2m,relativehumidity_2m,rain,pressure_msl,surface_pressure,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m\",\n    \"weather-2010.json\",\n)\n

The data is in JSON format. Therefore, we need to wrangle the data a bit to make it usable. But first, we will upload it to our lakeFS instance.

NEW_BRANCH = lakefs.Branch(REPO_NAME, \"transform-raw-data\", client=fs.client)\nNEW_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, NEW_BRANCH) as tx:\n    fs.put(outfile, f\"{REPO_NAME}/{tx.branch.id}/weather-2010.json\")\n    tx.commit(message=\"Add 2010 weather data\")\n

You can inspect this commit by selecting the transform-raw-data branch, and navigating to the Commits tab.

import json\n\nimport pandas as pd\n\n\ndef transform_json_weather_data(filepath):\n    if hasattr(filepath, \"close\") and hasattr(filepath, \"tell\"):\n        data = json.load(filepath)\n    else:\n        with open(filepath, \"r\") as f:\n            data = json.load(f)\n\n    df = pd.DataFrame.from_dict(data[\"hourly\"])\n    df.time = pd.to_datetime(df.time)\n    df[\"is_raining\"] = df.rain &gt; 0\n    df[\"is_raining_in_1_day\"] = df.is_raining.shift(24).astype(bool)\n    df = df.dropna()\n    return df\n\n\ndf = transform_json_weather_data(outfile)\ndf.head(5)\n
   time                 temperature_2m  relativehumidity_2m  rain  pressure_msl  surface_pressure  cloudcover  cloudcover_low  cloudcover_mid  cloudcover_high  windspeed_10m  windspeed_100m  winddirection_10m  winddirection_100m  is_raining  is_raining_in_1_day
0  2010-01-01 00:00:00  -2.6            88                    0.0  996.9         992.1             100         100             97              75               16.0           27.2            54                 58                  False       True
1  2010-01-01 01:00:00  -2.7            88                    0.0  996.4         991.6             100         99              96              49               16.3           28.0            55                 58                  False       True
2  2010-01-01 02:00:00  -2.7            88                    0.0  996.2         991.4             100         96              94              60               16.3           27.5            55                 58                  False       True
3  2010-01-01 03:00:00  -2.7            88                    0.0  996.1         991.3             100         97              96              83               15.4           26.6            53                 57                  False       True
4  2010-01-01 04:00:00  -2.7            88                    0.0  996.0         991.2             100         92              98              82               14.8           25.6            47                 52                  False       True

Next, we save this data as a CSV file to the main branch. When the transaction's commit helper is called, the newly written CSV file is committed. You can verify that the save worked in the lakeFS UI in your browser by switching to the Commits tab of the main branch.

with fs.transaction(REPO_NAME, \"main\") as tx:\n    df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/weather_2010.csv\")\n    tx.commit(message=\"Update weather data\")\n
import sklearn.model_selection\n\nmodel_data = df.drop(\"time\", axis=1)\n\ntrain, test = sklearn.model_selection.train_test_split(model_data, random_state=7)\n

We save these train and test datasets to a new training branch. If the branch does not exist yet, as is the case here, it is implicitly created by default. You can control this behaviour with the create_branch_ok flag when initializing the LakeFSFileSystem. Since create_branch_ok defaults to True, a plain fs = LakeFSFileSystem() is all we need to enable implicit branch creation (a rough sketch of this behaviour follows the next cell).

TRAINING_BRANCH = lakefs.Branch(REPO_NAME, \"training\", client=fs.client)\nTRAINING_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, TRAINING_BRANCH) as tx:\n    train.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/train_weather.csv\")\n    test.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/test_weather.csv\")\n    tx.commit(message=\"Add train-test split of 2010 weather data\")\n
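As noted above, the explicit TRAINING_BRANCH.create("main") call is not strictly necessary. A rough sketch of the implicit behaviour, assuming the default create_branch_ok=True and a made-up branch name; the new branch is created from the filesystem's configured source branch ("main" by default), and the file lands on it as an uncommitted change:

fs = LakeFSFileSystem(create_branch_ok=True)  # True is also the default
# "another-training-branch" does not exist yet and is created on the fly
with fs.open(f"{REPO_NAME}/another-training-branch/train_weather.csv", "wt") as f:
    train.to_csv(f)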

Let's check the shape of the train and test data. Later on, we will come back to this data version and reproduce the results of the experiment.

print(f\"Initial train data shape: {train.shape}\")\nprint(f\"Initial test data shape: {test.shape}\")\n
\nInitial train data shape: (6570, 15)\nInitial test data shape: (2190, 15)\n\n

We now proceed to train a decision tree classifier and evaluate it on the test set:

from sklearn.tree import DecisionTreeClassifier\n\ndependent_variable = \"is_raining_in_1_day\"\n\nmodel = DecisionTreeClassifier(random_state=7)\n\nx_train, y_train = train.drop(dependent_variable, axis=1), train[dependent_variable].astype(bool)\nx_test, y_test = test.drop(dependent_variable, axis=1), test[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 87.31%\n\n
outfile = _maybe_urlretrieve(\n    \"https://archive-api.open-meteo.com/v1/archive?latitude=52.52&amp;longitude=13.41&amp;start_date=2020-01-01&amp;end_date=2020-12-31&amp;hourly=temperature_2m,relativehumidity_2m,rain,pressure_msl,surface_pressure,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m\",\n    \"weather-2020.json\",\n)\n\nnew_data = transform_json_weather_data(outfile)\n\nwith fs.transaction(REPO_NAME, \"main\") as tx:\n    new_data.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/weather_2020.csv\")\n    tx.commit(message=\"Add 2020 weather data\")\n\n# Remove leftover temporary files from previous `urlretrieve` calls\nurllib.request.urlcleanup()\n

Let's concatenate the old data and the new data, create a new train-test split, and push the updated files to lakeFS:

new_data = new_data.drop(\"time\", axis=1)\nfull_data = pd.concat([new_data, train, test])\n\ntrain_df, test_df = sklearn.model_selection.train_test_split(full_data, random_state=7)\n\nprint(f\"Updated train data shape: {train_df.shape}\")\nprint(f\"Updated test data shape: {test_df.shape}\")\n\nwith fs.transaction(REPO_NAME, TRAINING_BRANCH) as tx:\n    train_df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/train_weather.csv\")\n    test_df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/test_weather.csv\")\n    tx.commit(message=\"Add train-test split of 2010 and 2020 data\")\n
\nUpdated train data shape: (13158, 15)\nUpdated test data shape: (4386, 15)\n\n

Now we train the model on the new data and validate it on the new test data.

x_train, y_train = (\n    train_df.drop(dependent_variable, axis=1),\n    train_df[dependent_variable].astype(bool),\n)\nx_test, y_test = test_df.drop(dependent_variable, axis=1), test_df[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\n\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 82.67%\n\n
# access the data of the previous commit with a lakefs ref expression, in this case the same as in git.\nprevious_commit = repo.ref(f\"{TRAINING_BRANCH.id}~\").get_commit()\nfixed_commit_id = previous_commit.id\nprint(fixed_commit_id)\n
\n698964fd36f1fa620e92c786e62fc0d59c2f62b8c82eb721cc1233f95520702d\n\n

Let's check whether this commit SHA indeed gives us back the initial train and test data by comparing it against the data we still have in memory:

orig_train = pd.read_csv(f\"lakefs://{REPO_NAME}/{fixed_commit_id}/train_weather.csv\", index_col=0)\norig_test = pd.read_csv(f\"lakefs://{REPO_NAME}/{fixed_commit_id}/test_weather.csv\", index_col=0)\n\nprint(f\"Is the pulled training data equal to the local training data? {train.equals(orig_train)}\")\nprint(f\"Is the pulled test data equal to the local test data? {test.equals(orig_test)}\")\n
\nIs the pulled training data equal to the local training data? True\nIs the pulled test data equal to the local test data? True\n\n

Let's train and validate the model again based on the redownloaded data and see if we manage to reproduce the initial accuracy.

x_train, y_train = train.drop(dependent_variable, axis=1), train[dependent_variable].astype(bool)\nx_test, y_test = test.drop(dependent_variable, axis=1), test[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\n\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 87.31%\n\n
with fs.transaction(REPO_NAME, \"main\") as tx:\n    # returns the tag as a lakeFS object.\n    tag = tx.tag(fixed_commit_id, name=\"train-test-split-2010\")\n

Now we can access the specific files via the semantic tag. Both fixed_commit_id and tag reference the same data version (ref) in lakeFS, whereas a branch name always points to the latest version on that branch.

train_from_commit = pd.read_csv(\n    f\"lakefs://{REPO_NAME}/{fixed_commit_id}/train_weather.csv\", index_col=0\n)\ntrain_from_tag = pd.read_csv(f\"lakefs://{REPO_NAME}/{tag.id}/train_weather.csv\", index_col=0)\n

We can verify this by comparing the DataFrames: train_from_commit and train_from_tag are equal.

print(\n    f\"Is the data tagged {tag!r} equal to the data in commit {fixed_commit_id[:8]}? {train_from_commit.equals(train_from_tag)}\"\n)\n
\nIs the data tagged Tag(repository=\"weather\", id=\"train-test-split-2010\") equal to the data in commit 698964fd? True\n\n
"},{"location":"tutorials/demo_data_science_project/#data-science-with-lakefs-spec","title":"Data Science with lakeFS-spec","text":"

In this notebook, we will complete a small end-to-end data science tutorial that employs lakeFS-spec for data versioning. We will use versioned weather data to train a decision tree classifier to predict whether it is raining tomorrow given the current weather.

We will do the following:

  • Environment setup
  • lakeFS setup
  • Authenticating with the lakeFS server
  • Data ingestion via transactions
  • Model training
  • Updating data and retraining a model
  • Accessing data versions and reproducing experiments
  • Using tags for semantic versioning

Local Execution

If you want to execute the code in this tutorial as a Jupyter notebook yourself, download the demo_data_science_project.py file from the lakeFS-spec repository.

You can then convert the Python file to a notebook with Jupytext by running the following command: jupytext --to notebook demo_data_science_project.py.

This tutorial assumes that you have installed lakeFS-spec in a virtual environment, and that you have followed the quickstart guide to set up a local lakeFS instance.

"},{"location":"tutorials/demo_data_science_project/#environment-setup","title":"Environment setup","text":"

Install the necessary libraries for this notebook into the environment you have just created.
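The corresponding install cell, executed at the top of this notebook, reads:

%pip install numpy pandas scikit-learn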

"},{"location":"tutorials/demo_data_science_project/#lakefs-setup","title":"lakeFS Setup","text":"

With Docker Desktop or a similar container runtime running, set up lakeFS by executing the following docker run command (from the lakeFS quickstart) in your console:

docker run --name lakefs --pull always --rm --publish 8000:8000 treeverse/lakefs:latest run --quickstart\n

You will find the authentication credentials in the terminal output. The default address for the local lakeFS GUI is http://localhost:8000/.

"},{"location":"tutorials/demo_data_science_project/#authenticating-with-the-lakefs-server","title":"Authenticating with the lakeFS server","text":"

There are multiple ways to authenticate with lakeFS from Python code. In this tutorial, we choose the YAML file configuration. By executing the cell below, you will download a YAML file containing the default lakeFS quickstart credentials and server URL to your user directory.
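The relevant part of that cell (shown in full earlier in this notebook) downloads the prepared configuration file to ~/.lakectl.yaml:

import os
import urllib.request

urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/docs/tutorials/.lakectl.yaml",
    os.path.expanduser("~/.lakectl.yaml"),
)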

"},{"location":"tutorials/demo_data_science_project/#data-ingestion","title":"Data Ingestion","text":"

Now it's time to get some data. We will use the Open-Meteo API, which lets us pull weather data for free (for non-commercial use) and without an API token. In order to avoid hitting rate limits when repeatedly querying the API (and out of courtesy towards its operators), the _maybe_urlretrieve function provides a simple local cache for the downloaded data.

For training our toy model, we download the full weather data of Berlin (latitude 52.52, longitude 13.41) for the year 2010:

"},{"location":"tutorials/demo_data_science_project/#upload-a-file-using-transactions","title":"Upload a file using transactions","text":"

lakeFS works similarly to Git as a versioning system. You can create commits that contain specific changes to the data. You can also work with branches to create your own isolated view of the data, independently of your colleagues. Every commit (on any branch) is identified by a commit SHA. This SHA can be used to programmatically interact with specific states of your data, and enables logging of the exact data versions used to create a certain model.

To easily carry out versioning operations while uploading files, you can use transactions. A transaction is a context manager that keeps track of all files that were uploaded in its scope, as well as all versioning operations happening between file uploads. All operations are deferred to the end of the transaction, and are executed sequentially on completion.

To create a commit after a file upload, you can run the following transaction:
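(This repeats the transaction cell shown earlier in this notebook, which uploads and commits the raw weather data.)

NEW_BRANCH = lakefs.Branch(REPO_NAME, "transform-raw-data", client=fs.client)
NEW_BRANCH.create("main")

with fs.transaction(REPO_NAME, NEW_BRANCH) as tx:
    fs.put(outfile, f"{REPO_NAME}/{tx.branch.id}/weather-2010.json")
    tx.commit(message="Add 2010 weather data")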

"},{"location":"tutorials/demo_data_science_project/#data-transformation","title":"Data Transformation","text":"

Now let's transform the data for our use case. We put the transformation into a function to be able to reuse it later.

In this notebook, we use a simple toy model to predict whether it is raining at the same time tomorrow given weather data from right now.

We will skip a lot of possible feature engineering and other data science aspects in order to focus more on the application of the LakeFSFileSystem.

"},{"location":"tutorials/demo_data_science_project/#training-the-initial-weather-model","title":"Training the initial weather model","text":"

First we will do a train-test split:
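(This repeats the split cell shown earlier in this notebook.)

import sklearn.model_selection

model_data = df.drop("time", axis=1)

train, test = sklearn.model_selection.train_test_split(model_data, random_state=7)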

"},{"location":"tutorials/demo_data_science_project/#updating-data-and-retraining-the-model","title":"Updating data and retraining the model","text":"

Until now, we have only used data from 2010. Let's download additional data from 2020, transform it, and save it to lakeFS.

"},{"location":"tutorials/demo_data_science_project/#accessing-data-versions-through-commits-and-reproducing-experiments","title":"Accessing data versions through commits and reproducing experiments","text":"

If we need to go back to our initial data and reproduce the first experiment (the model trained on the 2010 data with its initial accuracy), we can go back in the commit history of the training branch and select the appropriate data snapshot (commit). Since we have already created multiple commits on the same branch, we will address the different data versions by their commit SHAs.

To obtain the actual commit SHA from a branch, we have multiple options. Manually, we could go into the lakeFS UI, select the training branch, and navigate to the Commits tab. There, we take the parent of the previous commit, titled Add train-test split of 2010 weather data, and copy its revision SHA (also called ID).

In code, we can obtain commit SHAs for different revisions on the training branch by using lakefs.Reference objects.
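For reference, the cell shown earlier in this notebook does exactly that, using a Git-style ref expression in which the tilde selects the parent of the branch head:

# "~" addresses the parent commit, with the same syntax as in Git
previous_commit = repo.ref(f"{TRAINING_BRANCH.id}~").get_commit()
fixed_commit_id = previous_commit.id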

"},{"location":"tutorials/demo_data_science_project/#using-tags-instead-of-commit-shas-for-semantic-versioning","title":"Using tags instead of commit SHAs for semantic versioning","text":"

The above method of data versioning works great when you have experiment tracking tools that store and retrieve the commit SHA in automated pipelines, but retrieving SHAs by hand during prototyping can be tedious. We can make selected versions of the dataset more accessible through semantic versioning by attaching a human-readable tag to a specific commit SHA.

Creating a tag is easiest when done inside a transaction, just like the file uploads we performed earlier. To do this, simply call tx.tag on the transaction and supply the commit SHA to tag and the intended tag name. Tags are immutable once created, so attempting to tag two different commits with the same name will result in an error.
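For reference, the tagging cell from earlier in this notebook:

with fs.transaction(REPO_NAME, "main") as tx:
    # returns the tag as a lakeFS object
    tag = tx.tag(fixed_commit_id, name="train-test-split-2010")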

"}]} \ No newline at end of file diff --git a/development/sitemap.xml b/development/sitemap.xml index dd53ecd8..eb007fa5 100644 --- a/development/sitemap.xml +++ b/development/sitemap.xml @@ -2,82 +2,82 @@ https://lakefs-spec.org/latest/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/CONTRIBUTING/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/quickstart/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/configuration/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/filesystem-usage/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/integrations/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/transactions/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/SUMMARY/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/errors/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/spec/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/transaction/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/util/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/tutorials/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/tutorials/demo_data_science_project/ - 2024-02-21 + 2024-02-28 daily \ No newline at end of file diff --git a/development/sitemap.xml.gz b/development/sitemap.xml.gz index c0020266..902edf25 100644 Binary files a/development/sitemap.xml.gz and b/development/sitemap.xml.gz differ diff --git a/development/tutorials/demo_data_science_project/index.html b/development/tutorials/demo_data_science_project/index.html index f9d52518..aa5f07eb 100644 --- a/development/tutorials/demo_data_science_project/index.html +++ b/development/tutorials/demo_data_science_project/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -1064,6 +1112,7 @@

Environment setup
 Collecting numpy
+  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
 
 
@@ -1071,9 +1120,7 @@

Environment setup
-  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/61.0 kB ? eta -:--:--
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.0/61.0 kB 3.6 MB/s eta 0:00:00
+Collecting pandas
 
 
@@ -1081,8 +1128,7 @@

Environment setup
-Collecting pandas
-  Downloading pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
+  Downloading pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
 
 
@@ -1091,7 +1137,7 @@

Environment setup
 Collecting scikit-learn
-  Downloading scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
+  Using cached scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
 
 
@@ -1102,7 +1148,7 @@

Environment setupEnvironment setup
 Collecting scipy>=1.6.0 (from scikit-learn)
-  Downloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/60.4 kB ? eta -:--:--
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.4/60.4 kB 17.5 MB/s eta 0:00:00
+  Using cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
+Collecting joblib>=1.2.0 (from scikit-learn)
 
 
@@ -1121,17 +1166,9 @@

Environment setup
-Collecting joblib>=1.2.0 (from scikit-learn)
-  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
-
-
-
- -
- -
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/18.3 MB 65.1 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.4/18.3 MB 107.5 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.6/18.3 MB 152.4 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.3 MB 163.2 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.3 MB 163.2 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 94.2 MB/s eta 0:00:00
-
-
-
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 154.8 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 102.0 MB/s eta 0:00:00
-
-
-
-
-
-
-
-Downloading scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/12.1 MB ? eta -:--:--
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.5/12.1 MB 166.6 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.3/12.1 MB 151.7 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.1/12.1 MB 151.0 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.1/12.1 MB 103.2 MB/s eta 0:00:00
-Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/302.2 kB ? eta -:--:--
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 kB 59.9 MB/s eta 0:00:00
-Downloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/38.4 MB ? eta -:--:--
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.4/38.4 MB 162.5 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/13.0 MB 76.4 MB/s eta 0:00:01
 
@@ -1243,7 +1195,8 @@

Environment setup
 
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.6/38.4 MB 156.3 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.2/13.0 MB 90.8 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.8/13.0 MB 123.4 MB/s eta 0:00:01
 
@@ -1251,43 +1204,18 @@

Environment setup
 
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.1/38.4 MB 156.9 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.4/38.4 MB 158.4 MB/s eta 0:00:01
-
- - -
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7/38.4 MB 155.5 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 32.2/38.4 MB 157.9 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.2/38.4 MB 169.0 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.4/38.4 MB 165.0 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 164.6 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 107.9 MB/s eta 0:00:00
+Using cached scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
+
 
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.4/38.4 MB 165.0 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.4/38.4 MB 66.1 MB/s eta 0:00:00
-Downloading threadpoolctl-3.3.0-py3-none-any.whl (17 kB)
+Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
+Using cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)
 
 
@@ -1295,9 +1223,8 @@

Environment setup
-Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/345.4 kB ? eta -:--:--
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 64.5 MB/s eta 0:00:00
+Using cached threadpoolctl-3.3.0-py3-none-any.whl (17 kB)
+Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
 
 
@@ -1313,7 +1240,7 @@

Environment setup
-Successfully installed joblib-1.3.2 numpy-1.26.4 pandas-2.2.0 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0 tzdata-2024.1
+Successfully installed joblib-1.3.2 numpy-1.26.4 pandas-2.2.1 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0 tzdata-2024.1
 
 
@@ -1370,7 +1297,7 @@

Authenticating with the lakeFS se
-('/home/runner/.lakectl.yaml', <http.client.HTTPMessage at 0x7fd726d97590>)
+('/home/runner/.lakectl.yaml', <http.client.HTTPMessage at 0x7f781c8cc090>)
 
@@ -1535,21 +1462,6 @@

Data Transformation
-
-
-/tmp/ipykernel_2291/2823322696.py:3: DeprecationWarning: 
-Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
-(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
-but was not found to be installed on your system.
-If this would cause problems for you,
-please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
-
-  import pandas as pd
-
-
-
-
-