diff --git a/development/404.html b/development/404.html index f4ac8d49..3724d987 100644 --- a/development/404.html +++ b/development/404.html @@ -101,21 +101,55 @@ - + + + + + +
-
diff --git a/development/CONTRIBUTING/index.html b/development/CONTRIBUTING/index.html index 6a5bc836..d9c53946 100644 --- a/development/CONTRIBUTING/index.html +++ b/development/CONTRIBUTING/index.html @@ -110,21 +110,55 @@ - + + + + + +
-
diff --git a/development/_theme_overrides/partials/header.html b/development/_theme_overrides/partials/header.html new file mode 100644 index 00000000..fa74eebf --- /dev/null +++ b/development/_theme_overrides/partials/header.html @@ -0,0 +1,117 @@ + + + +{% set class = "md-header" %} +{% if "navigation.tabs.sticky" in features %} + {% set class = class ~ " md-header--shadow md-header--lifted" %} +{% elif "navigation.tabs" not in features %} + {% set class = class ~ " md-header--shadow" %} +{% endif %} + + +
+ + + + {% if "navigation.tabs.sticky" in features %} + {% if "navigation.tabs" in features %} + {% include "partials/tabs.html" %} + {% endif %} + {% endif %} +
diff --git a/development/guides/configuration/index.html b/development/guides/configuration/index.html index 00048400..47cef167 100644 --- a/development/guides/configuration/index.html +++ b/development/guides/configuration/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/guides/filesystem-usage/index.html b/development/guides/filesystem-usage/index.html index fa09e5e9..0c78b524 100644 --- a/development/guides/filesystem-usage/index.html +++ b/development/guides/filesystem-usage/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/guides/index.html b/development/guides/index.html index edeceda1..d034086d 100644 --- a/development/guides/index.html +++ b/development/guides/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/guides/integrations/index.html b/development/guides/integrations/index.html index a0713739..09f7b1b0 100644 --- a/development/guides/integrations/index.html +++ b/development/guides/integrations/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/guides/transactions/index.html b/development/guides/transactions/index.html index cdcb56e5..1687fbfb 100644 --- a/development/guides/transactions/index.html +++ b/development/guides/transactions/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/index.html b/development/index.html index 68f96d2e..8d881751 100644 --- a/development/index.html +++ b/development/index.html @@ -110,21 +110,55 @@ - + + + + + +
-
diff --git a/development/quickstart/index.html b/development/quickstart/index.html index dc84ea20..86d79def 100644 --- a/development/quickstart/index.html +++ b/development/quickstart/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
diff --git a/development/reference/SUMMARY/index.html b/development/reference/SUMMARY/index.html index 6d18f6ac..4523662f 100644 --- a/development/reference/SUMMARY/index.html +++ b/development/reference/SUMMARY/index.html @@ -103,21 +103,55 @@ - + + + + + +
-
@@ -775,7 +823,7 @@

SUMMARY

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/errors/index.html b/development/reference/lakefs_spec/errors/index.html index c4ed7856..1d6e0c6b 100644 --- a/development/reference/lakefs_spec/errors/index.html +++ b/development/reference/lakefs_spec/errors/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -1129,7 +1177,7 @@

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/index.html b/development/reference/lakefs_spec/index.html index 2d07a5e2..afbd50e3 100644 --- a/development/reference/lakefs_spec/index.html +++ b/development/reference/lakefs_spec/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -832,7 +880,7 @@

lakefs_spec

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/spec/index.html b/development/reference/lakefs_spec/spec/index.html index d6a4cf0d..4367daf3 100644 --- a/development/reference/lakefs_spec/spec/index.html +++ b/development/reference/lakefs_spec/spec/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -5580,7 +5628,7 @@

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/transaction/index.html b/development/reference/lakefs_spec/transaction/index.html index d9b94eca..aac1088b 100644 --- a/development/reference/lakefs_spec/transaction/index.html +++ b/development/reference/lakefs_spec/transaction/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -2224,7 +2272,7 @@

- 2024-02-21 + 2024-02-28 diff --git a/development/reference/lakefs_spec/util/index.html b/development/reference/lakefs_spec/util/index.html index 8b5ebcd6..beb631be 100644 --- a/development/reference/lakefs_spec/util/index.html +++ b/development/reference/lakefs_spec/util/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -1407,7 +1455,7 @@

- 2024-02-21 + 2024-02-28 diff --git a/development/search/search_index.json b/development/search/search_index.json index 042d1687..5fb47950 100644 --- a/development/search/search_index.json +++ b/development/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"

Welcome to lakeFS-spec, a filesystem-spec backend implementation for the lakeFS data lake. Our primary goal is to streamline versioned data operations in lakeFS, enabling seamless integration with popular data science tools such as Pandas, Polars, and DuckDB directly from Python.

Highlights:

  • Simple repository operations in lakeFS
  • Easy access to underlying storage and versioning operations
  • Seamless integration with the fsspec ecosystem
  • Directly access lakeFS objects from popular data science libraries (including Pandas, Polars, DuckDB, PyArrow) with minimal code
  • Transaction support for reliable data version control
  • Smart data transfers through client-side caching (up-/download)
  • Automatic configuration discovery

Early Adopters

We are seeking early adopters who would like to actively participate in our feedback process and shape the future of the library. If you are interested in using the library and want to get in touch with us, please reach out via GitHub Discussions.

Quickstart

Step-by-step installation and first operations

Tutorials

In-depth tutorials on using lakeFS-spec

API Reference

Full documentation of the Python API

User Guide

Solving specific tasks with lakeFS-spec

Contributing

How to contribute to the project

"},{"location":"CONTRIBUTING/","title":"Contributing to lakeFS-spec","text":"

Thank you for your interest in contributing to this project!

We appreciate issue reports, pull requests for code and documentation, as well as any project-related communication through GitHub Discussions.

"},{"location":"CONTRIBUTING/#getting-started","title":"Getting Started","text":"

To get started with development, you can follow these steps:

  1. Clone this repository:

    git clone https://github.com/aai-institute/lakefs-spec.git\n
  2. Navigate to the directory and install the development dependencies into a virtual environment:

    cd lakefs-spec\npython3 -m venv venv --system-site-packages\nsource venv/bin/activate\npython -m pip install -r requirements-dev.txt\npython -m pip install -e . --no-deps\n
  3. After making your changes, verify they adhere to our Python code style by running pre-commit:

    pre-commit run --all-files\n

    You can also set up Git hooks through pre-commit to perform these checks automatically:

    pre-commit install\n
  4. To run the tests against an ephemeral lakeFS instance, simply run pytest:

    pytest\n

    To spin up a local lakeFS instance quickly for testing, you can use the Docker Compose file bundled with this repository:

    docker-compose -f hack/docker-compose.yml up\n
"},{"location":"CONTRIBUTING/#updating-dependencies","title":"Updating dependencies","text":"

Dependencies should stay locked for as long as possible, ideally for a whole release. If you have to update a dependency during development, you should do the following:

  1. If it is a core dependency needed for the package, add it to the dependencies section in the pyproject.toml.
  2. In case of a development dependency, add it to the dev section of the project.optional-dependencies table instead.
  3. Dependencies needed for documentation generation are found in the docs sections of project.optional-dependencies.

After adding the dependency in either of these sections, run the helper script hack/lock-deps.sh (which in turn uses pip-compile) to pin all dependencies again:

python -m pip install --upgrade pip-tools\nhack/lock-deps.sh\n

In addition to these manual steps, we also provide pre-commit hooks that automatically lock the dependencies whenever pyproject.toml is changed.

Selective upgrades of existing dependencies are also handled by the helper script above. If you want to update the lakefs-sdk dependency, for example, simply run:

hack/lock-deps.sh lakefs-sdk\n

Tip

Since the official development version is Python 3.11, please run the above commands in a virtual environment with Python 3.11.

"},{"location":"CONTRIBUTING/#working-on-documentation","title":"Working on Documentation","text":"

Improvements or additions to the project's documentation are highly appreciated.

The documentation is based on the MkDocs and Material for MkDocs (mkdocs-material) projects, see their homepages for in-depth guides on their features and usage. We use the Numpy documentation style for Python docstrings.

To build the documentation locally, first install the optional docs dependencies from requirements-docs.txt, e.g., with pip install -r requirements-docs.txt. You can then start a local documentation server with mkdocs serve, or build the documentation into its output folder public/ with mkdocs build.

In order to maintain documentation for multiple versions of this library, we use the mike tool, which automatically maintains individual documentation builds per version and publishes them to the gh-pages branch.

The GitHub CI pipeline automatically invokes mike as part of the release process with the correct version and updates the GitHub pages branch for the project.

"},{"location":"quickstart/","title":"Quickstart","text":"

Welcome! This quickstart guide will get you up and running with lakeFS-spec by showing you how to

  1. install the lakefs-spec package,
  2. spin up a local lakeFS server,
  3. create a lakeFS repository for experimentation, and
  4. perform basic file system operations in a lakeFS repository using lakeFS-spec.
Prerequisites

To follow along with this guide, you will need a few prerequisites ready on your machine:

  • a machine running Windows, macOS, or Linux (all supported by lakeFS-spec)
  • Docker, with Docker Compose
  • Python 3.9 or later
  • optionally, lakectl, the lakeFS command line tool

Please take a moment to make sure you have these tools available before proceeding with the next steps.

"},{"location":"quickstart/#installing-lakefs-spec","title":"Installing lakeFS-spec","text":"A note on virtual environments

We generally recommend installing the library in a virtual environment to ensure proper isolation, especially when following this quickstart guide.

If you are using Poetry, virtual environments can automatically be created by the tool.

If you prefer the venv functionality built into Python, see the official docs (tl;dr: python -m venv venv; source venv/bin/activate).

To install the package directly from PyPI, run:

pippoetry
pip install lakefs-spec\n
poetry add lakefs-spec\n

Or, if you want to try the latest pre-release version directly from GitHub:

pippoetry
pip install git+https://github.com/aai-institute/lakefs-spec.git\n
poetry add git+https://github.com/aai-institute/lakefs-spec.git\n
"},{"location":"quickstart/#first-steps","title":"First Steps","text":""},{"location":"quickstart/#spinning-up-a-local-lakefs-instance","title":"Spinning up a local lakeFS instance","text":"

Warning

This setup is not recommended for production uses, since it does not store the data persistently.

Please check out the lakeFS docs for production-ready deployment options.

If you don't already have access to a lakeFS server, you can quickly start a local instance using Docker Compose. Before continuing, please make sure Docker is installed and running on your machine.

The lakeFS quickstart deployment can be launched directly with a configuration file provided in the lakeFS-spec repository:

$ curl https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/hack/docker-compose.yml | docker-compose -f - up\n

If you do not have curl installed on your machine or would like to examine and/or customize the container configuration, you can also create a docker-compose.yml file locally and use it with docker-compose up:

docker-compose.yml
version: \"3\"\n\nservices:\n  lakefs:\n    image: treeverse/lakefs:1.7.0\n    ports:\n      - 8000:8000\n    environment:\n      LAKEFS_INSTALLATION_USER_NAME: \"quickstart\"\n      LAKEFS_INSTALLATION_ACCESS_KEY_ID: \"AKIAIOSFOLQUICKSTART\"\n      LAKEFS_INSTALLATION_SECRET_ACCESS_KEY: \"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\"\n      LAKEFS_DATABASE_TYPE: \"local\"\n      LAKEFS_AUTH_ENCRYPT_SECRET_KEY: \"THIS_MUST_BE_CHANGED_IN_PRODUCTION\"\n      LAKEFS_BLOCKSTORE_TYPE: \"local\"\n

In order to allow lakeFS-spec to automatically discover credentials to access this lakeFS instance, create a .lakectl.yaml in your home directory containing the credentials for the quickstart environment (you can also use lakectl config to create this file interactively if you have the lakectl tool installed on your machine):

~/.lakectl.yaml
credentials: # (1)!\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n
  1. These must match the credentials set in the environment section of the Docker Compose file above

After the container has finished initializing, you can access the web UI of your local lakeFS deployment in your browser. Fill out the setup form, where you can optionally share your email address with the developers of lakeFS to receive updates on their product. Next, you can log into your fresh lakeFS instance with the credentials listed above.

Success

Your fresh local lakeFS instance is a playground for you to explore lakeFS functionality.

In the next step, we will create your first repository on this server.

"},{"location":"quickstart/#create-a-lakefs-repository","title":"Create a lakeFS repository","text":"

Once you have logged into the web UI of the lakeFS server for the first time, you can create an empty repository on the next page. Click the small Click here link at the bottom of the page to proceed and create a repository named repo (we don't want to add the sample data for this guide):

Tip: Creating a repository later

If you have inadvertently skipped over the quickstart repository creation page, you can always create a new repository on the Repositories tab in the lakeFS web UI (and optionally choose to add the sample data):

Success

You have successfully created a lakeFS repository named repo, ready to be used with lakeFS-spec.

"},{"location":"quickstart/#using-the-lakefs-file-system","title":"Using the lakeFS file system","text":"

We will now use the lakeFS-spec file system interface to perform some basic operations on the repository created in the previous step:

  • Upload a local file to the repository
  • Read data from a file in the repository
  • Make a commit
  • Fetch metadata about repository contents
  • Delete a file from the repository

To get started, create a file called quickstart.py with the following contents:

quickstart.py
from pathlib import Path\n\nfrom lakefs_spec import LakeFSFileSystem\n\nREPO, BRANCH = \"repo\", \"main\"\n\n# Prepare example local data\nlocal_path = Path(\"demo.txt\")\nlocal_path.write_text(\"Hello, lakeFS!\")\n

Tip

We will keep adding more code to this file as we progress through the next steps. Feel free to execute the script after each step and observe the effects as noted in the guide.

This code snippet prepares a file demo.txt on your machine, ready to be added to the lakeFS repository, so let's do just that:

fs = LakeFSFileSystem()  # will auto-discover credentials from ~/.lakectl.yaml\nrepo_path = f\"{REPO}/{BRANCH}/{local_path.name}\"\n\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.put(str(local_path), f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Add demo data\")\n

If you execute the quickstart.py script at this point, you can already see the committed file in the lakeFS web UI:

While examining the file contents in the browser is nice, we want to access the committed file programmatically. Add the following lines at the end of your script and observe the output:

f = fs.open(repo_path, \"rt\")\nprint(f.readline())  # prints \"Hello, lakeFS!\"\n

Note that executing the same code multiple times will only result in a single commit in the repository since the contents of the file on disk and in the repository are identical.

In addition to simple read and write operations, the fsspec file system interface also allows us to list the files in a repository folder using ls, and query the metadata of objects in the repository through info (akin to the POSIX stat system call). Let's add the following code to our script and observe the output:

# Compare the sizes of local file and repo\nfile_info = fs.info(repo_path)\nprint(\n    f\"{local_path.name}: local size: {local_path.stat().st_size}, remote size: {file_info['size']}\"\n)\n\n# Get information about all files in the repo root\nprint(fs.ls(f\"{REPO}/{BRANCH}/\"))\n

As the last order of business, let's clean up the repository to its original state by removing the file using the rm operation and creating another commit (also, the local file is deleted, since we don't need it anymore):

with fs.transaction(REPO, BRANCH) as tx:\n    fs.rm(f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Delete demo data\")\n\nlocal_path.unlink()\n

Success

You now have all the basic tools available to version data from your Python code using the file system interface provided by lakeFS-spec.

Full example code quickstart.py
from pathlib import Path\n\nfrom lakefs_spec import LakeFSFileSystem\n\nREPO, BRANCH = \"repo\", \"main\"\n\n# Prepare example local data\nlocal_path = Path(\"demo.txt\")\nlocal_path.write_text(\"Hello, lakeFS!\")\n\n# Upload the local file to the repo and commit\nfs = LakeFSFileSystem()  # will auto-discover credentials from ~/.lakectl.yaml\nrepo_path = f\"{REPO}/{BRANCH}/{local_path.name}\"\n\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.put(str(local_path), f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Add demo data\")\n\n# Read back the file contents\nf = fs.open(repo_path, \"rt\")\nprint(f.readline())  # prints \"Hello, lakeFS!\"\n\n# Compare the sizes of local file and repo\nfile_info = fs.info(repo_path)\nprint(\n    f\"{local_path.name}: local size: {local_path.stat().st_size}, remote size: {file_info['size']}\"\n)\n\n# Get information about all files in the repo root\nprint(fs.ls(f\"{REPO}/{BRANCH}/\"))\n\n# Delete uploaded file from the repository (and commit)\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.rm(f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Delete demo data\")\n\nlocal_path.unlink()\n
"},{"location":"quickstart/#next-steps","title":"Next Steps","text":"

After this walkthrough of the installation and an introduction to basic file system operations using lakeFS-spec, you might want to consider more advanced topics:

  • API Reference
  • User Guide, in particular
    • How to use the lakeFS file system
    • How to use lakeFS-spec with third-party data science libraries
  • Tutorial: Using lakeFS-spec in a data science project
"},{"location":"guides/","title":"User Guide","text":"

The lakeFS-spec user guide provides documentation for users of the library looking to solve specific tasks. See the Quickstart guide for an introductory tutorial.

  • How to use the lakeFS file system
  • Passing configuration to the file system
  • Using file system transactions
  • Using transactions on the lakeFS file system
  • How to use lakeFS-spec with third-party data science libraries
"},{"location":"guides/configuration/","title":"Passing configuration to the file system","text":"

There are multiple ways to configure the LakeFSFileSystem for use with a deployed lakeFS instance. This guide introduces them in the order of least to most in-Python configuration - the preferred way to use the file system is with as little Python code as possible.

Info

The configuration methods are introduced in reverse order of precedence - config file arguments have the lowest priority and are overwritten by environment variables (if specified).

"},{"location":"guides/configuration/#the-lakectlyaml-configuration-file","title":"The .lakectl.yaml configuration file","text":"

The easiest way of configuring the lakeFS file system is with a lakectl YAML configuration file. To address a lakeFS server, the following minimum configuration is required:

~/.lakectl.yaml
credentials:\n  access_key_id: <ID>\n  secret_access_key: <KEY>\nserver:\n  endpoint_url: <LAKEFS-HOST>\n

For a local instance produced by the quickstart, the following values will work:

~/.lakectl.yaml
credentials:\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n

To work \"out of the box\" without any further arguments, the configuration file has to be placed in your home directory under the name .lakectl.yaml (this is where lakeFS expects it). If you set all values correctly, you can instantiate the lakeFS file system without any arguments:

from lakefs_spec import LakeFSFileSystem\n\n# zero config necessary.\nfs = LakeFSFileSystem()\n

If you cannot use the default location ($HOME/.lakectl.yaml), you can read a file from any other location by passing the configfile argument:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem(configfile=\"/path/to/my/configfile.yaml\")\n
"},{"location":"guides/configuration/#setting-environment-variables","title":"Setting environment variables","text":"

It is also possible to specify certain configuration values used for authentication with the lakeFS server through environment variables. For these values, the variable name is exactly the constructor argument name prefixed with LAKEFS_, e.g. the host argument can be set via the LAKEFS_HOST environment variable.

import os\nfrom lakefs_spec import LakeFSFileSystem\n\nos.environ[\"LAKEFS_HOST\"] = \"http://my-lakefs.host\"\nos.environ[\"LAKEFS_USERNAME\"] = \"my-username\"\nos.environ[\"LAKEFS_PASSWORD\"] = \"my-password\"\n\n# also zero-config.\nfs = LakeFSFileSystem()\n

Info

Not all initialization values can be set via environment variables - the proxy, create_branch_ok, and source_branch arguments can only be supplied in Python.
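
These Python-only options can still be combined with zero-config credential discovery by passing them directly to the constructor. A minimal sketch (host and credentials are discovered as usual; the proxy address below is a placeholder):

from lakefs_spec import LakeFSFileSystem\n\n# host and credentials are still auto-discovered from the environment or ~/.lakectl.yaml\nfs = LakeFSFileSystem(\n    proxy=\"http://proxy.example.com:3128\",  # placeholder address\n    create_branch_ok=True,\n    source_branch=\"main\",\n)\n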

"},{"location":"guides/configuration/#appendix-mixing-zero-config-methods","title":"Appendix: Mixing zero-config methods","text":"

Two of the introduced methods allow for \"zero-config\" (i.e. no arguments given to the constructor) initialization of the file system. However, care must be taken when working with multiple, differently configured file systems in the same process (for example, one configured from the config file and another from environment variables).

The reason for this is the instance caching mechanism built into fsspec. While this allows for efficient reuse of file systems e.g. by third-party libraries (pandas, DuckDB, ...), it can lead to silent misconfigurations. Consider this example, with an existent .lakectl.yaml file:

~/.lakectl.yaml
credentials:\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n

Now, mixing config file and environment variable initializations leads to the wrong result:

import os\nfrom lakefs_spec import LakeFSFileSystem\n\n# first file system, initialized from the config file\nconfig_fs = LakeFSFileSystem()\n\nos.environ[\"LAKEFS_HOST\"] = \"http://my-other-lakefs.host\"\nos.environ[\"LAKEFS_USERNAME\"] = \"my-username\"\nos.environ[\"LAKEFS_PASSWORD\"] = \"my-password\"\n\nenvvar_fs = LakeFSFileSystem()\n\nprint(config_fs is envvar_fs) # <- prints True! \n

The reason why the above code does not work as desired is that the cached config-file-initialized file system is simply reused on the second assignment. To clear the file system instance cache, you can run the following:

from lakefs_spec import LakeFSFileSystem\n\nLakeFSFileSystem.clear_instance_cache()\n
"},{"location":"guides/filesystem-usage/","title":"How to use the lakeFS file system","text":"

This guide contains instructions and code snippets on how to use the lakeFS file system.

"},{"location":"guides/filesystem-usage/#the-lakefs-uri-structure","title":"The lakeFS URI structure","text":"

In the following subsections, we frequently make use of lakeFS URIs in the example code. lakeFS URIs identify resources in a lakeFS deployment through a unique path consisting of repository name, lakeFS revision/ref name, and file name relative to the repository root. Optionally, they may be prefixed with the lakefs:// URI scheme (this is required when using third-party libraries).

As an example, a URI like repo/main/file.txt addresses the file.txt file on the main branch in the repository named repo.

In some lakeFS file system operations, directories are also allowed as resource names. For example, the URI repo/main/data/ (note the optional trailing slash) refers to the data directory on the main branch in the repo repository.
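
As a short sketch, the same conventions apply with or without the scheme prefix (the repository, branch, and file names below are illustrative):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# bare lakeFS URI, accepted directly by the file system methods\nprint(fs.info(\"repo/main/file.txt\"))\n\n# the same object, addressed with the optional lakefs:// scheme\nprint(fs.info(\"lakefs://repo/main/file.txt\"))\n\n# a directory listing, using the optional trailing slash\nprint(fs.ls(\"repo/main/data/\"))\n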

"},{"location":"guides/filesystem-usage/#on-staged-versus-committed-changes","title":"On staged versus committed changes","text":"

When uploading, copying, or removing files or directories on a branch, those operations result in staged changes in the repository until a commit is created. lakeFS-spec does not create these commits automatically, since it rigorously separates file operations from versioning operations. If you want to conduct versioning operations, like creating commits, between file transfers, the best way to do so is by using file system transactions.
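
To illustrate the difference, here is a minimal sketch (repository and branch names are placeholders): a plain upload only stages a change, while a transaction is used to create the commit explicitly.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# stages a change on the branch, but does not create a commit\nfs.put_file(\"data.csv\", \"my-repo/my-branch/data.csv\")\n\n# versioning operations such as commits happen inside a transaction\nwith fs.transaction(\"my-repo\", \"my-branch\") as tx:\n    fs.put_file(\"data.csv\", f\"my-repo/{tx.branch.id}/data.csv\")\n    tx.commit(message=\"Add data.csv\")\n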

"},{"location":"guides/filesystem-usage/#how-to-use-lakefs-file-system-apis","title":"How to use lakeFS file system APIs","text":"

The following section explains more in-depth how to use the LakeFSFileSystem APIs. This section concerns the explicitly implemented operations. In addition, there are a number of file system APIs inherited from the AbstractFileSystem interface in fsspec.

More information on file system usage can be found in the fsspec documentation.
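
For example, a few generic fsspec methods that lakeFS-spec does not re-implement still work out of the box (a sketch with illustrative paths; see the fsspec documentation for the complete API):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# inherited from fsspec's AbstractFileSystem\ndata = fs.cat_file(\"my-repo/my-ref/file.txt\")   # read the whole object as bytes\nhead = fs.head(\"my-repo/my-ref/file.txt\", 100)  # read the first 100 bytes\nis_file = fs.isfile(\"my-repo/my-ref/file.txt\")  # check whether the path is a file\n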

"},{"location":"guides/filesystem-usage/#uploading-and-downloading-files","title":"Uploading and downloading files","text":"

Arguably the most important feature of the file system is file transfers.

"},{"location":"guides/filesystem-usage/#file-uploads","title":"File uploads","text":"

To upload a file, you can use the fs.put() and fs.put_file() methods. While fs.put_file() operates on single files only, the fs.put() API can be used for directory uploads.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# local source path, then remote target path.\nfs.put_file(\"file.txt\", \"my-repo/my-ref/file.txt\")\n

If you want to upload an entire directory to lakeFS, you can use the fs.put() API together with the recursive=True switch:

# structure:\n#   dir/\n#   \u251c\u2500\u2500 a.txt\n#   \u251c\u2500\u2500 b.yaml\n#   \u251c\u2500\u2500 c.csv\n#   \u2514\u2500\u2500 ...\n\nfs.put(\"dir\", \"my-repo/my-ref/dir\", recursive=True)\n

Info

The above method of file uploading results in two transfers: one from the client to the lakeFS server, and one from the lakeFS server to the object storage. This can impact performance if the uploaded files are very large. To avoid this performance issue, you can also write the file directly to the underlying object storage:

fs = LakeFSFileSystem()\n\nfs.put_file(\"file.txt\", \"my-repo/my-ref/file.txt\", use_blockstore=True)\n

Direct lakeFS blockstore uploads require the installation of the corresponding fsspec file system implementation through pip. For an S3-based lakeFS deployment, install the s3fs package. For Google Cloud Storage (GCS), install the gcsfs package. For Azure blob storage, install the adlfs package.

"},{"location":"guides/filesystem-usage/#file-downloads","title":"File downloads","text":"

To download a file, you can use the fs.get() or fs.get_file() methods. While fs.get_file() downloads single files only, the fs.get() API can be used for recursive directory downloads.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# remote path, then local target path.\nfs.get_file(\"my-repo/my-ref/file.txt\", \"file.txt\")\n

In the case of a directory in lakeFS, use the fs.get() API together with the recursive=True switch:

# structure:\n#   dir/\n#   \u251c\u2500\u2500 a.txt\n#   \u251c\u2500\u2500 b.yaml\n#   \u251c\u2500\u2500 c.csv\n#   \u2514\u2500\u2500 ...\n\n# downloads the entire `dir` directory (and subdirectories) into the current directory.\nfs.get(\"my-repo/my-ref/dir\", \"dir\", recursive=True)\n
"},{"location":"guides/filesystem-usage/#checking-the-existence-of-lakefs-objects","title":"Checking the existence of lakeFS objects","text":"

To check the existence of a file in a given revision of a repository, you can use the fs.exists() API:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_file_exists = fs.exists(\"my-repo/my-ref/my-file.txt\")\n

This function returns True if the file exists on that revision, and False if it does not. Errors (e.g. permission errors) will be raised, since in that case, object existence cannot be decided.
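
A sketch of handling such an error explicitly (the path is a placeholder):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\ntry:\n    exists = fs.exists(\"my-repo/my-ref/restricted/file.txt\")\nexcept PermissionError:\n    # insufficient permissions: existence cannot be decided\n    exists = None\n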

Warning

fs.exists() only works on file objects, and will return False if called on directories.

"},{"location":"guides/filesystem-usage/#obtaining-info-on-stored-objects","title":"Obtaining info on stored objects","text":"

To query the metadata of a single object in a lakeFS repository, use the fs.info() API:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_file_info = fs.info(\"my-repo/my-ref/my-file.txt\")\n

The resulting my_file_info object is a dictionary containing useful information such as storage location of the file, creation timestamp, and size (in bytes).

You can also call fs.info() on directories:

dir_info = fs.info(\"my-repo/my-ref/dir/\")\n

In this case, the resulting dir_info object only contains the directory name and the total size of the files it contains.

"},{"location":"guides/filesystem-usage/#listing-directories-in-lakefs","title":"Listing directories in lakeFS","text":"

To list the files in a directory in lakeFS, use the fs.ls() method:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_dir_listing = fs.ls(\"my-repo/my-ref/my-dir/\")\n

This returns a list of Python dictionaries containing information on the objects contained in the requested directory. The returned objects have the same fields set as those returned by a normal fs.info() call on a file object.

"},{"location":"guides/filesystem-usage/#deleting-objects-from-a-lakefs-branch","title":"Deleting objects from a lakeFS branch","text":"

To delete objects from a lakeFS branch, use the fs.rm_file() or fs.rm() APIs. As before, while the former works only for single files, the latter can be used to remove entire directories with the recursive=True option.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nfs.rm_file(\"my-repo/my-branch/my-file.txt\")\n\n# removes the entire `my-dir` directory.\nfs.rm(\"my-repo/my-branch/my-dir/\", recursive=True)\n
"},{"location":"guides/filesystem-usage/#copying-files-in-a-repository","title":"Copying files in a repository","text":"

To copy files on a branch or from one branch to another, use the fs.cp_file() or fs.copy() methods:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# copies a single file on the same branch to a new location.\nfs.cp_file(\"my-repo/branch-a/file.txt\", \"my-repo/branch-a/file.txt.bak\")\n\n# copies a single file from branch A to branch B.\nfs.cp_file(\"my-repo/branch-a/file.txt\", \"my-repo/branch-b/file.txt\")\n\n# copies the entire `my-dir` directory from branch A to branch B (which must exist).\nfs.copy(\"my-repo/branch-a/my-dir/\", \"my-repo/branch-b/my-dir/\", recursive=True)\n

Info

Files and directories can only be copied between branches in the same repository, not between different repositories.

Trying to copy to a non-existent branch will not create the branch.

"},{"location":"guides/integrations/","title":"How to use lakeFS-spec with third-party data science libraries","text":"

lakeFS-spec is built on top of the fsspec library, which allows third-party libraries to make use of its file system abstraction to offer high-level features. The fsspec documentation lists examples of its users, mostly data science libraries.

This user guide page adds more detail on how lakeFS-spec can be used with four prominent data science libraries.

Code Examples

The code examples assume access to an existing lakeFS server with a quickstart repository containing the sample data already set up.

Please see the Quickstart guide or lakeFS quickstart guide if you need guidance in getting started.

The relevant lines for the lakeFS-spec integration in the following code snippets are highlighted.

"},{"location":"guides/integrations/#pandas","title":"Pandas","text":"

Pandas can read and write data from remote locations, and uses fsspec for all URLs that are not local or HTTP(S).

This means that (almost) all pd.read_* and pd.DataFrame.to_* operations can benefit from the lakeFS integration offered by our library without any additional configuration. See the Pandas documentation on reading/writing remote files for additional details.

The following code snippet illustrates how to read and write Pandas data frames in various formats from/to a lakeFS repository in the context of a transaction:

import pandas as pd\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = pd.read_parquet(f\"lakefs://quickstart/{tx.branch.id}/lakes.parquet\")\n    german_lakes = lakes.query('Country == \"Germany\"')\n    german_lakes.to_csv(f\"lakefs://quickstart/{tx.branch.id}/german_lakes.csv\")\n\n    tx.commit(message=\"Add German lakes\")\n
"},{"location":"guides/integrations/#duckdb","title":"DuckDB","text":"

The DuckDB in-memory database management system includes support for fsspec file systems as part of its Python API (see the official documentation on using fsspec filesystems for details). This allows DuckDB to transparently query and store data located in lakeFS repositories through lakeFS-spec.

Similar to the example above, the following code snippet illustrates how to read and write data from/to a lakeFS repository in the context of a transaction through the DuckDB Python API:

import duckdb\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nduckdb.register_filesystem(fs)\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = duckdb.read_parquet(\"lakefs://quickstart/main/lakes.parquet\")\n    italian_lakes = duckdb.sql(\"SELECT * FROM lakes where Country='Italy'\")\n    italian_lakes.to_csv(f\"lakefs://quickstart/{tx.branch.id}/italian_lakes.csv\")\n\n    tx.commit(message=\"Add Italian lakes\")\n
  1. Makes the lakeFS-spec file system known to DuckDB (duckdb.register_filesystem(fsspec.filesystem(\"lakefs\")) can also be used to avoid the direct import of LakeFSFileSystem)
"},{"location":"guides/integrations/#polars","title":"Polars","text":"

Warning

There is an ongoing discussion in the Polars development team whether to remove support for fsspec file systems, with no clear outcome as of the time this page was written. Please refer to the discussion on the relevant GitHub issue in case you encounter any problems.

The Python API wrapper for the Rust-based Polars DataFrame library can access remote storage through fsspec, similar to Pandas (see the official documentation on cloud storage).

Again, the following code example demonstrates how to read a Parquet file and save a modified version back in CSV format to a lakeFS repository from Polars in the context of a transaction:

import polars as pl\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = pl.read_parquet(f\"lakefs://quickstart/{tx.branch.id}/lakes.parquet\")\n    us_lakes = lakes.filter(pl.col(\"Country\") == \"United States of America\")\n\n    with fs.open(f\"lakefs://quickstart/{tx.branch.id}/us_lakes.csv\", \"wb\") as f:\n        us_lakes.write_csv(f)\n\n    tx.commit(message=\"Add US lakes\")\n
  1. Polars does not support directly writing to remote storage through the pl.DataFrame.write_* API (see docs)
"},{"location":"guides/integrations/#pyarrow","title":"PyArrow","text":"

Apache Arrow and its Python API, PyArrow, can also use fsspec file systems to perform I/O operations on data objects. The documentation has additional details on using fsspec-compatible file systems with Arrow.

PyArrow read_* and write_* functions take an explicit filesystem parameter, which accepts any fsspec file system, such as the LakeFSFileSystem provided by this library.

The following example code illustrates the use of lakeFS-spec with PyArrow, reading a Parquet file and writing it back to a lakeFS repository as a partitioned CSV dataset in the context of a transaction:

import pyarrow as pa\nimport pyarrow.dataset as ds\nimport pyarrow.parquet as pq\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes_table = pq.read_table(f\"quickstart/{tx.branch.id}/lakes.parquet\", filesystem=fs)\n\n    ds.write_dataset(\n        lakes_table,\n        f\"quickstart/{tx.branch.id}/lakes\",\n        filesystem=fs,\n        format=\"csv\",\n        partitioning=ds.partitioning(pa.schema([lakes_table.schema.field(\"Country\")])),\n    )\n\n    tx.commit(\"Add partitioned lakes data set\")\n
"},{"location":"guides/transactions/","title":"Using transactions on the lakeFS file system","text":"

In addition to file operations, you can carry out versioning operations in your Python code using file system transactions.

Transactions in lakeFS-spec behave similarly to the transactions in the high-level lakeFS SDK: Both approaches create an ephemeral branch for a transaction, perform the operations in the context block on that ephemeral branch, and optionally merge it back into the source branch upon exiting the context manager.

They are an \"all or nothing\" proposition: If an error occurs during the transaction, the base branch is left unchanged.

The lakeFS-spec transaction inherits from fsspec transactions. For more information on fsspec transactions, see the official documentation.

"},{"location":"guides/transactions/#versioning-operations","title":"Versioning operations","text":"

The lakeFS file system's transaction is the intended place for conducting versioning operations between file transfers. The following example shows file uploads, each followed by a commit, with a tag applied at the end.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\") as tx:\n    fs.put_file(\"train-data.txt\", f\"repo/{tx.branch.id}/train-data.txt\")\n    tx.commit(message=\"Add training data\")\n    fs.put_file(\"test-data.txt\", f\"repo/{tx.branch.id}/test-data.txt\")\n    sha = tx.commit(message=\"Add test data\")\n    tx.tag(sha, name=\"My train-test split\")\n

The full list of supported lakeFS versioning operations (by default, these operations target the transaction branch):

  • commit, for creating a commit, optionally with attached metadata.
  • merge, for merging a given branch.
  • revert, for reverting a previous commit.
  • rev_parse, for parsing revisions like branch/tag names and SHA fragments into full commit SHAs.
  • tag, for creating a tag pointing to a commit.
"},{"location":"guides/transactions/#lifecycle-of-ephemeral-transaction-branches","title":"Lifecycle of ephemeral transaction branches","text":"

You can control the lifecycle for a transaction branch with the delete argument:

  • By default (delete=\"onsuccess\"), the branch is deleted after successful completion, and left in place in case of failure for debugging purposes.
  • If delete=\"always\", the branch is unconditionally deleted after the transaction regardless of its status.
  • Similarly, if delete=\"never\", the branch is unconditionally left in place after the transaction.

Additionally, the automerge keyword controls whether the transaction branch is merged after successful completion of the transaction. It has no effect if an error occurs over the course of the transaction.
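
A sketch combining both keywords (repository and branch names are placeholders; see the API reference for the exact signature):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# keep the transaction branch around after completion and skip the final merge\nwith fs.transaction(\"repo\", \"main\", automerge=False, delete=\"never\") as tx:\n    fs.put_file(\"my-file.txt\", f\"repo/{tx.branch.id}/my-file.txt\")\n    tx.commit(message=\"Add my-file.txt\")\n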

"},{"location":"guides/transactions/#error-handling","title":"Error handling","text":"

Since all files are uploaded to a short-lived transaction branch, no commit on the target branch happens in case of an exception:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\", delete=\"onsuccess\") as tx:\n    fs.put_file(\"my-file.txt\", f\"repo/{tx.branch.id}/my-file.txt\")\n    tx.commit(message=\"Add my-file.txt\")\n    raise ValueError(\"oops!\")\n

The above code will not modify the main branch, since the ValueError prevents the merge of the transaction branch. Note that you can examine the contents of the transaction branch due to delete=\"onsuccess\" (the default behavior), which prevents deletion of the branch in case of failure for debugging purposes.

"},{"location":"reference/SUMMARY/","title":"SUMMARY","text":"
  • lakefs_spec
    • errors
    • spec
    • transaction
    • util
"},{"location":"reference/lakefs_spec/","title":"lakefs_spec","text":"

lakefs-spec is an fsspec file system integration for the lakeFS data lake.

"},{"location":"reference/lakefs_spec/errors/","title":"errors","text":"

Error translation facilities to map lakeFS API errors to Python-native OS errors in the lakeFS file system.

This is important to honor the fsspec API contract, where users only need to expect builtin Python exceptions to avoid complicated error handling setups.

"},{"location":"reference/lakefs_spec/errors/#lakefs_spec.errors.translate_lakefs_error","title":"translate_lakefs_error","text":"
translate_lakefs_error(\n    error: ServerException,\n    rpath: str | None = None,\n    message: str | None = None,\n    set_cause: bool = True,\n) -> OSError\n

Convert a lakeFS server exception to a Python builtin exception.

For some subclasses of lakefs.exceptions.ServerException, a direct Python builtin equivalent exists. In these cases, the suitable equivalent is returned. All other classes are converted to a standard IOError.

PARAMETER DESCRIPTION error

The exception returned by the lakeFS SDK wrapper.

TYPE: ServerException

rpath

The remote resource path involved in the error.

TYPE: str | None DEFAULT: None

message

An error message to use for the returned exception. If not given, the error message returned by the lakeFS server is used instead.

TYPE: str | None DEFAULT: None

set_cause

Whether to set the __cause__ attribute to the previous exception if the exception is translated.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION OSError

A builtin Python exception ready to be thrown.

Source code in src/lakefs_spec/errors.py
def translate_lakefs_error(\n    error: ServerException,\n    rpath: str | None = None,\n    message: str | None = None,\n    set_cause: bool = True,\n) -> OSError:\n    \"\"\"\n    Convert a lakeFS server exception to a Python builtin exception.\n\n    For some subclasses of ``lakefs.exceptions.ServerException``, a direct Python builtin equivalent exists.\n    In these cases, the suitable equivalent is returned. All other classes are converted to a standard ``IOError``.\n\n    Parameters\n    ----------\n    error: ServerException\n        The exception returned by the lakeFS SDK wrapper.\n    rpath: str | None\n        The remote resource path involved in the error.\n    message: str | None\n        An error message to use for the returned exception.\n         If not given, the error message returned by the lakeFS server is used instead.\n    set_cause: bool\n        Whether to set the ``__cause__`` attribute to the previous exception if the exception is translated.\n\n    Returns\n    -------\n    OSError\n        A builtin Python exception ready to be thrown.\n    \"\"\"\n    status = error.status_code\n\n    if hasattr(error, \"body\"):\n        # error has a JSON response body attached\n        reason = error.body[\"message\"]\n    else:\n        reason = error.reason\n\n    emsg = f\"{status} {reason}\"\n    if rpath:\n        emsg += f\": {rpath!r}\"\n\n    constructor = HTTP_CODE_TO_ERROR.get(status, partial(IOError, errno.EIO))\n    custom_exc = constructor(message or emsg)\n\n    if set_cause:\n        custom_exc.__cause__ = error\n    return custom_exc\n
"},{"location":"reference/lakefs_spec/spec/","title":"spec","text":"

Core interface definitions for file system interaction with lakeFS from Python, namely the LakeFSFileSystem and LakeFSFile classes.

"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem","title":"LakeFSFileSystem","text":"

Bases: AbstractFileSystem

lakeFS file system implementation.

Instances of this class are cached based on their constructor arguments.

For more information, see the fsspec documentation https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching.

PARAMETER DESCRIPTION host

The address of your lakeFS instance.

TYPE: str | None DEFAULT: None

username

The access key name to use in case of access key authentication.

TYPE: str | None DEFAULT: None

password

The access key secret to use in case of access key authentication.

TYPE: str | None DEFAULT: None

api_key

The API key to use in case of authentication with an API key.

TYPE: str | None DEFAULT: None

api_key_prefix

A string prefix to use for the API key in authentication.

TYPE: str | None DEFAULT: None

access_token

An access token to use in case of access token authentication.

TYPE: str | None DEFAULT: None

verify_ssl

Whether to verify SSL certificates in API interactions. Do not disable in production.

TYPE: bool DEFAULT: True

ssl_ca_cert

A custom certificate PEM file to use to verify the peer in SSL connections.

TYPE: str | None DEFAULT: None

proxy

Proxy address to use when connecting to a lakeFS server.

TYPE: str | None DEFAULT: None

create_branch_ok

Whether to create branches implicitly when not-existing branches are referenced on file uploads.

TYPE: bool DEFAULT: True

source_branch

Source branch set as origin when a new branch is implicitly created.

TYPE: str DEFAULT: 'main'

**storage_options

Configuration options to pass to the file system's directory cache.

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
class LakeFSFileSystem(AbstractFileSystem):\n    \"\"\"\n    lakeFS file system implementation.\n\n    Instances of this class are cached based on their constructor arguments.\n\n    For more information, see the fsspec documentation <https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching>.\n\n    Parameters\n    ----------\n    host: str | None\n        The address of your lakeFS instance.\n    username: str | None\n        The access key name to use in case of access key authentication.\n    password: str | None\n        The access key secret to use in case of access key authentication.\n    api_key: str | None\n        The API key to use in case of authentication with an API key.\n    api_key_prefix: str | None\n        A string prefix to use for the API key in authentication.\n    access_token: str | None\n        An access token to use in case of access token authentication.\n    verify_ssl: bool\n        Whether to verify SSL certificates in API interactions. Do not disable in production.\n    ssl_ca_cert: str | None\n        A custom certificate PEM file to use to verify the peer in SSL connections.\n    proxy: str | None\n        Proxy address to use when connecting to a lakeFS server.\n    create_branch_ok: bool\n        Whether to create branches implicitly when not-existing branches are referenced on file uploads.\n    source_branch: str\n        Source branch set as origin when a new branch is implicitly created.\n    **storage_options: Any\n        Configuration options to pass to the file system's directory cache.\n    \"\"\"\n\n    protocol = \"lakefs\"\n\n    def __init__(\n        self,\n        host: str | None = None,\n        username: str | None = None,\n        password: str | None = None,\n        api_key: str | None = None,\n        api_key_prefix: str | None = None,\n        access_token: str | None = None,\n        verify_ssl: bool = True,\n        ssl_ca_cert: str | None = None,\n        proxy: str | None = None,\n        create_branch_ok: bool = True,\n        source_branch: str = \"main\",\n        **storage_options: Any,\n    ):\n        super().__init__(**storage_options)\n\n        # lakeFS client arguments\n        cargs = [host, username, password, api_key, api_key_prefix, access_token, ssl_ca_cert]\n\n        if all(arg is None for arg in cargs):\n            # empty kwargs means envvar and configfile autodiscovery\n            self.client = Client()\n        else:\n            self.client = Client(\n                host=host,\n                username=username,\n                password=password,\n                api_key=api_key,\n                api_key_prefix=api_key_prefix,\n                access_token=access_token,\n                ssl_ca_cert=ssl_ca_cert,\n            )\n\n        # proxy address, not part of the constructor\n        self.client.config.proxy = proxy\n        # whether to verify SSL certs, not part of the constructor\n        self.client.config.verify_ssl = verify_ssl\n\n        self.create_branch_ok = create_branch_ok\n        self.source_branch = source_branch\n\n    @cached_property\n    def _lakefs_server_version(self):\n        with self.wrapped_api_call():\n            return tuple(int(t) for t in self.client.version.split(\".\"))\n\n    @classmethod\n    @overload\n    def _strip_protocol(cls, path: str | os.PathLike[str] | Path) -> str:\n        ...\n\n    @classmethod\n    @overload\n    def _strip_protocol(cls, path: list[str | os.PathLike[str] | Path]) -> list[str]:\n        ...\n\n    
@classmethod\n    def _strip_protocol(cls, path):\n        \"\"\"Copied verbatim from the base class, save for the slash rstrip.\"\"\"\n        if isinstance(path, list):\n            return [cls._strip_protocol(p) for p in path]\n        spath = super()._strip_protocol(path)\n        if stringify_path(path).endswith(\"/\"):\n            return spath + \"/\"\n        return spath\n\n    @property\n    def transaction(self) -> LakeFSTransaction:\n        \"\"\"\n        A context manager within which file uploads and versioning operations are deferred to a\n        queue, and carried out during when exiting the context.\n\n        Requires the file class to implement ``.commit()`` and ``.discard()`` for the normal and exception cases.\n        \"\"\"\n        self._transaction: LakeFSTransaction | None\n        if self._transaction is None:\n            self._transaction = LakeFSTransaction(self)\n        return self._transaction\n\n    def start_transaction(self):\n        raise NotImplementedError(\n            \"lakeFS transactions should only be used as a context manager via\"\n            \" `with LakeFSFileSystem.transaction as tx:`\"\n        )\n\n    @contextmanager\n    def wrapped_api_call(\n        self, rpath: str | None = None, message: str | None = None, set_cause: bool = True\n    ) -> Generator[None, None, None]:\n        \"\"\"\n        A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.\n\n        Meant for internal use.\n\n        Parameters\n        ----------\n        rpath: str | None\n            The remote path involved in the requested API call.\n        message: str | None\n            A custom error message to emit instead of parsing the API error response.\n        set_cause: bool\n            Whether to include the original lakeFS API error in the resulting traceback.\n\n        Yields\n        ------\n        None\n            An empty generator, to be used as a context manager.\n\n        Raises\n        ------\n        OSError\n            Translated error from the lakeFS API call, if any.\n        \"\"\"\n        try:\n            yield\n        except ServerException as e:\n            raise translate_lakefs_error(e, rpath=rpath, message=message, set_cause=set_cause)\n\n    def checksum(self, path: str | os.PathLike[str]) -> str | None:\n        \"\"\"\n        Get a remote lakeFS file object's checksum.\n\n        This is usually its MD5 hash, unless another hash function was used on upload.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path to look up the lakeFS checksum for. Must point to a single file object.\n\n        Returns\n        -------\n        str | None\n            The remote file's checksum, or ``None`` if ``path`` points to a directory or does not exist.\n        \"\"\"\n        path = stringify_path(path)\n        try:\n            return self.info(path).get(\"checksum\")\n        except FileNotFoundError:\n            return None\n\n    def exists(self, path: str | os.PathLike[str], **kwargs: Any) -> bool:\n        \"\"\"\n        Check existence of a remote path in a lakeFS repository.\n\n        Input paths can either be files or directories.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path whose existence to check. 
Must be a fully qualified lakeFS URI.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Returns\n        -------\n        bool\n            ``True`` if the requested path exists, ``False`` if it does not.\n\n        Raises\n        ------\n        PermissionError\n            If the user does not have sufficient permissions to query object existence.\n        \"\"\"\n        path = stringify_path(path)\n        repository, ref, resource = parse(path)\n        try:\n            reference = lakefs.Reference(repository, ref, client=self.client)\n            return reference.object(resource).exists()\n        except ServerException as e:\n            # in case of an error other than \"not found\", existence cannot be\n            # decided, so raise the translated error.\n            raise translate_lakefs_error(e)\n\n    def cp_file(\n        self, path1: str | os.PathLike[str], path2: str | os.PathLike[str], **kwargs: Any\n    ) -> None:\n        \"\"\"\n        Copy a single file from one remote location to another in lakeFS.\n\n        Parameters\n        ----------\n        path1: str | os.PathLike[str]\n            The remote file location to be copied.\n        path2: str | os.PathLike[str]\n            The (remote) target location to which to copy the file.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Raises\n        ------\n        ValueError\n            When attempting to copy objects between repositories.\n        \"\"\"\n        path1 = stringify_path(path1)\n        path2 = stringify_path(path2)\n        if path1 == path2:\n            return\n\n        orig_repo, orig_ref, orig_path = parse(path1)\n        dest_repo, dest_ref, dest_path = parse(path2)\n\n        if orig_repo != dest_repo:\n            raise ValueError(\n                \"can only copy objects within a repository, but got source \"\n                f\"repository {orig_repo!r} and destination repository {dest_repo!r}\"\n            )\n\n        with self.wrapped_api_call():\n            reference = lakefs.Reference(orig_repo, orig_ref, client=self.client)\n            reference.object(orig_path).copy(dest_ref, dest_path)\n\n    def get_file(\n        self,\n        rpath: str | os.PathLike[str],\n        lpath: str | os.PathLike[str],\n        callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n        outfile: Any = None,\n        precheck: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"\n        Download a single file from a remote lakeFS server to local storage.\n\n        Parameters\n        ----------\n        rpath: str | os.PathLike[str]\n            The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.\n        lpath: str | os.PathLike[str]\n            The local path on disk to save the downloaded file to.\n        callback: fsspec.callbacks.Callback\n            An fsspec callback to use during the operation. Can be used to report download progress.\n        outfile: Any\n            A file-like object to save the downloaded content to. 
Can be used in place of ``lpath``.\n        precheck: bool\n            Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n        **kwargs: Any\n            Additional keyword arguments passed to ``AbstractFileSystem.open()``.\n        \"\"\"\n        rpath = stringify_path(rpath)\n        lpath = stringify_path(lpath)\n\n        if precheck and Path(lpath).is_file():\n            local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n            remote_checksum = self.info(rpath).get(\"checksum\")\n            if local_checksum == remote_checksum:\n                logger.info(\n                    f\"Skipping download of resource {rpath!r} to local path {lpath!r}: \"\n                    f\"Resource {lpath!r} exists and checksums match.\"\n                )\n                return\n\n        with self.wrapped_api_call(rpath=rpath):\n            super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs)\n\n    def info(self, path: str | os.PathLike[str], **kwargs: Any) -> dict[str, Any]:\n        \"\"\"\n        Query a remote lakeFS object's metadata.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.ls()`` if ``path`` points to a directory.\n\n        Returns\n        -------\n        dict[str, Any]\n            A dictionary containing metadata on the object, including its full remote path and object type (file or directory).\n\n        Raises\n        ------\n        FileNotFoundError\n            If the ``path`` refers to a non-file path that does not exist in the repository.\n        \"\"\"\n        path = stringify_path(path)\n        repository, ref, resource = parse(path)\n        # first, try with `stat_object` in case of a file.\n        # the condition below checks edge cases of resources that cannot be files.\n        if resource and not resource.endswith(\"/\"):\n            try:\n                reference = lakefs.Reference(repository, ref, client=self.client)\n                res = reference.object(resource).stat()\n                return {\n                    \"checksum\": res.checksum,\n                    \"content-type\": res.content_type,\n                    \"mtime\": res.mtime,\n                    \"name\": f\"{repository}/{ref}/{res.path}\",\n                    \"size\": res.size_bytes,\n                    \"type\": \"file\",\n                }\n            except NotFoundException:\n                # fall through, retry with `ls` if it's a directory.\n                pass\n            except ServerException as e:\n                raise translate_lakefs_error(e, rpath=path)\n\n        out = self.ls(path, detail=True, recursive=True, **kwargs)\n        if not out:\n            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)\n\n        return {\n            \"name\": path.rstrip(\"/\"),\n            \"size\": sum(o.get(\"size\") or 0 for o in out),\n            \"type\": \"directory\",\n        }\n\n    def _update_dircache(self, info: list) -> None:\n        \"\"\"Update logic for dircache (optionally recursive) based on lakeFS API response\"\"\"\n        parents = {self._parent(i[\"name\"].rstrip(\"/\")) for i in info}\n        for pp in parents:\n            # subset of info entries 
which are direct descendants of `parent`\n            dir_info = [i for i in info if self._parent(i[\"name\"].rstrip(\"/\")) == pp]\n            if pp not in self.dircache:\n                self.dircache[pp] = dir_info\n                continue\n\n            # Merge existing dircache entry with updated listing, which contains either:\n            # - files not present in the cache yet\n            # - a fresh listing (if `refresh=True`)\n\n            cache_entry = self.dircache[pp][:]\n\n            old_names = {e[\"name\"] for e in cache_entry}\n            new_names = {e[\"name\"] for e in dir_info}\n\n            to_remove = old_names - new_names\n            to_update = old_names.intersection(new_names)\n\n            # Remove all entries no longer present in the current listing\n            cache_entry = [e for e in cache_entry if e[\"name\"] not in to_remove]\n\n            # Overwrite existing entries in the cache with its updated values\n            for name in to_update:\n                old_idx = next(idx for idx, e in enumerate(cache_entry) if e[\"name\"] == name)\n                new_entry = next(e for e in info if e[\"name\"] == name)\n\n                cache_entry[old_idx] = new_entry\n                dir_info.remove(new_entry)\n\n            # Add the remaining (new) entries to the cache\n            cache_entry.extend(dir_info)\n            self.dircache[pp] = sorted(cache_entry, key=operator.itemgetter(\"name\"))\n\n    def _ls_from_cache(self, path: str, recursive: bool = False) -> list[dict[str, Any]] | None:\n        \"\"\"Override of ``AbstractFileSystem._ls_from_cache`` with support for recursive listings.\"\"\"\n        if not recursive:\n            return super()._ls_from_cache(path)\n\n        result = None\n        for key, files in self.dircache.items():\n            if not (key.startswith(path) or path == key + \"/\"):\n                continue\n            if result is None:\n                result = []\n            result.extend(files)\n        if not result:\n            return result\n        return sorted(result, key=operator.itemgetter(\"name\"))\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: Literal[True] = ...,\n        **kwargs: Any,\n    ) -> list[dict[str, Any]]:\n        ...\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: Literal[False],\n        **kwargs: Any,\n    ) -> list[str]:\n        ...\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: bool = True,\n        **kwargs: Any,\n    ) -> list[str] | list[dict[str, Any]]:\n        ...\n\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: bool = True,\n        **kwargs: Any,\n    ) -> list[str] | list[dict[str, Any]]:\n        \"\"\"\n        List all available objects under a given path in lakeFS.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The path under which to list objects. 
Must be a fully qualified lakeFS URI.\n            Can also point to a file, in which case the file's metadata will be returned.\n        detail: bool\n            Whether to obtain all metadata on the requested objects or just their names.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility.\n\n            In particular:\n                `refresh: bool`: whether to skip the directory listing cache,\n                `recursive: bool`: whether to list subdirectory contents recursively\n\n        Returns\n        -------\n        list[str] | list[dict[str, Any]]\n            A list of all objects' metadata under the given remote path if ``detail=True``, or alternatively only their names if ``detail=False``.\n        \"\"\"\n        path = self._strip_protocol(path)\n        repository, ref, prefix = parse(path)\n\n        recursive = kwargs.pop(\"recursive\", False)\n\n        # Try lookup in dircache unless explicitly disabled by `refresh=True` kwarg\n        use_dircache = not kwargs.pop(\"refresh\", False)\n\n        if use_dircache:\n            cache_entry: list[Any] | None = None\n            try:\n                cache_entry = self._ls_from_cache(path, recursive=recursive)\n            except FileNotFoundError:\n                # we patch files missing from an ls call in the cache entry below,\n                # so this should not be an error.\n                pass\n\n            if cache_entry is not None:\n                if not detail:\n                    return [e[\"name\"] for e in cache_entry]\n                return cache_entry[:]\n\n        kwargs[\"prefix\"] = prefix\n\n        info = []\n        # stat infos are either the path only (`detail=False`) or a dict full of metadata\n        delimiter = \"\" if recursive else \"/\"\n        reference = lakefs.Reference(repository, ref, client=self.client)\n\n        with self.wrapped_api_call(rpath=path):\n            for obj in reference.objects(prefix=prefix, delimiter=delimiter):\n                if isinstance(obj, CommonPrefix):\n                    # prefixes are added below.\n                    info.append(\n                        {\n                            \"name\": f\"{repository}/{ref}/{obj.path}\",\n                            \"size\": 0,\n                            \"type\": \"directory\",\n                        }\n                    )\n                elif isinstance(obj, ObjectInfo):\n                    info.append(\n                        {\n                            \"checksum\": obj.checksum,\n                            \"content-type\": obj.content_type,\n                            \"mtime\": obj.mtime,\n                            \"name\": f\"{repository}/{ref}/{obj.path}\",\n                            \"size\": obj.size_bytes,\n                            \"type\": \"object\",\n                        }\n                    )\n\n        # Retry the API call with appended slash if the current result\n        # is just a single directory entry only (not its contents).\n        # This is useful to allow `ls(\"repo/branch/dir\")` calls without a trailing slash.\n        if len(info) == 1 and info[0][\"type\"] == \"directory\" and info[0][\"name\"] == path + \"/\":\n            return self.ls(\n                path + \"/\",\n                detail=detail,\n                **kwargs | {\"refresh\": not use_dircache, \"recursive\": recursive},\n            )\n\n        if recursive:\n            # To make recursive ls behave identical to the non-recursive case,\n  
          # add back virtual `directory` entries, which are only returned by\n            # the lakeFS API when querying non-recursively.\n            here = self._strip_protocol(path).rstrip(\"/\")\n            subdirs = {parent for o in info if (parent := self._parent(o[\"name\"])) != here}\n            for subdir in subdirs:\n                info.append(\n                    {\n                        \"name\": subdir + \"/\",\n                        \"size\": 0,\n                        \"type\": \"directory\",\n                    }\n                )\n\n        if info:\n            self._update_dircache(info[:])\n\n        if not detail:\n            info = [o[\"name\"] for o in info]  # type: ignore\n\n        return info\n\n    def open(\n        self,\n        path: str | os.PathLike[str],\n        mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n        pre_sign: bool = False,\n        content_type: str | None = None,\n        metadata: dict[str, str] | None = None,\n        autocommit: bool = True,\n        **kwargs: Any,\n    ) -> LakeFSIOBase:\n        \"\"\"\n        Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on ``mode``.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path for which to open a local ``LakeFSFile``. Must be a fully qualified lakeFS URI.\n        mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"]\n            The file mode indicating its purpose. Use ``r/rb`` for downloads from lakeFS, ``w/wb/x/xb`` for uploads to lakeFS.\n        pre_sign: bool\n            Whether to use a pre-signed URL for the file up-/download.\n        content_type: str | None\n            Content type to use for the file, relevant for uploads only.\n        metadata: dict[str, str] | None\n            Additional metadata to attach to the file, relevant for uploads only.\n        autocommit: bool\n            Whether to process the file immediately instead of queueing it for transaction while in a transaction context.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Returns\n        -------\n        LakeFSIOBase\n            A local file-like object ready to hold data to be received from / sent to a lakeFS server.\n\n        Raises\n        ------\n        NotImplementedError\n            If ``mode`` is not supported.\n        \"\"\"\n        if mode.endswith(\"t\"):\n            # text modes {r,w,x}t are equivalent to {r,w,x} here respectively.\n            mode = mode[:-1]  # type: ignore\n\n        if mode not in {\"r\", \"rb\", \"w\", \"wb\", \"x\", \"xb\"}:\n            raise NotImplementedError(f\"unsupported mode {mode!r}\")\n\n        path = stringify_path(path)\n        repo, ref, resource = parse(path)\n\n        if mode.startswith(\"r\"):\n            reference = lakefs.Reference(repo, ref, client=self.client)\n            obj = reference.object(resource)\n\n            if not obj.exists():\n                raise FileNotFoundError(path)\n            handler = ObjectReader(obj, mode=mode, pre_sign=pre_sign, client=self.client)\n        else:\n            # for writing ops, ref must be a branch\n            branch = lakefs.Branch(repo, ref, client=self.client)\n            if self.create_branch_ok:\n                branch.create(self.source_branch, exist_ok=True)\n\n            obj = branch.object(resource)\n         
   handler = ObjectWriter(\n                obj,\n                mode=mode,\n                pre_sign=pre_sign,\n                content_type=content_type,\n                metadata=metadata,\n                client=self.client,\n            )\n\n        ac = kwargs.pop(\"autocommit\", not self._intrans)\n        if not ac and \"r\" not in mode:\n            self._transaction.files.append(handler)\n\n        return handler\n\n    def put_file(\n        self,\n        lpath: str | os.PathLike[str],\n        rpath: str | os.PathLike[str],\n        callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n        precheck: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"\n        Upload a local file to a remote location on a lakeFS server.\n\n        Note that depending on the block store type, additional configuration like credentials may need to be configured when ``use_blockstore=True`` and ``presign=False``.\n\n        Parameters\n        ----------\n        lpath: str | os.PathLike[str]\n            The local path on disk to upload to the lakeFS server.\n        rpath: str | os.PathLike[str]\n            The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.\n        callback: fsspec.callbacks.Callback\n            An fsspec callback to use during the operation. Can be used to report download progress.\n        precheck: bool\n            Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n        \"\"\"\n        lpath = stringify_path(lpath)\n        rpath = stringify_path(rpath)\n\n        if precheck and Path(lpath).is_file():\n            remote_checksum = self.checksum(rpath)\n            local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n            if local_checksum == remote_checksum:\n                logger.info(\n                    f\"Skipping upload of resource {lpath!r} to remote path {rpath!r}: \"\n                    f\"Resource {rpath!r} exists and checksums match.\"\n                )\n                return\n\n        with self.wrapped_api_call(rpath=rpath):\n            super().put_file(lpath, rpath, callback=callback, **kwargs)\n\n    def rm_file(self, path: str | os.PathLike[str]) -> None:  # pragma: no cover\n        \"\"\"\n        Stage a remote file for removal on a lakeFS server.\n\n        The file will not actually be removed from the requested branch until a commit is created.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote file to delete. 
Must be a fully qualified lakeFS URI.\n        \"\"\"\n        self.rm(path)\n\n    def rm(\n        self, path: str | os.PathLike[str], recursive: bool = False, maxdepth: int | None = None\n    ) -> None:\n        \"\"\"\n        Stage multiple remote files for removal on a lakeFS server.\n\n        The files will not actually be removed from the requested branch until a commit is created.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            File(s) to delete.\n        recursive: bool\n            If file(s) include nested directories, recursively delete their contents.\n        maxdepth: int | None\n            Depth to pass to walk for finding files to delete, if recursive.\n            If None, there will be no limit and infinite recursion may be\n            possible.\n        \"\"\"\n\n        path = stringify_path(path)\n        repository, ref, prefix = parse(path)\n\n        with self.wrapped_api_call(rpath=path):\n            branch = lakefs.Branch(repository, ref, client=self.client)\n            objgen = branch.objects(prefix=prefix, delimiter=\"\" if recursive else \"/\")\n            if maxdepth is None:\n                branch.delete_objects(obj.path for obj in objgen)\n            else:\n                # nesting level is just the amount of \"/\"s in the path, no leading \"/\".\n                branch.delete_objects(obj.path for obj in objgen if obj.path.count(\"/\") <= maxdepth)\n\n            # Directory listing cache for the containing folder must be invalidated\n            self.dircache.pop(self._parent(path), None)\n\n    def touch(self, path: str | os.PathLike[str], truncate: bool = True, **kwargs: Any) -> None:\n        \"\"\"\n        Create an empty file or update an existing file on a lakeFS server.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The file path to create or update. Must be a fully qualified lakeFS URI.\n        truncate: bool\n            Whether to set the file size to 0 (zero) bytes, even if the path already exists.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n\n        Raises\n        ------\n        NotImplementedError\n            If the targeted lakeFS server version does not support `touch()` operations.\n        \"\"\"\n\n        # empty buffer upload errors were fixed in https://github.com/treeverse/lakeFS/issues/7130,\n        # which was first released in lakeFS v1.3.1.\n        if self._lakefs_server_version < (1, 3, 1):\n            version_string = \".\".join(str(v) for v in self._lakefs_server_version)\n            raise NotImplementedError(\n                \"LakeFSFileSystem.touch() is not supported for your lakeFS server version. \"\n                f\"minimum required version: '1.3.1', actual version: {version_string!r}\"\n            )\n\n        super().touch(path=path, truncate=truncate, **kwargs)\n\n    def tail(self, path: str | os.PathLike[str], size: int = 1024) -> bytes:\n        \"\"\"\n        Get the last ``size`` bytes from a remote file.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The file path to read. 
Must be a fully qualified lakeFS URI.\n        size: int\n            The number of bytes to read from the end of the file.\n\n        Returns\n        -------\n        bytes\n            The bytes at the end of the requested file.\n        \"\"\"\n        f: ObjectReader\n        with self.open(path, \"rb\") as f:\n            f.seek(max(-size, -f._obj.stat().size_bytes), 2)\n            return f.read()\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.transaction","title":"transaction property","text":"
transaction: LakeFSTransaction\n

A context manager within which file uploads and versioning operations are deferred to a queue and carried out when exiting the context.

Requires the file class to implement .commit() and .discard() for the normal and exception cases.
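A minimal usage sketch of the transaction context manager (the repository name "repo", the base branch "main", and the file name are placeholders; client credentials are assumed to be configured already, e.g. via environment variables or ~/.lakectl.yaml):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Upload onto the ephemeral transaction branch, commit, and (by default) merge back into main on success.\nwith fs.transaction(\"repo\", \"main\") as tx:\n    fs.put_file(\"train.csv\", f\"repo/{tx.branch.id}/train.csv\")\n    tx.commit(message=\"Add training data\")\n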

"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.wrapped_api_call","title":"wrapped_api_call","text":"
wrapped_api_call(\n    rpath: str | None = None, message: str | None = None, set_cause: bool = True\n) -> Generator[None, None, None]\n

A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.

Meant for internal use.

PARAMETER DESCRIPTION rpath

The remote path involved in the requested API call.

TYPE: str | None DEFAULT: None

message

A custom error message to emit instead of parsing the API error response.

TYPE: str | None DEFAULT: None

set_cause

Whether to include the original lakeFS API error in the resulting traceback.

TYPE: bool DEFAULT: True

YIELDS DESCRIPTION None

An empty generator, to be used as a context manager.

RAISES DESCRIPTION OSError

Translated error from the lakeFS API call, if any.

Source code in src/lakefs_spec/spec.py
@contextmanager\ndef wrapped_api_call(\n    self, rpath: str | None = None, message: str | None = None, set_cause: bool = True\n) -> Generator[None, None, None]:\n    \"\"\"\n    A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.\n\n    Meant for internal use.\n\n    Parameters\n    ----------\n    rpath: str | None\n        The remote path involved in the requested API call.\n    message: str | None\n        A custom error message to emit instead of parsing the API error response.\n    set_cause: bool\n        Whether to include the original lakeFS API error in the resulting traceback.\n\n    Yields\n    ------\n    None\n        An empty generator, to be used as a context manager.\n\n    Raises\n    ------\n    OSError\n        Translated error from the lakeFS API call, if any.\n    \"\"\"\n    try:\n        yield\n    except ServerException as e:\n        raise translate_lakefs_error(e, rpath=rpath, message=message, set_cause=set_cause)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.checksum","title":"checksum","text":"
checksum(path: str | PathLike[str]) -> str | None\n

Get a remote lakeFS file object's checksum.

This is usually its MD5 hash, unless another hash function was used on upload.

PARAMETER DESCRIPTION path

The remote path to look up the lakeFS checksum for. Must point to a single file object.

TYPE: str | PathLike[str]

RETURNS DESCRIPTION str | None

The remote file's checksum, or None if path points to a directory or does not exist.

Source code in src/lakefs_spec/spec.py
def checksum(self, path: str | os.PathLike[str]) -> str | None:\n    \"\"\"\n    Get a remote lakeFS file object's checksum.\n\n    This is usually its MD5 hash, unless another hash function was used on upload.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path to look up the lakeFS checksum for. Must point to a single file object.\n\n    Returns\n    -------\n    str | None\n        The remote file's checksum, or ``None`` if ``path`` points to a directory or does not exist.\n    \"\"\"\n    path = stringify_path(path)\n    try:\n        return self.info(path).get(\"checksum\")\n    except FileNotFoundError:\n        return None\n
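A short checksum lookup sketch (repository, branch, and object path are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n# Usually an MD5 hex digest; None is returned for directories and non-existent paths.\nprint(fs.checksum(\"repo/main/data/train.csv\"))\n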
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.exists","title":"exists","text":"
exists(path: str | PathLike[str], **kwargs: Any) -> bool\n

Check existence of a remote path in a lakeFS repository.

Input paths can either be files or directories.

PARAMETER DESCRIPTION path

The remote path whose existence to check. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION bool

True if the requested path exists, False if it does not.

RAISES DESCRIPTION PermissionError

If the user does not have sufficient permissions to query object existence.

Source code in src/lakefs_spec/spec.py
def exists(self, path: str | os.PathLike[str], **kwargs: Any) -> bool:\n    \"\"\"\n    Check existence of a remote path in a lakeFS repository.\n\n    Input paths can either be files or directories.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path whose existence to check. Must be a fully qualified lakeFS URI.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Returns\n    -------\n    bool\n        ``True`` if the requested path exists, ``False`` if it does not.\n\n    Raises\n    ------\n    PermissionError\n        If the user does not have sufficient permissions to query object existence.\n    \"\"\"\n    path = stringify_path(path)\n    repository, ref, resource = parse(path)\n    try:\n        reference = lakefs.Reference(repository, ref, client=self.client)\n        return reference.object(resource).exists()\n    except ServerException as e:\n        # in case of an error other than \"not found\", existence cannot be\n        # decided, so raise the translated error.\n        raise translate_lakefs_error(e)\n
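For illustration, an existence check against a hypothetical object (all names below are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nif fs.exists(\"repo/main/data/train.csv\"):\n    print(\"object present on main\")\n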
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.cp_file","title":"cp_file","text":"
cp_file(path1: str | PathLike[str], path2: str | PathLike[str], **kwargs: Any) -> None\n

Copy a single file from one remote location to another in lakeFS.

PARAMETER DESCRIPTION path1

The remote file location to be copied.

TYPE: str | PathLike[str]

path2

The (remote) target location to which to copy the file.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RAISES DESCRIPTION ValueError

When attempting to copy objects between repositories.

Source code in src/lakefs_spec/spec.py
def cp_file(\n    self, path1: str | os.PathLike[str], path2: str | os.PathLike[str], **kwargs: Any\n) -> None:\n    \"\"\"\n    Copy a single file from one remote location to another in lakeFS.\n\n    Parameters\n    ----------\n    path1: str | os.PathLike[str]\n        The remote file location to be copied.\n    path2: str | os.PathLike[str]\n        The (remote) target location to which to copy the file.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Raises\n    ------\n    ValueError\n        When attempting to copy objects between repositories.\n    \"\"\"\n    path1 = stringify_path(path1)\n    path2 = stringify_path(path2)\n    if path1 == path2:\n        return\n\n    orig_repo, orig_ref, orig_path = parse(path1)\n    dest_repo, dest_ref, dest_path = parse(path2)\n\n    if orig_repo != dest_repo:\n        raise ValueError(\n            \"can only copy objects within a repository, but got source \"\n            f\"repository {orig_repo!r} and destination repository {dest_repo!r}\"\n        )\n\n    with self.wrapped_api_call():\n        reference = lakefs.Reference(orig_repo, orig_ref, client=self.client)\n        reference.object(orig_path).copy(dest_ref, dest_path)\n
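A sketch of a copy between two branches of the same repository (branch and path names are placeholders; cross-repository copies raise ValueError as noted above):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n# Copy within one repository, from the main branch to a dev branch.\nfs.cp_file(\"repo/main/data/train.csv\", \"repo/dev/data/train.csv\")\n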
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.get_file","title":"get_file","text":"
get_file(\n    rpath: str | PathLike[str],\n    lpath: str | PathLike[str],\n    callback: Callback = _DEFAULT_CALLBACK,\n    outfile: Any = None,\n    precheck: bool = True,\n    **kwargs: Any\n) -> None\n

Download a single file from a remote lakeFS server to local storage.

PARAMETER DESCRIPTION rpath

The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.

TYPE: str | PathLike[str]

lpath

The local path on disk to save the downloaded file to.

TYPE: str | PathLike[str]

callback

An fsspec callback to use during the operation. Can be used to report download progress.

TYPE: Callback DEFAULT: _DEFAULT_CALLBACK

outfile

A file-like object to save the downloaded content to. Can be used in place of lpath.

TYPE: Any DEFAULT: None

precheck

Check if lpath already exists and compare its checksum with that of rpath, skipping the download if they match.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments passed to AbstractFileSystem.open().

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
def get_file(\n    self,\n    rpath: str | os.PathLike[str],\n    lpath: str | os.PathLike[str],\n    callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n    outfile: Any = None,\n    precheck: bool = True,\n    **kwargs: Any,\n) -> None:\n    \"\"\"\n    Download a single file from a remote lakeFS server to local storage.\n\n    Parameters\n    ----------\n    rpath: str | os.PathLike[str]\n        The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.\n    lpath: str | os.PathLike[str]\n        The local path on disk to save the downloaded file to.\n    callback: fsspec.callbacks.Callback\n        An fsspec callback to use during the operation. Can be used to report download progress.\n    outfile: Any\n        A file-like object to save the downloaded content to. Can be used in place of ``lpath``.\n    precheck: bool\n        Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n    **kwargs: Any\n        Additional keyword arguments passed to ``AbstractFileSystem.open()``.\n    \"\"\"\n    rpath = stringify_path(rpath)\n    lpath = stringify_path(lpath)\n\n    if precheck and Path(lpath).is_file():\n        local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n        remote_checksum = self.info(rpath).get(\"checksum\")\n        if local_checksum == remote_checksum:\n            logger.info(\n                f\"Skipping download of resource {rpath!r} to local path {lpath!r}: \"\n                f\"Resource {lpath!r} exists and checksums match.\"\n            )\n            return\n\n    with self.wrapped_api_call(rpath=rpath):\n        super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs)\n
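A download sketch (paths are placeholders); with precheck=True the download is skipped when a local file with a matching checksum already exists:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nfs.get_file(\"repo/main/data/train.csv\", \"train.csv\", precheck=True)\n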
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.info","title":"info","text":"
info(path: str | PathLike[str], **kwargs: Any) -> dict[str, Any]\n

Query a remote lakeFS object's metadata.

PARAMETER DESCRIPTION path

The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.ls() if path points to a directory.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION dict[str, Any]

A dictionary containing metadata on the object, including its full remote path and object type (file or directory).

RAISES DESCRIPTION FileNotFoundError

If path neither points to an existing file nor to an existing directory prefix in the repository.

Source code in src/lakefs_spec/spec.py
def info(self, path: str | os.PathLike[str], **kwargs: Any) -> dict[str, Any]:\n    \"\"\"\n    Query a remote lakeFS object's metadata.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.ls()`` if ``path`` points to a directory.\n\n    Returns\n    -------\n    dict[str, Any]\n        A dictionary containing metadata on the object, including its full remote path and object type (file or directory).\n\n    Raises\n    ------\n    FileNotFoundError\n        If the ``path`` refers to a non-file path that does not exist in the repository.\n    \"\"\"\n    path = stringify_path(path)\n    repository, ref, resource = parse(path)\n    # first, try with `stat_object` in case of a file.\n    # the condition below checks edge cases of resources that cannot be files.\n    if resource and not resource.endswith(\"/\"):\n        try:\n            reference = lakefs.Reference(repository, ref, client=self.client)\n            res = reference.object(resource).stat()\n            return {\n                \"checksum\": res.checksum,\n                \"content-type\": res.content_type,\n                \"mtime\": res.mtime,\n                \"name\": f\"{repository}/{ref}/{res.path}\",\n                \"size\": res.size_bytes,\n                \"type\": \"file\",\n            }\n        except NotFoundException:\n            # fall through, retry with `ls` if it's a directory.\n            pass\n        except ServerException as e:\n            raise translate_lakefs_error(e, rpath=path)\n\n    out = self.ls(path, detail=True, recursive=True, **kwargs)\n    if not out:\n        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)\n\n    return {\n        \"name\": path.rstrip(\"/\"),\n        \"size\": sum(o.get(\"size\") or 0 for o in out),\n        \"type\": \"directory\",\n    }\n
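A metadata lookup sketch (paths are placeholders); files and directory prefixes yield different type entries:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nfile_info = fs.info(\"repo/main/data/train.csv\")  # {\"type\": \"file\", \"checksum\": ..., \"size\": ...}\ndir_info = fs.info(\"repo/main/data/\")  # {\"type\": \"directory\", \"size\": sum of contained object sizes}\n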
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.ls","title":"ls","text":"
ls(\n    path: str | PathLike[str], detail: bool = True, **kwargs: Any\n) -> list[str] | list[dict[str, Any]]\n

List all available objects under a given path in lakeFS.

PARAMETER DESCRIPTION path

The path under which to list objects. Must be a fully qualified lakeFS URI. Can also point to a file, in which case the file's metadata will be returned.

TYPE: str | PathLike[str]

detail

Whether to obtain all metadata on the requested objects or just their names.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments for fsspec compatibility.

In particular, refresh (bool): whether to skip the directory listing cache, and recursive (bool): whether to list subdirectory contents recursively.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION list[str] | list[dict[str, Any]]

A list of all objects' metadata under the given remote path if detail=True, or only their names if detail=False.

Source code in src/lakefs_spec/spec.py
def ls(\n    self,\n    path: str | os.PathLike[str],\n    detail: bool = True,\n    **kwargs: Any,\n) -> list[str] | list[dict[str, Any]]:\n    \"\"\"\n    List all available objects under a given path in lakeFS.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The path under which to list objects. Must be a fully qualified lakeFS URI.\n        Can also point to a file, in which case the file's metadata will be returned.\n    detail: bool\n        Whether to obtain all metadata on the requested objects or just their names.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility.\n\n        In particular:\n            `refresh: bool`: whether to skip the directory listing cache,\n            `recursive: bool`: whether to list subdirectory contents recursively\n\n    Returns\n    -------\n    list[str] | list[dict[str, Any]]\n        A list of all objects' metadata under the given remote path if ``detail=True``, or alternatively only their names if ``detail=False``.\n    \"\"\"\n    path = self._strip_protocol(path)\n    repository, ref, prefix = parse(path)\n\n    recursive = kwargs.pop(\"recursive\", False)\n\n    # Try lookup in dircache unless explicitly disabled by `refresh=True` kwarg\n    use_dircache = not kwargs.pop(\"refresh\", False)\n\n    if use_dircache:\n        cache_entry: list[Any] | None = None\n        try:\n            cache_entry = self._ls_from_cache(path, recursive=recursive)\n        except FileNotFoundError:\n            # we patch files missing from an ls call in the cache entry below,\n            # so this should not be an error.\n            pass\n\n        if cache_entry is not None:\n            if not detail:\n                return [e[\"name\"] for e in cache_entry]\n            return cache_entry[:]\n\n    kwargs[\"prefix\"] = prefix\n\n    info = []\n    # stat infos are either the path only (`detail=False`) or a dict full of metadata\n    delimiter = \"\" if recursive else \"/\"\n    reference = lakefs.Reference(repository, ref, client=self.client)\n\n    with self.wrapped_api_call(rpath=path):\n        for obj in reference.objects(prefix=prefix, delimiter=delimiter):\n            if isinstance(obj, CommonPrefix):\n                # prefixes are added below.\n                info.append(\n                    {\n                        \"name\": f\"{repository}/{ref}/{obj.path}\",\n                        \"size\": 0,\n                        \"type\": \"directory\",\n                    }\n                )\n            elif isinstance(obj, ObjectInfo):\n                info.append(\n                    {\n                        \"checksum\": obj.checksum,\n                        \"content-type\": obj.content_type,\n                        \"mtime\": obj.mtime,\n                        \"name\": f\"{repository}/{ref}/{obj.path}\",\n                        \"size\": obj.size_bytes,\n                        \"type\": \"object\",\n                    }\n                )\n\n    # Retry the API call with appended slash if the current result\n    # is just a single directory entry only (not its contents).\n    # This is useful to allow `ls(\"repo/branch/dir\")` calls without a trailing slash.\n    if len(info) == 1 and info[0][\"type\"] == \"directory\" and info[0][\"name\"] == path + \"/\":\n        return self.ls(\n            path + \"/\",\n            detail=detail,\n            **kwargs | {\"refresh\": not use_dircache, \"recursive\": recursive},\n        )\n\n    if recursive:\n        # To make 
recursive ls behave identical to the non-recursive case,\n        # add back virtual `directory` entries, which are only returned by\n        # the lakeFS API when querying non-recursively.\n        here = self._strip_protocol(path).rstrip(\"/\")\n        subdirs = {parent for o in info if (parent := self._parent(o[\"name\"])) != here}\n        for subdir in subdirs:\n            info.append(\n                {\n                    \"name\": subdir + \"/\",\n                    \"size\": 0,\n                    \"type\": \"directory\",\n                }\n            )\n\n    if info:\n        self._update_dircache(info[:])\n\n    if not detail:\n        info = [o[\"name\"] for o in info]  # type: ignore\n\n    return info\n
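A listing sketch (paths are placeholders) showing the refresh and recursive keyword arguments described above:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nnames = fs.ls(\"repo/main/data/\", detail=False)\n# Bypass the directory listing cache and include nested objects.\nentries = fs.ls(\"repo/main/data/\", refresh=True, recursive=True)\n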
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.open","title":"open","text":"
open(\n    path: str | PathLike[str],\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n    pre_sign: bool = False,\n    content_type: str | None = None,\n    metadata: dict[str, str] | None = None,\n    autocommit: bool = True,\n    **kwargs: Any\n) -> LakeFSIOBase\n

Dispatch a lakeFS file-like object (a local buffer on disk) for the given remote path, used for uploads or downloads depending on mode.

PARAMETER DESCRIPTION path

The remote path for which to open a local LakeFSFile. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

mode

The file mode indicating its purpose. Use r/rb for downloads from lakeFS, w/wb/x/xb for uploads to lakeFS.

TYPE: Literal['r', 'rb', 'rt', 'w', 'wb', 'wt', 'x', 'xb', 'xt'] DEFAULT: 'rb'

pre_sign

Whether to use a pre-signed URL for the file up-/download.

TYPE: bool DEFAULT: False

content_type

Content type to use for the file, relevant for uploads only.

TYPE: str | None DEFAULT: None

metadata

Additional metadata to attach to the file, relevant for uploads only.

TYPE: dict[str, str] | None DEFAULT: None

autocommit

Whether to process the file immediately instead of queueing it for the transaction when inside a transaction context.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION LakeFSIOBase

A local file-like object ready to hold data to be received from / sent to a lakeFS server.

RAISES DESCRIPTION NotImplementedError

If mode is not supported.

Source code in src/lakefs_spec/spec.py
def open(\n    self,\n    path: str | os.PathLike[str],\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n    pre_sign: bool = False,\n    content_type: str | None = None,\n    metadata: dict[str, str] | None = None,\n    autocommit: bool = True,\n    **kwargs: Any,\n) -> LakeFSIOBase:\n    \"\"\"\n    Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on ``mode``.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path for which to open a local ``LakeFSFile``. Must be a fully qualified lakeFS URI.\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"]\n        The file mode indicating its purpose. Use ``r/rb`` for downloads from lakeFS, ``w/wb/x/xb`` for uploads to lakeFS.\n    pre_sign: bool\n        Whether to use a pre-signed URL for the file up-/download.\n    content_type: str | None\n        Content type to use for the file, relevant for uploads only.\n    metadata: dict[str, str] | None\n        Additional metadata to attach to the file, relevant for uploads only.\n    autocommit: bool\n        Whether to process the file immediately instead of queueing it for transaction while in a transaction context.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Returns\n    -------\n    LakeFSIOBase\n        A local file-like object ready to hold data to be received from / sent to a lakeFS server.\n\n    Raises\n    ------\n    NotImplementedError\n        If ``mode`` is not supported.\n    \"\"\"\n    if mode.endswith(\"t\"):\n        # text modes {r,w,x}t are equivalent to {r,w,x} here respectively.\n        mode = mode[:-1]  # type: ignore\n\n    if mode not in {\"r\", \"rb\", \"w\", \"wb\", \"x\", \"xb\"}:\n        raise NotImplementedError(f\"unsupported mode {mode!r}\")\n\n    path = stringify_path(path)\n    repo, ref, resource = parse(path)\n\n    if mode.startswith(\"r\"):\n        reference = lakefs.Reference(repo, ref, client=self.client)\n        obj = reference.object(resource)\n\n        if not obj.exists():\n            raise FileNotFoundError(path)\n        handler = ObjectReader(obj, mode=mode, pre_sign=pre_sign, client=self.client)\n    else:\n        # for writing ops, ref must be a branch\n        branch = lakefs.Branch(repo, ref, client=self.client)\n        if self.create_branch_ok:\n            branch.create(self.source_branch, exist_ok=True)\n\n        obj = branch.object(resource)\n        handler = ObjectWriter(\n            obj,\n            mode=mode,\n            pre_sign=pre_sign,\n            content_type=content_type,\n            metadata=metadata,\n            client=self.client,\n        )\n\n    ac = kwargs.pop(\"autocommit\", not self._intrans)\n    if not ac and \"r\" not in mode:\n        self._transaction.files.append(handler)\n\n    return handler\n
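A sketch of read and write modes (paths are placeholders; writing requires the ref to be a branch):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nwith fs.open(\"repo/main/data/train.csv\", \"rb\") as f:\n    head = f.read(1024)\n\nwith fs.open(\"repo/dev/data/head.csv\", \"wb\") as f:\n    f.write(head)\n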
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.put_file","title":"put_file","text":"
put_file(\n    lpath: str | PathLike[str],\n    rpath: str | PathLike[str],\n    callback: Callback = _DEFAULT_CALLBACK,\n    precheck: bool = True,\n    **kwargs: Any\n) -> None\n

Upload a local file to a remote location on a lakeFS server.

Note that depending on the block store type, additional configuration such as credentials may be required when use_blockstore=True and presign=False.

PARAMETER DESCRIPTION lpath

The local path on disk to upload to the lakeFS server.

TYPE: str | PathLike[str]

rpath

The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

callback

An fsspec callback to use during the operation. Can be used to report upload progress.

TYPE: Callback DEFAULT: _DEFAULT_CALLBACK

precheck

Check if lpath already exists locally and compare its checksum with that of rpath, skipping the upload if they match.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.open().

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
def put_file(\n    self,\n    lpath: str | os.PathLike[str],\n    rpath: str | os.PathLike[str],\n    callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n    precheck: bool = True,\n    **kwargs: Any,\n) -> None:\n    \"\"\"\n    Upload a local file to a remote location on a lakeFS server.\n\n    Note that depending on the block store type, additional configuration like credentials may need to be configured when ``use_blockstore=True`` and ``presign=False``.\n\n    Parameters\n    ----------\n    lpath: str | os.PathLike[str]\n        The local path on disk to upload to the lakeFS server.\n    rpath: str | os.PathLike[str]\n        The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.\n    callback: fsspec.callbacks.Callback\n        An fsspec callback to use during the operation. Can be used to report download progress.\n    precheck: bool\n        Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n    \"\"\"\n    lpath = stringify_path(lpath)\n    rpath = stringify_path(rpath)\n\n    if precheck and Path(lpath).is_file():\n        remote_checksum = self.checksum(rpath)\n        local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n        if local_checksum == remote_checksum:\n            logger.info(\n                f\"Skipping upload of resource {lpath!r} to remote path {rpath!r}: \"\n                f\"Resource {rpath!r} exists and checksums match.\"\n            )\n            return\n\n    with self.wrapped_api_call(rpath=rpath):\n        super().put_file(lpath, rpath, callback=callback, **kwargs)\n
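An upload sketch (paths are placeholders); with precheck=True the upload is skipped when the remote object already exists with a matching checksum:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nfs.put_file(\"train.csv\", \"repo/main/data/train.csv\", precheck=True)\n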
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.rm_file","title":"rm_file","text":"
rm_file(path: str | PathLike[str]) -> None\n

Stage a remote file for removal on a lakeFS server.

The file will not actually be removed from the requested branch until a commit is created.

PARAMETER DESCRIPTION path

The remote file to delete. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

Source code in src/lakefs_spec/spec.py
def rm_file(self, path: str | os.PathLike[str]) -> None:  # pragma: no cover\n    \"\"\"\n    Stage a remote file for removal on a lakeFS server.\n\n    The file will not actually be removed from the requested branch until a commit is created.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote file to delete. Must be a fully qualified lakeFS URI.\n    \"\"\"\n    self.rm(path)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.rm","title":"rm","text":"
rm(path: str | PathLike[str], recursive: bool = False, maxdepth: int | None = None) -> None\n

Stage multiple remote files for removal on a lakeFS server.

The files will not actually be removed from the requested branch until a commit is created.

PARAMETER DESCRIPTION path

File(s) to delete.

TYPE: str | PathLike[str]

recursive

If file(s) include nested directories, recursively delete their contents.

TYPE: bool DEFAULT: False

maxdepth

Depth to pass to walk for finding files to delete, if recursive. If None, there will be no limit and infinite recursion may be possible.

TYPE: int | None DEFAULT: None

Source code in src/lakefs_spec/spec.py
def rm(\n    self, path: str | os.PathLike[str], recursive: bool = False, maxdepth: int | None = None\n) -> None:\n    \"\"\"\n    Stage multiple remote files for removal on a lakeFS server.\n\n    The files will not actually be removed from the requested branch until a commit is created.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        File(s) to delete.\n    recursive: bool\n        If file(s) include nested directories, recursively delete their contents.\n    maxdepth: int | None\n        Depth to pass to walk for finding files to delete, if recursive.\n        If None, there will be no limit and infinite recursion may be\n        possible.\n    \"\"\"\n\n    path = stringify_path(path)\n    repository, ref, prefix = parse(path)\n\n    with self.wrapped_api_call(rpath=path):\n        branch = lakefs.Branch(repository, ref, client=self.client)\n        objgen = branch.objects(prefix=prefix, delimiter=\"\" if recursive else \"/\")\n        if maxdepth is None:\n            branch.delete_objects(obj.path for obj in objgen)\n        else:\n            # nesting level is just the amount of \"/\"s in the path, no leading \"/\".\n            branch.delete_objects(obj.path for obj in objgen if obj.path.count(\"/\") <= maxdepth)\n\n        # Directory listing cache for the containing folder must be invalidated\n        self.dircache.pop(self._parent(path), None)\n
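A removal sketch (the prefix is a placeholder); the deletions only become permanent with a subsequent commit on the branch:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n# Stage every object under the prefix for removal.\nfs.rm(\"repo/main/data/\", recursive=True)\n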
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.touch","title":"touch","text":"
touch(path: str | PathLike[str], truncate: bool = True, **kwargs: Any) -> None\n

Create an empty file or update an existing file on a lakeFS server.

PARAMETER DESCRIPTION path

The file path to create or update. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

truncate

Whether to set the file size to 0 (zero) bytes, even if the path already exists.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.open().

TYPE: Any DEFAULT: {}

RAISES DESCRIPTION NotImplementedError

If the targeted lakeFS server version does not support touch() operations.

Source code in src/lakefs_spec/spec.py
def touch(self, path: str | os.PathLike[str], truncate: bool = True, **kwargs: Any) -> None:\n    \"\"\"\n    Create an empty file or update an existing file on a lakeFS server.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The file path to create or update. Must be a fully qualified lakeFS URI.\n    truncate: bool\n        Whether to set the file size to 0 (zero) bytes, even if the path already exists.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n\n    Raises\n    ------\n    NotImplementedError\n        If the targeted lakeFS server version does not support `touch()` operations.\n    \"\"\"\n\n    # empty buffer upload errors were fixed in https://github.com/treeverse/lakeFS/issues/7130,\n    # which was first released in lakeFS v1.3.1.\n    if self._lakefs_server_version < (1, 3, 1):\n        version_string = \".\".join(str(v) for v in self._lakefs_server_version)\n        raise NotImplementedError(\n            \"LakeFSFileSystem.touch() is not supported for your lakeFS server version. \"\n            f\"minimum required version: '1.3.1', actual version: {version_string!r}\"\n        )\n\n    super().touch(path=path, truncate=truncate, **kwargs)\n
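A sketch of creating an empty placeholder object (the path is a placeholder; requires a lakeFS server >= 1.3.1 as noted above):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nfs.touch(\"repo/main/data/.keep\")\n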
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.tail","title":"tail","text":"
tail(path: str | PathLike[str], size: int = 1024) -> bytes\n

Get the last size bytes from a remote file.

PARAMETER DESCRIPTION path

The file path to read. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

size

The number of bytes to read from the end of the file.

TYPE: int DEFAULT: 1024

RETURNS DESCRIPTION bytes

The bytes at the end of the requested file.

Source code in src/lakefs_spec/spec.py
def tail(self, path: str | os.PathLike[str], size: int = 1024) -> bytes:\n    \"\"\"\n    Get the last ``size`` bytes from a remote file.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The file path to read. Must be a fully qualified lakeFS URI.\n    size: int\n        The number of bytes to read from the end of the file.\n\n    Returns\n    -------\n    bytes\n        The bytes at the end of the requested file.\n    \"\"\"\n    f: ObjectReader\n    with self.open(path, \"rb\") as f:\n        f.seek(max(-size, -f._obj.stat().size_bytes), 2)\n        return f.read()\n
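A sketch of reading the trailing bytes of an object (the path is a placeholder):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nlast_kb = fs.tail(\"repo/main/logs/run.log\", size=1024)\n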
"},{"location":"reference/lakefs_spec/transaction/","title":"transaction","text":"

Functionality for extended lakeFS transactions to conduct versioning operations between file uploads.

"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction","title":"LakeFSTransaction","text":"

Bases: Transaction

A lakeFS transaction model capable of versioning operations in between file uploads.

PARAMETER DESCRIPTION fs

The lakeFS file system associated with the transaction.

TYPE: 'LakeFSFileSystem'

Source code in src/lakefs_spec/transaction.py
class LakeFSTransaction(Transaction):\n    \"\"\"\n    A lakeFS transaction model capable of versioning operations in between file uploads.\n\n    Parameters\n    ----------\n    fs: LakeFSFileSystem\n        The lakeFS file system associated with the transaction.\n    \"\"\"\n\n    def __init__(\n        self,\n        fs: \"LakeFSFileSystem\",\n    ):\n        super().__init__(fs=fs)\n        self.fs: \"LakeFSFileSystem\"\n        self.files: deque[ObjectWriter] = deque(self.files)\n\n        self.repository: str | None = None\n        self.base_branch: Branch | None = None\n        self.automerge: bool = False\n        self.delete: Literal[\"onsuccess\", \"always\", \"never\"] = \"onsuccess\"\n        self._ephemeral_branch: Branch | None = None\n\n    def __call__(\n        self,\n        repository: str | Repository,\n        base_branch: str | Branch = \"main\",\n        branch_name: str | None = None,\n        automerge: bool = True,\n        delete: Literal[\"onsuccess\", \"always\", \"never\"] = \"onsuccess\",\n    ) -> \"LakeFSTransaction\":\n        \"\"\"\n        Creates an ephemeral branch, conducts all uploads and operations on that branch,\n        and optionally merges it back into the source branch.\n\n        repository: str | Repository\n            The repository in which to conduct the transaction.\n        base_branch: str | Branch\n            The branch on which the transaction operations should be based.\n        automerge: bool\n            Automatically merge the ephemeral branch into the base branch after successful\n            transaction completion.\n        delete: Literal[\"onsuccess\", \"always\", \"never\"]\n            Cleanup policy / deletion handling for the ephemeral branch after the transaction.\n\n            If ``\"onsuccess\"``, the branch is deleted if the transaction succeeded,\n            or left over if an error occurred.\n\n            If ``\"always\"``, the ephemeral branch is always deleted after transaction regardless of success\n            or failure.\n\n            If ``\"never\"``, the transaction branch is always left in the repository.\n        \"\"\"\n\n        if isinstance(repository, str):\n            self.repository = repository\n        else:\n            self.repository = repository.id\n\n        repo = lakefs.Repository(self.repository, client=self.fs.client)\n        try:\n            _ = repo.metadata\n        except ServerException:\n            raise ValueError(f\"repository {self.repository!r} does not exist\") from None\n\n        # base branch needs to be a lakefs.Branch, since it is being diffed\n        # with the ephemeral branch in __exit__.\n        self.base_branch = _ensurebranch(base_branch, self.repository, self.fs.client)\n\n        self.automerge = automerge\n        self.delete = delete\n\n        ephem_name = branch_name or \"transaction-\" + \"\".join(random.choices(string.digits, k=6))  # nosec: B311\n        self._ephemeral_branch = Branch(self.repository, ephem_name, client=self.fs.client)\n        return self\n\n    def __enter__(self):\n        logger.debug(\n            f\"Creating ephemeral branch {self._ephemeral_branch.id!r} \"\n            f\"from branch {self.base_branch.id!r}.\"\n        )\n        self._ephemeral_branch.create(self.base_branch, exist_ok=False)\n        self.fs._intrans = True\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        success = exc_type is None\n        while self.files:\n            # fsspec base class calls `append` on the file, 
which means we\n            # have to pop from the left to preserve order.\n            f = self.files.popleft()\n            if not success:\n                f.discard()\n\n        self.fs._intrans = False\n        self.fs._transaction = None\n\n        if any(self._ephemeral_branch.uncommitted()):\n            msg = f\"Finished transaction on branch {self._ephemeral_branch.id!r} with uncommitted changes.\"\n            if self.delete != \"never\":\n                msg += \" Objects added but not committed are lost.\"\n            warnings.warn(msg)\n\n        if success and self.automerge:\n            if any(self.base_branch.diff(self._ephemeral_branch)):\n                self._ephemeral_branch.merge_into(self.base_branch)\n        if self.delete == \"always\" or (success and self.delete == \"onsuccess\"):\n            self._ephemeral_branch.delete()\n\n    @property\n    def branch(self):\n        return self._ephemeral_branch\n\n    def commit(self, message: str, metadata: dict[str, str] | None = None) -> Reference:\n        \"\"\"\n        Create a commit on this transaction's ephemeral branch with a commit message\n        and attached metadata.\n\n        Parameters\n        ----------\n        message: str\n            The commit message to attach to the newly created commit.\n        metadata: dict[str, str] | None\n            Optional metadata to enrich the created commit with (author, e-mail, ...).\n\n        Returns\n        -------\n        Reference\n            The created commit.\n        \"\"\"\n\n        diff = list(self.branch.uncommitted())\n\n        if not diff:\n            logger.warning(f\"No changes to commit on branch {self.branch.id!r}.\")\n            return self.branch.head\n\n        return self.branch.commit(message, metadata=metadata)\n\n    def merge(self, source_ref: str | Branch, into: str | Branch) -> Commit:\n        \"\"\"\n        Merge a branch into another branch in a repository.\n\n        In case the branch contains no changes relevant to the target branch,\n        no merge happens, and the tip of the target branch is returned instead.\n\n        Parameters\n        ----------\n        source_ref: str | Branch\n            Source reference containing the changes to merge.\n            Can be a branch name or partial commit SHA.\n        into: str | Branch\n            Target branch into which the changes will be merged.\n\n        Returns\n        -------\n        Commit\n            Either the created merge commit, or the head commit of the target branch.\n        \"\"\"\n        source = _ensurebranch(source_ref, self.repository, self.fs.client)\n        dest = _ensurebranch(into, self.repository, self.fs.client)\n\n        if any(dest.diff(source)):\n            source.merge_into(dest)\n        return dest.head.get_commit()\n\n    def revert(self, branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit:\n        \"\"\"\n        Revert a previous commit on a branch.\n\n        Parameters\n        ----------\n        branch: str | Branch\n            Branch on which the commit should be reverted.\n        ref: ReferenceType\n            The reference to revert.\n        parent_number: int\n            If there are multiple parents to a commit, specify to which parent\n            the commit should be reverted. 
``parent_number = 1`` (the default)\n            refers to the first parent commit of the current ``branch`` tip.\n\n        Returns\n        -------\n        Commit\n            The created revert commit.\n        \"\"\"\n\n        b = _ensurebranch(branch, self.repository, self.fs.client)\n\n        ref_id = ref if isinstance(ref, str) else ref.id\n        b.revert(ref_id, parent_number=parent_number)\n        return b.head.get_commit()\n\n    def rev_parse(self, ref: ReferenceType) -> Commit:\n        \"\"\"\n        Parse a given lakeFS reference expression and obtain its corresponding commit.\n\n        Parameters\n        ----------\n        ref: ReferenceType\n            Reference object to resolve, can be a branch, commit SHA, or tag.\n\n        Returns\n        -------\n        Commit\n            The commit referenced by the expression ``ref``.\n        \"\"\"\n\n        ref_id = ref.id if isinstance(ref, Reference) else ref\n        reference = lakefs.Reference(self.repository, ref_id, client=self.fs.client)\n        return reference.get_commit()\n\n    def tag(self, ref: ReferenceType, name: str) -> Tag:\n        \"\"\"\n        Create a tag referencing a commit in a repository.\n\n        Parameters\n        ----------\n        ref: ReferenceType\n            Commit SHA or placeholder for a reference or commit object\n            to which the new tag will point.\n        name: str\n            Name of the tag to be created.\n\n        Returns\n        -------\n        Tag\n            The requested tag.\n        \"\"\"\n\n        return lakefs.Tag(self.repository, name, client=self.fs.client).create(ref)\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.commit","title":"commit","text":"
commit(message: str, metadata: dict[str, str] | None = None) -> Reference\n

Create a commit on this transaction's ephemeral branch with a commit message and attached metadata.

PARAMETER DESCRIPTION message

The commit message to attach to the newly created commit.

TYPE: str

metadata

Optional metadata to enrich the created commit with (author, e-mail, ...).

TYPE: dict[str, str] | None DEFAULT: None

RETURNS DESCRIPTION Reference

The created commit.

Source code in src/lakefs_spec/transaction.py
def commit(self, message: str, metadata: dict[str, str] | None = None) -> Reference:\n    \"\"\"\n    Create a commit on this transaction's ephemeral branch with a commit message\n    and attached metadata.\n\n    Parameters\n    ----------\n    message: str\n        The commit message to attach to the newly created commit.\n    metadata: dict[str, str] | None\n        Optional metadata to enrich the created commit with (author, e-mail, ...).\n\n    Returns\n    -------\n    Reference\n        The created commit.\n    \"\"\"\n\n    diff = list(self.branch.uncommitted())\n\n    if not diff:\n        logger.warning(f\"No changes to commit on branch {self.branch.id!r}.\")\n        return self.branch.head\n\n    return self.branch.commit(message, metadata=metadata)\n
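A minimal usage sketch (repository, branch, and file names here are placeholders, and fs is assumed to be an existing LakeFSFileSystem instance):

with fs.transaction(\"my-repo\", \"main\") as tx:\n    fs.put(\"train.csv\", f\"my-repo/{tx.branch.id}/train.csv\")\n    tx.commit(message=\"Add training data\")\n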
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.merge","title":"merge","text":"
merge(source_ref: str | Branch, into: str | Branch) -> Commit\n

Merge a branch into another branch in a repository.

In case the branch contains no changes relevant to the target branch, no merge happens, and the tip of the target branch is returned instead.

PARAMETER DESCRIPTION source_ref

Source reference containing the changes to merge. Can be a branch name or partial commit SHA.

TYPE: str | Branch

into

Target branch into which the changes will be merged.

TYPE: str | Branch

RETURNS DESCRIPTION Commit

Either the created merge commit, or the head commit of the target branch.

Source code in src/lakefs_spec/transaction.py
def merge(self, source_ref: str | Branch, into: str | Branch) -> Commit:\n    \"\"\"\n    Merge a branch into another branch in a repository.\n\n    In case the branch contains no changes relevant to the target branch,\n    no merge happens, and the tip of the target branch is returned instead.\n\n    Parameters\n    ----------\n    source_ref: str | Branch\n        Source reference containing the changes to merge.\n        Can be a branch name or partial commit SHA.\n    into: str | Branch\n        Target branch into which the changes will be merged.\n\n    Returns\n    -------\n    Commit\n        Either the created merge commit, or the head commit of the target branch.\n    \"\"\"\n    source = _ensurebranch(source_ref, self.repository, self.fs.client)\n    dest = _ensurebranch(into, self.repository, self.fs.client)\n\n    if any(dest.diff(source)):\n        source.merge_into(dest)\n    return dest.head.get_commit()\n
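A usage sketch with automerge disabled, so the merge back into the base branch is triggered explicitly (repository and file names are placeholders):

with fs.transaction(\"my-repo\", \"main\", automerge=False) as tx:\n    fs.put(\"train.csv\", f\"my-repo/{tx.branch.id}/train.csv\")\n    tx.commit(message=\"Add training data\")\n    tx.merge(tx.branch, into=\"main\")\n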
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.revert","title":"revert","text":"
revert(branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit\n

Revert a previous commit on a branch.

PARAMETER DESCRIPTION branch

Branch on which the commit should be reverted.

TYPE: str | Branch

ref

The reference to revert.

TYPE: ReferenceType

parent_number

If there are multiple parents to a commit, specify to which parent the commit should be reverted. parent_number = 1 (the default) refers to the first parent commit of the current branch tip.

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION Commit

The created revert commit.

Source code in src/lakefs_spec/transaction.py
def revert(self, branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit:\n    \"\"\"\n    Revert a previous commit on a branch.\n\n    Parameters\n    ----------\n    branch: str | Branch\n        Branch on which the commit should be reverted.\n    ref: ReferenceType\n        The reference to revert.\n    parent_number: int\n        If there are multiple parents to a commit, specify to which parent\n        the commit should be reverted. ``parent_number = 1`` (the default)\n        refers to the first parent commit of the current ``branch`` tip.\n\n    Returns\n    -------\n    Commit\n        The created revert commit.\n    \"\"\"\n\n    b = _ensurebranch(branch, self.repository, self.fs.client)\n\n    ref_id = ref if isinstance(ref, str) else ref.id\n    b.revert(ref_id, parent_number=parent_number)\n    return b.head.get_commit()\n
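A usage sketch that undoes the commit it just created on the transaction branch (repository and file names are placeholders):

with fs.transaction(\"my-repo\", \"main\") as tx:\n    fs.rm(f\"my-repo/{tx.branch.id}/obsolete.csv\")\n    tx.commit(message=\"Remove obsolete file\")\n    tx.revert(tx.branch, ref=tx.branch.head)  # roll the removal back again\n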
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.rev_parse","title":"rev_parse","text":"
rev_parse(ref: ReferenceType) -> Commit\n

Parse a given lakeFS reference expression and obtain its corresponding commit.

PARAMETER DESCRIPTION ref

Reference object to resolve, can be a branch, commit SHA, or tag.

TYPE: ReferenceType

RETURNS DESCRIPTION Commit

The commit referenced by the expression ref.

Source code in src/lakefs_spec/transaction.py
def rev_parse(self, ref: ReferenceType) -> Commit:\n    \"\"\"\n    Parse a given lakeFS reference expression and obtain its corresponding commit.\n\n    Parameters\n    ----------\n    ref: ReferenceType\n        Reference object to resolve, can be a branch, commit SHA, or tag.\n\n    Returns\n    -------\n    Commit\n        The commit referenced by the expression ``ref``.\n    \"\"\"\n\n    ref_id = ref.id if isinstance(ref, Reference) else ref\n    reference = lakefs.Reference(self.repository, ref_id, client=self.fs.client)\n    return reference.get_commit()\n
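A usage sketch resolving a ref expression (the tilde suffix selects a parent commit, analogous to git; names are placeholders):

with fs.transaction(\"my-repo\", \"main\") as tx:\n    commit = tx.rev_parse(\"main~1\")  # parent of the current main tip\n    print(commit.id, commit.message)\n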
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.tag","title":"tag","text":"
tag(ref: ReferenceType, name: str) -> Tag\n

Create a tag referencing a commit in a repository.

PARAMETER DESCRIPTION ref

Commit SHA or placeholder for a reference or commit object to which the new tag will point.

TYPE: ReferenceType

name

Name of the tag to be created.

TYPE: str

RETURNS DESCRIPTION Tag

The requested tag.

Source code in src/lakefs_spec/transaction.py
def tag(self, ref: ReferenceType, name: str) -> Tag:\n    \"\"\"\n    Create a tag referencing a commit in a repository.\n\n    Parameters\n    ----------\n    ref: ReferenceType\n        Commit SHA or placeholder for a reference or commit object\n        to which the new tag will point.\n    name: str\n        Name of the tag to be created.\n\n    Returns\n    -------\n    Tag\n        The requested tag.\n    \"\"\"\n\n    return lakefs.Tag(self.repository, name, client=self.fs.client).create(ref)\n
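A usage sketch tagging the current head commit of the base branch (repository and tag names are placeholders):

with fs.transaction(\"my-repo\", \"main\") as tx:\n    commit = tx.rev_parse(\"main\")\n    tag = tx.tag(commit.id, name=\"v1.0.0\")\n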
"},{"location":"reference/lakefs_spec/util/","title":"util","text":"

Useful utilities for handling lakeFS URIs and results of lakeFS API calls.

"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.depaginate","title":"depaginate","text":"
depaginate(\n    api: Callable[..., PaginatedApiResponse], *args: Any, **kwargs: Any\n) -> Generator[Any, None, None]\n

Unwrap the responses from a paginated lakeFS API method into a generator.

PARAMETER DESCRIPTION api

The lakeFS client API to call. Must return a paginated response with the pagination and results fields set.

TYPE: Callable[..., PaginatedApiResponse]

*args

Positional arguments to pass to the API call.

TYPE: Any DEFAULT: ()

**kwargs

Keyword arguments to pass to the API call.

TYPE: Any DEFAULT: {}

YIELDS DESCRIPTION Any

The obtained API result objects.

Source code in src/lakefs_spec/util.py
def depaginate(\n    api: Callable[..., PaginatedApiResponse], *args: Any, **kwargs: Any\n) -> Generator[Any, None, None]:\n    \"\"\"\n    Unwrap the responses from a paginated lakeFS API method into a generator.\n\n    Parameters\n    ----------\n    api: Callable[..., PaginatedApiResponse]\n        The lakeFS client API to call. Must return a paginated response with the ``pagination`` and ``results`` fields set.\n    *args: Any\n        Positional arguments to pass to the API call.\n    **kwargs: Any\n        Keyword arguments to pass to the API call.\n\n    Yields\n    ------\n    Any\n        The obtained API result objects.\n    \"\"\"\n    while True:\n        resp = api(*args, **kwargs)\n        yield from resp.results\n        if not resp.pagination.has_more:\n            break\n        kwargs[\"after\"] = resp.pagination.next_offset\n
"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.md5_checksum","title":"md5_checksum","text":"
md5_checksum(lpath: str | PathLike[str], blocksize: int = 2 ** 22) -> str\n

Calculate a local file's MD5 hash.

PARAMETER DESCRIPTION lpath

The local path whose MD5 hash to calculate. Must be a file.

TYPE: str | PathLike[str]

blocksize

Block size (in bytes) to use while reading in the file.

TYPE: int DEFAULT: 2 ** 22

RETURNS DESCRIPTION str

The file's MD5 hash value, as a string.

Source code in src/lakefs_spec/util.py
def md5_checksum(lpath: str | os.PathLike[str], blocksize: int = 2**22) -> str:\n    \"\"\"\n    Calculate a local file's MD5 hash.\n\n    Parameters\n    ----------\n    lpath: str | os.PathLike[str]\n        The local path whose MD5 hash to calculate. Must be a file.\n    blocksize: int\n        Block size (in bytes) to use while reading in the file.\n\n    Returns\n    -------\n    str\n        The file's MD5 hash value, as a string.\n    \"\"\"\n    with open(lpath, \"rb\") as f:\n        file_hash = hashlib.md5(usedforsecurity=False)\n        chunk = f.read(blocksize)\n        while chunk:\n            file_hash.update(chunk)\n            chunk = f.read(blocksize)\n    return file_hash.hexdigest()\n
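A usage sketch (the file path is a placeholder):

from lakefs_spec.util import md5_checksum\n\nchecksum = md5_checksum(\"train.csv\", blocksize=2**22)\nprint(checksum)\n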
"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.parse","title":"parse","text":"
parse(path: str) -> tuple[str, str, str]\n

Parses a lakeFS URI in the form lakefs://<repo>/<ref>/<resource>.

PARAMETER DESCRIPTION path

String path, needs to conform to the lakeFS URI format described above. The <resource> part can be the empty string; the leading lakefs:// scheme may be omitted.

TYPE: str

RETURNS DESCRIPTION tuple[str, str, str]

A 3-tuple of repository name, reference, and resource name.

RAISES DESCRIPTION ValueError

If the path does not conform to the lakeFS URI format.

Source code in src/lakefs_spec/util.py
def parse(path: str) -> tuple[str, str, str]:\n    \"\"\"\n    Parses a lakeFS URI in the form ``lakefs://<repo>/<ref>/<resource>``.\n\n    Parameters\n    ----------\n    path: str\n        String path, needs to conform to the lakeFS URI format described above.\n        The ``<resource>`` part can be the empty string; the leading ``lakefs://`` scheme may be omitted.\n\n    Returns\n    -------\n    tuple[str, str, str]\n        A 3-tuple of repository name, reference, and resource name.\n\n    Raises\n    ------\n    ValueError\n        If the path does not conform to the lakeFS URI format.\n    \"\"\"\n\n    # First regex reflects the lakeFS repository naming rules:\n    # only lowercase letters, digits and dash, no leading dash, minimum 3, maximum 63 characters\n    # https://docs.lakefs.io/understand/model.html#repository\n    # Second regex is the branch: Only letters, digits, underscores and dash, no leading dash.\n    path_regex = re.compile(r\"(?:lakefs://)?([a-z0-9][a-z0-9\\-]{2,62})/(\\w[\\w\\-]*)/(.*)\")\n    results = path_regex.fullmatch(path)\n    if results is None:\n        raise ValueError(\n            f\"expected path with structure lakefs://<repo>/<ref>/<resource>, got {path!r}\"\n        )\n\n    repo, ref, resource = results.groups()\n    return repo, ref, resource\n
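A usage sketch:

from lakefs_spec.util import parse\n\nrepo, ref, resource = parse(\"lakefs://my-repo/main/data/weather.csv\")\n# repo == \"my-repo\", ref == \"main\", resource == \"data/weather.csv\"\n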
"},{"location":"tutorials/","title":"Tutorials","text":"

Info

We aim to provide additional tutorials in the future - contributions are welcome!

  • Quickstart example: Using lakeFS-spec as a file system
  • A fully-worked data science example: Using lakeFS-spec together with Pandas to train a classifier based on a public dataset and simulate additional data being collected
"},{"location":"tutorials/demo_data_science_project/","title":"Data Science with lakeFS-spec","text":"
%pip install numpy pandas scikit-learn\n
\nCollecting numpy\n\n
\n  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n\n
\nCollecting pandas\n  Downloading pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)\n\n
\nCollecting scikit-learn\n  Downloading scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n\n
\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from pandas) (2.8.2)\nRequirement already satisfied: pytz>=2020.1 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from pandas) (2024.1)\nCollecting tzdata>=2022.7 (from pandas)\n  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)\n\n
\nCollecting scipy>=1.6.0 (from scikit-learn)\n  Downloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n\n
\nCollecting joblib>=1.2.0 (from scikit-learn)\n  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\n\n
\nCollecting threadpoolctl>=2.0.0 (from scikit-learn)\n  Downloading threadpoolctl-3.3.0-py3-none-any.whl.metadata (13 kB)\nRequirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n\n
\nDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)\n\n
\nDownloading pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)\n\n
\nDownloading scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\nDownloading joblib-1.3.2-py3-none-any.whl (302 kB)\nDownloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)\nDownloading threadpoolctl-3.3.0-py3-none-any.whl (17 kB)\n\n
\nDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)\n\n
\nInstalling collected packages: tzdata, threadpoolctl, numpy, joblib, scipy, pandas, scikit-learn\n\n
\nSuccessfully installed joblib-1.3.2 numpy-1.26.4 pandas-2.2.0 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0 tzdata-2024.1\n\n
\nNote: you may need to restart the kernel to use updated packages.\n\n

Also install an appropriate lakeFS-spec version, which can be either the latest release from PyPI via pip install --upgrade lakefs-spec, or the development version from GitHub via pip install git+https://github.com/aai-institute/lakefs-spec.git.

import os\nimport tempfile\nimport urllib.request\nfrom pathlib import Path\n\nurllib.request.urlretrieve(\n    \"https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/docs/tutorials/.lakectl.yaml\",\n    os.path.expanduser(\"~/.lakectl.yaml\"),\n)\n
\n('/home/runner/.lakectl.yaml', <http.client.HTTPMessage at 0x7fd726d97590>)\n

We can now instantiate the LakeFSFileSystem with the credentials we just downloaded. Alternatively, we could have passed the credentials directly in the code. It is important that the credentials are available at the time of filesystem instantiation.
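For illustration, passing the credentials explicitly might look like the following sketch; host, username, and password are the constructor arguments assumed here, and the values are the lakeFS quickstart defaults rather than real secrets:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem(\n    host=\"http://127.0.0.1:8000\",\n    username=\"AKIAIOSFOLQUICKSTART\",\n    password=\"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\",\n)\n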

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nREPO_NAME = \"weather\"\n

We will create a repository through the lakefs SDK, using the client attached to the lakeFS file system. If you have already created one in the UI, make sure to set the REPO_NAME variable accordingly in the cell directly above.

import lakefs\n\nrepo = lakefs.Repository(REPO_NAME, fs.client).create(storage_namespace=f\"local://{REPO_NAME}\")\n
def _maybe_urlretrieve(url: str, filename: str) -&gt; str:\n    # Avoid API rate limit errors by downloading to a fixed local location\n    destination = Path(tempfile.gettempdir()) / \"lakefs-spec-tutorials\" / filename\n    destination.parent.mkdir(exist_ok=True, parents=True)\n    if destination.exists():\n        return str(destination)\n\n    outfile, _ = urllib.request.urlretrieve(url, str(destination))\n    return outfile\n\n\noutfile = _maybe_urlretrieve(\n    \"https://archive-api.open-meteo.com/v1/archive?latitude=52.52&amp;longitude=13.41&amp;start_date=2010-01-01&amp;end_date=2010-12-31&amp;hourly=temperature_2m,relativehumidity_2m,rain,pressure_msl,surface_pressure,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m\",\n    \"weather-2010.json\",\n)\n

The data is in JSON format. Therefore, we need to wrangle the data a bit to make it usable. But first, we will upload it to our lakeFS instance.

NEW_BRANCH = lakefs.Branch(REPO_NAME, \"transform-raw-data\", client=fs.client)\nNEW_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, NEW_BRANCH) as tx:\n    fs.put(outfile, f\"{REPO_NAME}/{tx.branch.id}/weather-2010.json\")\n    tx.commit(message=\"Add 2010 weather data\")\n

You can inspect this commit by selecting the transform-raw-data branch, and navigating to the Commits tab.
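If you prefer to stay in code, a sketch of checking the newest commit on the branch could use the lakefs SDK's log method (assuming it is available on the Branch object with a max_amount argument):

for commit in NEW_BRANCH.log(max_amount=1):\n    print(commit.message)  # \"Add 2010 weather data\"\n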

import json\n\nimport pandas as pd\n\n\ndef transform_json_weather_data(filepath):\n    if hasattr(filepath, \"close\") and hasattr(filepath, \"tell\"):\n        data = json.load(filepath)\n    else:\n        with open(filepath, \"r\") as f:\n            data = json.load(f)\n\n    df = pd.DataFrame.from_dict(data[\"hourly\"])\n    df.time = pd.to_datetime(df.time)\n    df[\"is_raining\"] = df.rain &gt; 0\n    df[\"is_raining_in_1_day\"] = df.is_raining.shift(24).astype(bool)\n    df = df.dropna()\n    return df\n\n\ndf = transform_json_weather_data(outfile)\ndf.head(5)\n
\n/tmp/ipykernel_2291/2823322696.py:3: DeprecationWarning: \nPyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\nbut was not found to be installed on your system.\nIf this would cause problems for you,\nplease provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n\n  import pandas as pd\n\n
time temperature_2m relativehumidity_2m rain pressure_msl surface_pressure cloudcover cloudcover_low cloudcover_mid cloudcover_high windspeed_10m windspeed_100m winddirection_10m winddirection_100m is_raining is_raining_in_1_day 0 2010-01-01 00:00:00 -2.6 88 0.0 996.9 992.1 100 100 97 75 16.0 27.2 54 58 False True 1 2010-01-01 01:00:00 -2.7 88 0.0 996.4 991.6 100 99 96 49 16.3 28.0 55 58 False True 2 2010-01-01 02:00:00 -2.7 88 0.0 996.2 991.4 100 96 94 60 16.3 27.5 55 58 False True 3 2010-01-01 03:00:00 -2.7 88 0.0 996.1 991.3 100 97 96 83 15.4 26.6 53 57 False True 4 2010-01-01 04:00:00 -2.7 88 0.0 996.0 991.2 100 92 98 82 14.8 25.6 47 52 False True

Next, we save this data as a CSV file to the main branch. When the transaction commit helper is called, the newly uploaded CSV file is committed. You can verify that the upload worked in the lakeFS UI in your browser by switching to the Commits tab of the main branch.

with fs.transaction(REPO_NAME, \"main\") as tx:\n    df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/weather_2010.csv\")\n    tx.commit(message=\"Update weather data\")\n
import sklearn.model_selection\n\nmodel_data = df.drop(\"time\", axis=1)\n\ntrain, test = sklearn.model_selection.train_test_split(model_data, random_state=7)\n

We save these train and test datasets to a new training branch. If the branch does not exist yet, as in this case, it is implicitly created by default. You can control this behaviour with the create_branch_ok flag when initializing the LakeFSFileSystem. Since create_branch_ok defaults to True, instantiating fs = LakeFSFileSystem() is all that is needed to enable implicit branch creation.
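As a sketch of the implicit behaviour with a plain write outside of a transaction (the branch name is a placeholder, and the source branch for implicitly created branches is assumed to default to main):

fs = LakeFSFileSystem(create_branch_ok=True)  # True is already the default\n# Writing to a branch that does not exist yet creates it on the fly:\ntrain.to_csv(f\"lakefs://{REPO_NAME}/scratch/train_weather.csv\")\n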

TRAINING_BRANCH = lakefs.Branch(REPO_NAME, \"training\", client=fs.client)\nTRAINING_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, TRAINING_BRANCH) as tx:\n    train.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/train_weather.csv\")\n    test.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/test_weather.csv\")\n    tx.commit(message=\"Add train-test split of 2010 weather data\")\n

Let's check the shape of train and test data. Later on, we will get back to this data version and reproduce the results of the experiment.

print(f\"Initial train data shape: {train.shape}\")\nprint(f\"Initial test data shape: {test.shape}\")\n
\nInitial train data shape: (6570, 15)\nInitial test data shape: (2190, 15)\n\n

We now proceed to train a decision tree classifier and evaluate it on the test set:

from sklearn.tree import DecisionTreeClassifier\n\ndependent_variable = \"is_raining_in_1_day\"\n\nmodel = DecisionTreeClassifier(random_state=7)\n\nx_train, y_train = train.drop(dependent_variable, axis=1), train[dependent_variable].astype(bool)\nx_test, y_test = test.drop(dependent_variable, axis=1), test[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 87.31%\n\n
outfile = _maybe_urlretrieve(\n    \"https://archive-api.open-meteo.com/v1/archive?latitude=52.52&amp;longitude=13.41&amp;start_date=2020-01-01&amp;end_date=2020-12-31&amp;hourly=temperature_2m,relativehumidity_2m,rain,pressure_msl,surface_pressure,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m\",\n    \"weather-2020.json\",\n)\n\nnew_data = transform_json_weather_data(outfile)\n\nwith fs.transaction(REPO_NAME, \"main\") as tx:\n    new_data.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/weather_2020.csv\")\n    tx.commit(message=\"Add 2020 weather data\")\n\n# Remove leftover temporary files from previous `urlretrieve` calls\nurllib.request.urlcleanup()\n

Let's concatenate the old data and the new data, create a new train-test split, and push the updated files to lakeFS:

new_data = new_data.drop(\"time\", axis=1)\nfull_data = pd.concat([new_data, train, test])\n\ntrain_df, test_df = sklearn.model_selection.train_test_split(full_data, random_state=7)\n\nprint(f\"Updated train data shape: {train_df.shape}\")\nprint(f\"Updated test data shape: {test_df.shape}\")\n\nwith fs.transaction(REPO_NAME, TRAINING_BRANCH) as tx:\n    train_df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/train_weather.csv\")\n    test_df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/test_weather.csv\")\n    tx.commit(message=\"Add train-test split of 2010 and 2020 data\")\n
\nUpdated train data shape: (13158, 15)\nUpdated test data shape: (4386, 15)\n\n

Now, we train the model on the new data and validate on the new test data.

x_train, y_train = (\n    train_df.drop(dependent_variable, axis=1),\n    train_df[dependent_variable].astype(bool),\n)\nx_test, y_test = test_df.drop(dependent_variable, axis=1), test_df[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\n\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 82.67%\n\n
# access the data of the previous commit with a lakefs ref expression, in this case the same as in git.\nprevious_commit = repo.ref(f\"{TRAINING_BRANCH.id}~\").get_commit()\nfixed_commit_id = previous_commit.id\nprint(fixed_commit_id)\n
\n240be1477daa4fd3df6b6621be4398d480424683a09e65a8e0664c0a9e79f496\n\n

Let's check whether this commit SHA indeed gives us the initial train and test data by comparing the downloaded files to the data we still have in memory:

orig_train = pd.read_csv(f\"lakefs://{REPO_NAME}/{fixed_commit_id}/train_weather.csv\", index_col=0)\norig_test = pd.read_csv(f\"lakefs://{REPO_NAME}/{fixed_commit_id}/test_weather.csv\", index_col=0)\n\nprint(f\"Is the pulled training data equal to the local training data? {train.equals(orig_train)}\")\nprint(f\"Is the pulled test data equal to the local test data? {test.equals(orig_test)}\")\n
\nIs the pulled training data equal to the local training data? True\nIs the pulled test data equal to the local test data? True\n\n

Let's train and validate the model again based on the redownloaded data and see if we manage to reproduce the initial accuracy.

x_train, y_train = train.drop(dependent_variable, axis=1), train[dependent_variable].astype(bool)\nx_test, y_test = test.drop(dependent_variable, axis=1), test[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\n\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 87.31%\n\n
with fs.transaction(REPO_NAME, \"main\") as tx:\n    # returns the tag as a lakeFS object.\n    tag = tx.tag(fixed_commit_id, name=\"train-test-split-2010\")\n

Now we can access the specific files with the semantic tag. Both the fixed_commit_id and tag reference the same version ref in lakeFS, whereas a branch name always points to the latest version on that respective branch.

train_from_commit = pd.read_csv(\n    f\"lakefs://{REPO_NAME}/{fixed_commit_id}/train_weather.csv\", index_col=0\n)\ntrain_from_tag = pd.read_csv(f\"lakefs://{REPO_NAME}/{tag.id}/train_weather.csv\", index_col=0)\n

We can verify this by comparing the DataFrames: train_from_commit and train_from_tag are indeed equal.

print(\n    f\"Is the data tagged {tag!r} equal to the data in commit {fixed_commit_id[:8]}? {train_from_commit.equals(train_from_tag)}\"\n)\n
\nIs the data tagged Tag(repository=\"weather\", id=\"train-test-split-2010\") equal to the data in commit 240be147? True\n\n
"},{"location":"tutorials/demo_data_science_project/#data-science-with-lakefs-spec","title":"Data Science with lakeFS-spec","text":"

In this notebook, we will complete a small end-to-end data science tutorial that employs lakeFS-spec for data versioning. We will use versioned weather data to train a decision tree classifier to predict whether it is raining tomorrow given the current weather.

We will do the following:

  • Environment setup
  • LakeFS setup
  • Authenticating with the lakeFS server
  • Data ingestion via transactions
  • Model training
  • Updating data and retraining a model
  • Accessing data versions and reproducing experiments
  • Using tags for semantic versioning

Local Execution

If you want to execute the code in this tutorial as a Jupyter notebook yourself, download the demo_data_science_project.py file from the lakeFS-spec repository.

You can then convert the Python file to a notebook with Jupytext by running the following command: jupytext --to notebook demo_data_science_project.py.

This tutorial assumes that you have installed lakeFS-spec in a virtual environment, and that you have followed the quickstart guide to set up a local lakeFS instance.

"},{"location":"tutorials/demo_data_science_project/#environment-setup","title":"Environment setup","text":"

Install the necessary libraries for this notebook on the environment you have just created:

"},{"location":"tutorials/demo_data_science_project/#lakefs-setup","title":"lakeFS Setup","text":"

With Docker Desktop or a similar container runtime running, set up lakeFS by executing the following docker run command (from the lakeFS quickstart) in your console:

docker run --name lakefs --pull always --rm --publish 8000:8000 treeverse/lakefs:latest run --quickstart\n

You will find the authentication credentials in the terminal output. The default address for the local lakeFS web UI is http://localhost:8000/.

"},{"location":"tutorials/demo_data_science_project/#authenticating-with-the-lakefs-server","title":"Authenticating with the lakeFS server","text":"

There are multiple ways to authenticate with lakeFS from Python code. In this tutorial, we choose the YAML file configuration. By executing the cell below, you will download a YAML file containing the default lakeFS quickstart credentials and server URL to your user directory.

"},{"location":"tutorials/demo_data_science_project/#data-ingestion","title":"Data Ingestion","text":"

Now it's time to get some data. We will use the Open-Meteo API, which lets us pull historical weather data for free (for non-commercial use) and without an API token. In order to prevent hitting rate limits when repeatedly querying the API (and out of courtesy towards the operators of the API), the _maybe_urlretrieve function provides a simple local cache for the downloaded data.

For training our toy model, we download the full weather data of Munich for the year 2010:

"},{"location":"tutorials/demo_data_science_project/#upload-a-file-using-transactions","title":"Upload a file using transactions","text":"

lakeFS works similarly to git as a versioning system. You can create commits that contain specific changes to the data. You can also work with branches to create your own isolated view of the data, independently of your colleagues. Every commit (on any branch) is identified by a commit SHA. This SHA can be used to programmatically interact with specific states of your data and enables logging of the specific data versions used to create a certain model.

To easily carry out versioning operations while uploading files, you can use transactions. A transaction is a context manager that keeps track of all files that were uploaded in its scope, as well as all versioning operations happening between file uploads. All operations are deferred to the end of the transaction, and are executed sequentially on completion.

To create a commit after a file upload, you can run the following transaction:
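For reference, this is the same transaction as in the notebook cell shown earlier in this tutorial:

NEW_BRANCH = lakefs.Branch(REPO_NAME, \"transform-raw-data\", client=fs.client)\nNEW_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, NEW_BRANCH) as tx:\n    fs.put(outfile, f\"{REPO_NAME}/{tx.branch.id}/weather-2010.json\")\n    tx.commit(message=\"Add 2010 weather data\")\n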

"},{"location":"tutorials/demo_data_science_project/#data-transformation","title":"Data Transformation","text":"

Now let's transform the data for our use case. We put the transformation into a function to be able to reuse it later.

In this notebook, we use a simple toy model to predict whether it is raining at the same time tomorrow given weather data from right now.

We will skip a lot of possible feature engineering and other data science aspects in order to focus more on the application of the LakeFSFileSystem.

"},{"location":"tutorials/demo_data_science_project/#training-the-initial-weather-model","title":"Training the initial weather model","text":"

First we will do a train-test split:

"},{"location":"tutorials/demo_data_science_project/#updating-data-and-retraining-the-model","title":"Updating data and retraining the model","text":"

Until now, we only have used data from 2010. Let's download additional 2020 data, transform it, and save it to lakeFS.

"},{"location":"tutorials/demo_data_science_project/#accessing-data-versions-through-commits-and-reproducing-experiments","title":"Accessing data versions through commits and reproducing experiments","text":"

If we want to return to our initial data and reproduce the first experiment (the model trained on the 2010 data with its initial accuracy), we can go back in the commit history of the training branch and select the data snapshot of the appropriate commit. Since we have already created multiple commits on the same branch, we will address different data versions by their commit SHAs.

To obtain the actual commit SHA from a branch, we have multiple options. Manually, we could go into the lakeFS UI, select the training branch, and navigate to the Commits tab. There, we take the parent of the previous commit, titled Add train-test split of 2010 weather data, and copy its revision SHA (also called ID).

In code, we can obtain commit SHAs for different revisions on the training branch by using lakefs.Reference objects.
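For example, as in the notebook cell above, the parent of the current training branch tip can be resolved like this:

previous_commit = repo.ref(f\"{TRAINING_BRANCH.id}~\").get_commit()\nfixed_commit_id = previous_commit.id\n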

"},{"location":"tutorials/demo_data_science_project/#using-tags-instead-of-commit-shas-for-semantic-versioning","title":"Using tags instead of commit SHAs for semantic versioning","text":"

The above method for data versioning works great when you have experiment tracking tools to store and retrieve the commit SHA in automated pipelines, but retrieving SHAs by hand during manual prototyping can be tedious. We can make selected versions of the dataset more accessible with semantic versioning by attaching a human-interpretable tag to a specific commit SHA.

Creating a tag is easiest inside a transaction, just like the file uploads we carried out earlier. To do this, simply call tx.tag on the transaction and supply the commit SHA (or ref) to tag and the intended tag name. Tags are immutable once created, so attempting to tag two different commits with the same name will result in an error.
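For reference, the tag creation from the notebook cell above:

with fs.transaction(REPO_NAME, \"main\") as tx:\n    tag = tx.tag(fixed_commit_id, name=\"train-test-split-2010\")\n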

"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"

Welcome to lakeFS-spec, a filesystem-spec backend implementation for the lakeFS data lake. Our primary goal is to streamline versioned data operations in lakeFS, enabling seamless integration with popular data science tools such as Pandas, Polars, and DuckDB directly from Python.

Highlights:

  • Simple repository operations in lakeFS
  • Easy access to underlying storage and versioning operations
  • Seamless integration with the fsspec ecosystem
  • Directly access lakeFS objects from popular data science libraries (including Pandas, Polars, DuckDB, PyArrow) with minimal code
  • Transaction support for reliable data version control
  • Smart data transfers through client-side caching (up-/download)
  • Auto-discovery configuration

Early Adopters

We are seeking early adopters who would like to actively participate in our feedback process and shape the future of the library. If you are interested in using the library and want to get in touch with us, please reach out via GitHub Discussions.

Quickstart

Step-by-step installation and first operations

Tutorials

In-depth tutorials on using lakeFS-spec

API Reference

Full documentation of the Python API

User Guide

Solving specific tasks with lakeFS-spec

Contributing

How to contribute to the project

"},{"location":"CONTRIBUTING/","title":"Contributing to lakeFS-spec","text":"

Thank you for your interest in contributing to this project!

We appreciate issue reports, pull requests for code and documentation, as well as any project-related communication through GitHub Discussions.

"},{"location":"CONTRIBUTING/#getting-started","title":"Getting Started","text":"

To get started with development, you can follow these steps:

  1. Clone this repository:

    git clone https://github.com/aai-institute/lakefs-spec.git\n
  2. Navigate to the directory and install the development dependencies into a virtual environment:

    cd lakefs-spec\npython3 -m venv venv --system-site-packages\nsource venv/bin/activate\npython -m pip install -r requirements-dev.txt\npython -m pip install -e . --no-deps\n
  3. After making your changes, verify they adhere to our Python code style by running pre-commit:

    pre-commit run --all-files\n

    You can also set up Git hooks through pre-commit to perform these checks automatically:

    pre-commit install\n
  4. To run the tests against an ephemeral lakeFS instance, you just run pytest:

    pytest\n

    To spin up a local lakeFS instance quickly for testing, you can use the Docker Compose file bundled with this repository:

    docker-compose -f hack/docker-compose.yml up\n
"},{"location":"CONTRIBUTING/#updating-dependencies","title":"Updating dependencies","text":"

Dependencies should stay locked for as long as possible, ideally for a whole release. If you have to update a dependency during development, you should do the following:

  1. If it is a core dependency needed for the package, add it to the dependencies section in the pyproject.toml.
  2. In case of a development dependency, add it to the dev section of the project.optional-dependencies table instead.
  3. Dependencies needed for documentation generation are found in the docs sections of project.optional-dependencies.

After adding the dependency in either of these sections, run the helper script hack/lock-deps.sh (which in turn uses pip-compile) to pin all dependencies again:

python -m pip install --upgrade pip-tools\nhack/lock-deps.sh\n

In addition to these manual steps, we also provide pre-commit hooks that automatically lock the dependencies whenever pyproject.toml is changed.

Selective package upgrades for existing dependencies are also handled by the helper script above. If you want to update the lakefs-sdk dependency, for example, simply run:

hack/lock-deps.sh lakefs-sdk\n

Tip

Since the official development version is Python 3.11, please run the above commands in a virtual environment with Python 3.11.

"},{"location":"CONTRIBUTING/#working-on-documentation","title":"Working on Documentation","text":"

Improvements or additions to the project's documentation are highly appreciated.

The documentation is based on the MkDocs and Material for MkDocs (mkdocs-material) projects, see their homepages for in-depth guides on their features and usage. We use the Numpy documentation style for Python docstrings.

To build the documentation locally, you need to first install the optional docs dependencies from requirements-docs.txt, e.g., with pip install -r requirements-docs.txt. You can then start a local documentation server with mkdocs serve, or build the documentation into its output folder in public/.

In order to maintain documentation for multiple versions of this library, we use the mike tool, which automatically maintains individual documentation builds per version and publishes them to the gh-pages branch.

The GitHub CI pipeline automatically invokes mike as part of the release process with the correct version and updates the GitHub pages branch for the project.

"},{"location":"quickstart/","title":"Quickstart","text":"

Welcome! This quickstart guide will get you up and running with lakeFS-spec by showing you how to

  1. install the lakefs-spec package,
  2. spin up a local lakeFS server,
  3. create a lakeFS repository for experimentation, and
  4. perform basic file system operations in a lakeFS repository using lakeFS-spec.
Prerequisites

To follow along with this guide, you will need a few prerequisites ready on your machine:

  • lakeFS-spec supports Windows, macOS, or Linux
  • Docker, with Docker Compose
  • Python 3.9 or later
  • optionally, lakectl, the lakeFS command line tool

Please take a moment to make sure you have these tools available before proceeding with the next steps.

"},{"location":"quickstart/#installing-lakefs-spec","title":"Installing lakeFS-spec","text":"A note on virtual environments

We generally recommend installing the library in a virtual environment to ensure proper isolation, especially when following this quickstart guide.

If you are using Poetry, virtual environments can automatically be created by the tool.

If you prefer the venv functionality built into Python, see the official docs (tl;dr: python -m venv venv; source venv/bin/activate).

To install the package directly from PyPI, run:

pippoetry
pip install lakefs-spec\n
poetry add lakefs-spec\n

Or, if you want to try the latest pre-release version directly from GitHub:

pippoetry
pip install git+https://github.com/aai-institute/lakefs-spec.git\n
poetry add git+https://github.com/aai-institute/lakefs-spec.git\n
"},{"location":"quickstart/#first-steps","title":"First Steps","text":""},{"location":"quickstart/#spinning-up-a-local-lakefs-instance","title":"Spinning up a local lakeFS instance","text":"

Warning

This setup is not recommended for production use, since it does not store the data persistently.

Please check out the lakeFS docs for production-ready deployment options.

If you don't already have access to a lakeFS server, you can quickly start a local instance using Docker Compose. Before continuing, please make sure Docker is installed and running on your machine.

The lakeFS quickstart deployment can be launched directly with a configuration file provided in the lakeFS-spec repository:

$ curl https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/hack/docker-compose.yml | docker-compose -f - up\n

If you do not have curl installed on your machine or would like to examine and/or customize the container configuration, you can also create a docker-compose.yml file locally and use it with docker-compose up:

docker-compose.yml
version: \"3\"\n\nservices:\n  lakefs:\n    image: treeverse/lakefs:1.7.0\n    ports:\n      - 8000:8000\n    environment:\n      LAKEFS_INSTALLATION_USER_NAME: \"quickstart\"\n      LAKEFS_INSTALLATION_ACCESS_KEY_ID: \"AKIAIOSFOLQUICKSTART\"\n      LAKEFS_INSTALLATION_SECRET_ACCESS_KEY: \"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\"\n      LAKEFS_DATABASE_TYPE: \"local\"\n      LAKEFS_AUTH_ENCRYPT_SECRET_KEY: \"THIS_MUST_BE_CHANGED_IN_PRODUCTION\"\n      LAKEFS_BLOCKSTORE_TYPE: \"local\"\n

In order to allow lakeFS-spec to automatically discover credentials to access this lakeFS instance, create a .lakectl.yaml in your home directory containing the credentials for the quickstart environment (you can also use lakectl config to create this file interactively if you have the lakectl tool installed on your machine):

~/.lakectl.yaml
credentials: # (1)!\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n
  1. These must match the credentials set in the environment section of the Docker Compose file above

After the container has finished initializing, you can access the web UI of your local lakeFS deployment in your browser. Fill out the setup form, where you can optionally share your email address with the developers of lakeFS to receive updates on their product. Next, you can log into your fresh lakeFS instance with the credentials listed above.

Success

Your fresh local lakeFS instance is a playground for you to explore lakeFS functionality.

In the next step, we will create your first repository on this server.

"},{"location":"quickstart/#create-a-lakefs-repository","title":"Create a lakeFS repository","text":"

Once you have logged into the web UI of the lakeFS server for the first time, you can create an empty repository on the next page. Click the small Click here link at the bottom of the page to proceed and create a repository named repo (we don't want to add the sample data for this guide):

Tip: Creating a repository later

If you have inadvertently skipped over the quickstart repository creation page, you can always create a new repository on the Repositories tab in the lakeFS web UI (and optionally choose to add the sample data):

Success

You have successfully created a lakeFS repository named repo, ready to be used with lakeFS-spec.

"},{"location":"quickstart/#using-the-lakefs-file-system","title":"Using the lakeFS file system","text":"

We will now use the lakeFS-spec file system interface to perform some basic operations on the repository created in the previous step:

  • Upload a local file to the repository
  • Read data from a file in the repository
  • Make a commit
  • Fetch metadata about repository contents
  • Delete a file from the repository

To get started, create a file called quickstart.py with the following contents:

quickstart.py
from pathlib import Path\n\nfrom lakefs_spec import LakeFSFileSystem\n\nREPO, BRANCH = \"repo\", \"main\"\n\n# Prepare example local data\nlocal_path = Path(\"demo.txt\")\nlocal_path.write_text(\"Hello, lakeFS!\")\n

Tip

We will keep adding more code to this file as we progress through the next steps. Feel free to execute the script after each step and observe the effects as noted in the guide.

This code snippet prepares a file demo.txt on your machine, ready to be added to the lakeFS repository, so let's do just that:

fs = LakeFSFileSystem()  # will auto-discover credentials from ~/.lakectl.yaml\nrepo_path = f\"{REPO}/{BRANCH}/{local_path.name}\"\n\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.put(str(local_path), f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Add demo data\")\n

If you execute the quickstart.py script at this point, you can already see the committed file in the lakeFS web UI:

While examining the file contents in the browser is nice, we want to access the committed file programmatically. Add the following lines at the end of your script and observe the output:

f = fs.open(repo_path, \"rt\")\nprint(f.readline())  # prints \"Hello, lakeFS!\"\n

Note that executing the same code multiple times will only result in a single commit in the repository since the contents of the file on disk and in the repository are identical.

In addition to simple read and write operations, the fsspec file system interface also allows us to list the files in a repository folder using ls, and query the metadata of objects in the repository through info (akin to the POSIX stat system call). Let's add the following code to our script and observe the output:

# Compare the sizes of local file and repo\nfile_info = fs.info(repo_path)\nprint(\n    f\"{local_path.name}: local size: {local_path.stat().st_size}, remote size: {file_info['size']}\"\n)\n\n# Get information about all files in the repo root\nprint(fs.ls(f\"{REPO}/{BRANCH}/\"))\n

As the last order of business, let's restore the repository to its original state by removing the file with the rm operation and creating another commit (we also delete the local file, since we don't need it anymore):

with fs.transaction(REPO, BRANCH) as tx:\n    fs.rm(f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Delete demo data\")\n\nlocal_path.unlink()\n

Success

You now have all the basic tools available to version data from your Python code using the file system interface provided by lakeFS-spec.

Full example code quickstart.py
from pathlib import Path\n\nfrom lakefs_spec import LakeFSFileSystem\n\nREPO, BRANCH = \"repo\", \"main\"\n\n# Prepare example local data\nlocal_path = Path(\"demo.txt\")\nlocal_path.write_text(\"Hello, lakeFS!\")\n\n# Upload the local file to the repo and commit\nfs = LakeFSFileSystem()  # will auto-discover credentials from ~/.lakectl.yaml\nrepo_path = f\"{REPO}/{BRANCH}/{local_path.name}\"\n\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.put(str(local_path), f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Add demo data\")\n\n# Read back the file contents\nf = fs.open(repo_path, \"rt\")\nprint(f.readline())  # prints \"Hello, lakeFS!\"\n\n# Compare the sizes of local file and repo\nfile_info = fs.info(repo_path)\nprint(\n    f\"{local_path.name}: local size: {local_path.stat().st_size}, remote size: {file_info['size']}\"\n)\n\n# Get information about all files in the repo root\nprint(fs.ls(f\"{REPO}/{BRANCH}/\"))\n\n# Delete uploaded file from the repository (and commit)\nwith fs.transaction(REPO, BRANCH) as tx:\n    fs.rm(f\"{REPO}/{tx.branch.id}/{local_path.name}\")\n    tx.commit(message=\"Delete demo data\")\n\nlocal_path.unlink()\n
"},{"location":"quickstart/#next-steps","title":"Next Steps","text":"

After this walkthrough of the installation and an introduction to basic file system operations using lakeFS-spec, you might want to consider more advanced topics:

  • API Reference
  • User Guide, in particular
    • How to use the lakeFS file system
    • How to use lakeFS-spec with third-party data science libraries
  • Tutorial: Using lakeFS-spec in a data science project
"},{"location":"guides/","title":"User Guide","text":"

The lakeFS-spec user guide provides documentation for users of the library looking to solve specific tasks. See the Quickstart guide for an introductory tutorial.

  • How to use the lakeFS file system
  • Passing configuration to the file system
  • Using transactions on the lakeFS file system
  • How to use lakeFS-spec with third-party data science libraries
"},{"location":"guides/configuration/","title":"Passing configuration to the file system","text":"

There are multiple ways to configure the LakeFSFileSystem for use with a deployed lakeFS instance. This guide introduces them in the order of least to most in-Python configuration - the preferred way to use the file system is with as little Python code as possible.

Info

The configuration methods are introduced in reverse order of precedence - config file arguments have the lowest priority and are overwritten by environment variables (if specified).

"},{"location":"guides/configuration/#the-lakectlyaml-configuration-file","title":"The .lakectl.yaml configuration file","text":"

The easiest way of configuring the lakeFS file system is with a lakectl YAML configuration file. To address a lakeFS server, the following minimum configuration is required:

~/.lakectl.yaml
credentials:\n  access_key_id: <ID>\n  secret_access_key: <KEY>\nserver:\n  endpoint_url: <LAKEFS-HOST>\n

For a local instance produced by the quickstart, the following values will work:

~/.lakectl.yaml
credentials:\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n

To work without any more arguments \"out of the box\", the configuration file has to be placed in your home directory with the name .lakectl.yaml (this is where lakeFS expects it). If you set all values correctly, you can instantiate the lakeFS file system without any arguments:

from lakefs_spec import LakeFSFileSystem\n\n# zero config necessary.\nfs = LakeFSFileSystem()\n

If you cannot use the default location ($HOME/.lakectl.yaml), you can read a file from any other location by passing the configfile argument:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem(configfile=\"/path/to/my/configfile.yaml\")\n
"},{"location":"guides/configuration/#setting-environment-variables","title":"Setting environment variables","text":"

It is also possible to specify certain configuration values used for authentication with the lakeFS server via environment variables. For these values, the variable name is the constructor argument name prefixed with LAKEFS_, e.g. the host argument can be set via the LAKEFS_HOST environment variable.

import os\nfrom lakefs_spec import LakeFSFileSystem\n\nos.environ[\"LAKEFS_HOST\"] = \"http://my-lakefs.host\"\nos.environ[\"LAKEFS_USERNAME\"] = \"my-username\"\nos.environ[\"LAKEFS_PASSWORD\"] = \"my-password\"\n\n# also zero-config.\nfs = LakeFSFileSystem()\n

Info

Not all initialization values can be set via environment variables - the proxy, create_branch_ok, and source_branch arguments can only be supplied in Python.
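
These Python-only options are passed directly to the constructor; in the following sketch, the proxy address is a placeholder, and connection credentials are still discovered from the configuration file or environment variables:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem(\n    proxy=\"http://proxy.internal:8080\",  # placeholder address\n    create_branch_ok=False,\n    source_branch=\"main\",\n)\n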

"},{"location":"guides/configuration/#appendix-mixing-zero-config-methods","title":"Appendix: Mixing zero-config methods","text":"

Two of the introduced methods allow for \"zero-config\" (i.e. no arguments given to the constructor) initialization of the file system. However, care must be taken when creating multiple file systems that are meant to use different configurations (for example, one configured through the config file and another through environment variables).

The reason for this is the instance caching mechanism built into fsspec. While this allows for efficient reuse of file systems e.g. by third-party libraries (pandas, DuckDB, ...), it can lead to silent misconfigurations. Consider this example, with an existent .lakectl.yaml file:

~/.lakectl.yaml
credentials:\n  access_key_id: AKIAIOSFOLQUICKSTART\n  secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\nserver:\n  endpoint_url: http://127.0.0.1:8000\n

Now, mixing config file and environment variable initializations leads to the wrong result:

import os\nfrom lakefs_spec import LakeFSFileSystem\n\n# first file system, initialized from the config file\nconfig_fs = LakeFSFileSystem()\n\nos.environ[\"LAKEFS_HOST\"] = \"http://my-other-lakefs.host\"\nos.environ[\"LAKEFS_USERNAME\"] = \"my-username\"\nos.environ[\"LAKEFS_PASSWORD\"] = \"my-password\"\n\nenvvar_fs = LakeFSFileSystem()\n\nprint(config_fs is envvar_fs) # <- prints True! \n

The reason why the above code does not work as desired is that the cached config-file-initialized file system is simply reused on the second assignment. To clear the file system instance cache, you can run the following:

from lakefs_spec import LakeFSFileSystem\n\nLakeFSFileSystem.clear_instance_cache()\n
"},{"location":"guides/filesystem-usage/","title":"How to use the lakeFS file system","text":"

This guide contains instructions and code snippets on how to use the lakeFS file system.

"},{"location":"guides/filesystem-usage/#the-lakefs-uri-structure","title":"The lakeFS URI structure","text":"

In the following subsections, we frequently make use of lakeFS URIs in the example code. lakeFS URIs identify resources in a lakeFS deployment through a unique path consisting of repository name, lakeFS revision/ref name, and file name relative to the repository root. Optionally, they may be prefixed with the lakefs:// URI scheme (this is required when using third-party libraries).

As an example, a URI like repo/main/file.txt addresses the file.txt file on the main branch in the repository named repo.

In some lakeFS file system operations, directories are also allowed as resource names. For example, the URI repo/main/data/ (note the optional trailing slash) refers to the data directory on the main branch in the repo repository.

"},{"location":"guides/filesystem-usage/#on-staged-versus-committed-changes","title":"On staged versus committed changes","text":"

When uploading, copying, or removing files or directories on a branch, these operations result in staged changes in the repository until a commit is created. lakeFS-spec does not create these commits automatically, since it rigorously separates file operations from versioning operations. If you want to conduct versioning operations, like creating commits, between file transfers, the best way to do so is by using file system transactions.
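
A minimal sketch of this pattern (repository and branch names are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# this upload only stages the object on the branch...\nfs.put_file(\"data.csv\", \"my-repo/main/data.csv\")\n\n# ...whereas inside a transaction, a commit can be created right after the upload:\nwith fs.transaction(\"my-repo\", \"main\") as tx:\n    fs.put_file(\"data.csv\", f\"my-repo/{tx.branch.id}/data.csv\")\n    tx.commit(message=\"Add data.csv\")\n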

"},{"location":"guides/filesystem-usage/#how-to-use-lakefs-file-system-apis","title":"How to use lakeFS file system APIs","text":"

The following sections explain in more depth how to use the LakeFSFileSystem APIs, covering the operations that lakeFS-spec implements explicitly. In addition, a number of file system APIs are inherited from the AbstractFileSystem interface in fsspec.

More information on file system usage can be found in the fsspec documentation.

"},{"location":"guides/filesystem-usage/#uploading-and-downloading-files","title":"Uploading and downloading files","text":"

Arguably the most important feature of the file system is file transfer.

"},{"location":"guides/filesystem-usage/#file-uploads","title":"File uploads","text":"

To upload a file, you can use the fs.put() and fs.put_file() methods. While fs.put_file() operates on single files only, the fs.put() API can be used for directory uploads.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# local path first, then remote target path.\nfs.put_file(\"file.txt\", \"my-repo/my-ref/file.txt\")\n

If you want to upload an entire directory to lakeFS, you can use the fs.put() API together with the recursive=True switch:

# structure:\n#   dir/\n#   \u251c\u2500\u2500 a.txt\n#   \u251c\u2500\u2500 b.yaml\n#   \u251c\u2500\u2500 c.csv\n#   \u2514\u2500\u2500 ...\n\nfs.put(\"dir\", \"my-repo/my-ref/dir\", recursive=True)\n

Info

The above method of file uploading results in two transfers: Once from the client to the lakeFS server, and once from the lakeFS server to the object storage. This can impact performance if the uploaded files are very large. To avoid this performance issue, you can also decide to write the file directly to the underlying object storage:

fs = LakeFSFileSystem()\n\nfs.put_file(\"file.txt\", \"my-repo/my-ref/file.txt\", use_blockstore=True)\n

Direct lakeFS blockstore uploads require the installation of the corresponding fsspec file system implementation through pip. For an S3-based lakeFS deployment, install the s3fs package. For Google Cloud Storage (GCS), install the gcsfs package. For Azure blob storage, install the adlfs package.

"},{"location":"guides/filesystem-usage/#file-downloads","title":"File downloads","text":"

To download a file, you can use the fs.get() or fs.get_file() methods. While fs.get_file() downloads single files only, the fs.get() API can be used for recursive directory downloads.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# remote path, then local target path.\nfs.get_file(\"my-repo/my-ref/file.txt\", \"file.txt\")\n

In the case of a directory in lakeFS, use the fs.get() API together with the recursive=True switch:

# structure:\n#   dir/\n#   \u251c\u2500\u2500 a.txt\n#   \u251c\u2500\u2500 b.yaml\n#   \u251c\u2500\u2500 c.csv\n#   \u2514\u2500\u2500 ...\n\n# downloads the entire `dir` directory (and subdirectories) into the current directory.\nfs.get(\"my-repo/my-ref/dir\", \"dir\", recursive=True)\n
"},{"location":"guides/filesystem-usage/#checking-the-existence-of-lakefs-objects","title":"Checking the existence of lakeFS objects","text":"

To check the existence of a file in a given revision of a repository, you can use the fs.exists() API:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_file_exists = fs.exists(\"my-repo/my-ref/my-file.txt\")\n

This function returns True if the file exists on that revision, and False if it does not. Errors other than \"not found\" (e.g. permission errors) are raised, since object existence cannot be decided in those cases.

Warning

fs.exists() only works on file objects, and will return False if called on directories.
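
If you need an existence check for directories, a small workaround sketch (not a dedicated lakeFS-spec API) is to fall back to fs.info(), which also works on directories and raises a FileNotFoundError for missing paths:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\ndef dir_exists(path: str) -> bool:\n    try:\n        fs.info(path)\n        return True\n    except FileNotFoundError:\n        return False\n\nprint(dir_exists(\"my-repo/my-ref/my-dir/\"))\n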

"},{"location":"guides/filesystem-usage/#obtaining-info-on-stored-objects","title":"Obtaining info on stored objects","text":"

To query the metadata of a single object in a lakeFS repository, use the fs.info() API:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_file_info = fs.info(\"my-repo/my-ref/my-file.txt\")\n

The resulting my_file_info object is a dictionary containing useful information about the file, such as its full path, checksum, last modification timestamp, and size (in bytes).

You can also call fs.info() on directories:

dir_info = fs.info(\"my-repo/my-ref/dir/\")\n

In this case, the resulting dir_info object only contains the directory name and the cumulative size of the files it contains.
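
A short sketch printing a few of the returned fields (the field names match those documented for fs.info() in the API reference):

file_info = fs.info(\"my-repo/my-ref/my-file.txt\")\nprint(file_info[\"size\"], file_info[\"checksum\"], file_info[\"mtime\"])\n\ndir_info = fs.info(\"my-repo/my-ref/dir/\")\nprint(dir_info[\"name\"], dir_info[\"size\"])  # directories report their name and total size\n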

"},{"location":"guides/filesystem-usage/#listing-directories-in-lakefs","title":"Listing directories in lakeFS","text":"

To list the files in a directory in lakeFS, use the fs.ls() method:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmy_dir_listing = fs.ls(\"my-repo/my-ref/my-dir/\")\n

This returns a list of Python dictionaries containing information on the objects contained in the requested directory. The returned objects have the same fields set as those returned by a normal fs.info() call on a file object.
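
For example, to obtain only the object names instead of full metadata, or to bypass the directory listing cache (the detail parameter and refresh keyword are documented in the API reference):

names = fs.ls(\"my-repo/my-ref/my-dir/\", detail=False)\n\nfresh_listing = fs.ls(\"my-repo/my-ref/my-dir/\", refresh=True)\n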

"},{"location":"guides/filesystem-usage/#deleting-objects-from-a-lakefs-branch","title":"Deleting objects from a lakeFS branch","text":"

To delete objects from a lakeFS branch, use the fs.rm_file() or fs.rm() APIs. As before, while the former works only for single files, the latter can be used to remove entire directories with the recursive=True option.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nfs.rm_file(\"my-repo/my-branch/my-file.txt\")\n\n# removes the entire `my-dir` directory.\nfs.rm(\"my-repo/my-branch/my-dir/\", recursive=True)\n
"},{"location":"guides/filesystem-usage/#copying-files-in-a-repository","title":"Copying files in a repository","text":"

To copy files on a branch or from one branch to another, use the fs.cp_file() or fs.copy() methods:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# copies a single file on the same branch to a new location.\nfs.cp_file(\"my-repo/branch-a/file.txt\", \"my-repo/branch-a/file.txt.bak\")\n\n# copies a single file from branch A to branch B.\nfs.cp_file(\"my-repo/branch-a/file.txt\", \"my-repo/branch-b/file.txt\")\n\n# copies the entire `my-dir` directory from branch A to branch B (which must exist).\nfs.copy(\"my-repo/branch-a/my-dir/\", \"my-repo/branch-b/my-dir/\", recursive=True)\n

Info

Files and directories can only be copied between branches in the same repository, not between different repositories.

Trying to copy to a non-existent branch will not create the branch.

"},{"location":"guides/integrations/","title":"How to use lakeFS-spec with third-party data science libraries","text":"

lakeFS-spec is built on top of the fsspec library, which allows third-party libraries to make use of its file system abstraction to offer high-level features. The fsspec documentation lists examples of its users, mostly data science libraries.

This user guide page adds more detail on how lakeFS-spec can be used with four prominent data science libraries.

Code Examples

The code examples assume access to an existing lakeFS server with a quickstart repository containing the sample data already set up.

Please see the Quickstart guide or lakeFS quickstart guide if you need guidance in getting started.

The relevant lines for the lakeFS-spec integration in the following code snippets are highlighted.

"},{"location":"guides/integrations/#pandas","title":"Pandas","text":"

Pandas can read and write data from remote locations, and uses fsspec for all URLs that are not local or HTTP(S).

This means that (almost) all pd.read_* and pd.DataFrame.to_* operations can benefit from the lakeFS integration offered by our library without any additional configuration. See the Pandas documentation on reading/writing remote files for additional details.

The following code snippet illustrates how to read and write Pandas data frames in various formats from/to a lakeFS repository in the context of a transaction:

import pandas as pd\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = pd.read_parquet(f\"lakefs://quickstart/{tx.branch.id}/lakes.parquet\")\n    german_lakes = lakes.query('Country == \"Germany\"')\n    german_lakes.to_csv(f\"lakefs://quickstart/{tx.branch.id}/german_lakes.csv\")\n\n    tx.commit(message=\"Add German lakes\")\n
"},{"location":"guides/integrations/#duckdb","title":"DuckDB","text":"

The DuckDB in-memory database management system includes support for fsspec file systems as part of its Python API (see the official documentation on using fsspec filesystems for details). This allows DuckDB to transparently query and store data located in lakeFS repositories through lakeFS-spec.

Similar to the example above, the following code snippet illustrates how to read and write data from/to a lakeFS repository in the context of a transaction through the DuckDB Python API:

import duckdb\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\nduckdb.register_filesystem(fs)\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = duckdb.read_parquet(\"lakefs://quickstart/main/lakes.parquet\")\n    italian_lakes = duckdb.sql(\"SELECT * FROM lakes where Country='Italy'\")\n    italian_lakes.to_csv(f\"lakefs://quickstart/{tx.branch.id}/italian_lakes.csv\")\n\n    tx.commit(message=\"Add Italian lakes\")\n
Info

duckdb.register_filesystem(fs) makes the lakeFS-spec file system known to DuckDB. Alternatively, duckdb.register_filesystem(fsspec.filesystem(\"lakefs\")) can be used to avoid the direct import of LakeFSFileSystem.
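
Spelled out as code, that alternative registration looks like this:

import duckdb\nimport fsspec\n\nduckdb.register_filesystem(fsspec.filesystem(\"lakefs\"))\n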
"},{"location":"guides/integrations/#polars","title":"Polars","text":"

Warning

There is an ongoing discussion in the Polars development team whether to remove support for fsspec file systems, with no clear outcome as of the time this page was written. Please refer to the discussion on the relevant GitHub issue in case you encounter any problems.

The Python API wrapper for the Rust-based Polars DataFrame library can access remote storage through fsspec, similar to Pandas (see the official documentation on cloud storage).

Again, the following code example demonstrates how to read a Parquet file and save a modified version back in CSV format to a lakeFS repository from Polars in the context of a transaction:

import polars as pl\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes = pl.read_parquet(f\"lakefs://quickstart/{tx.branch.id}/lakes.parquet\")\n    us_lakes = lakes.filter(pl.col(\"Country\") == \"United States of America\")\n\n    with fs.open(f\"lakefs://quickstart/{tx.branch.id}/us_lakes.csv\", \"wb\") as f:\n        us_lakes.write_csv(f)\n\n    tx.commit(message=\"Add US lakes\")\n
Info

Polars does not support writing directly to remote storage through the pl.DataFrame.write_* API (see the Polars documentation), which is why the example above writes the CSV file through fs.open() instead.
"},{"location":"guides/integrations/#pyarrow","title":"PyArrow","text":"

Apache Arrow and its Python API, PyArrow, can also use fsspec file systems to perform I/O operations on data objects. The documentation has additional details on using fsspec-compatible file systems with Arrow.

PyArrow read_* and write_* functions take an explicit filesystem parameter, which accepts any fsspec file system, such as the LakeFSFileSystem provided by this library.

The following example code illustrates the use of lakeFS-spec with PyArrow, reading a Parquet file and writing it back to a lakeFS repository as a partitioned CSV dataset in the context of a transaction:

import pyarrow as pa\nimport pyarrow.dataset as ds\nimport pyarrow.parquet as pq\n\nfrom lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"quickstart\", \"main\") as tx:\n    lakes_table = pq.read_table(f\"quickstart/{tx.branch.id}/lakes.parquet\", filesystem=fs)\n\n    ds.write_dataset(\n        lakes_table,\n        f\"quickstart/{tx.branch.id}/lakes\",\n        filesystem=fs,\n        format=\"csv\",\n        partitioning=ds.partitioning(pa.schema([lakes_table.schema.field(\"Country\")])),\n    )\n\n    tx.commit(\"Add partitioned lakes data set\")\n
"},{"location":"guides/transactions/","title":"Using transactions on the lakeFS file system","text":"

In addition to file operations, you can carry out versioning operations in your Python code using file system transactions.

Transactions in lakeFS-spec behave similarly to the transactions in the high-level lakeFS SDK: Both approaches create an ephemeral branch for a transaction, perform the operations in the context block on that ephemeral branch, and optionally merge it back into the source branch upon exiting the context manager.

They are an \"all or nothing\" proposition: If an error occurs during the transaction, the base branch is left unchanged.

The lakeFS-spec transaction inherits from fsspec transactions. For more information on fsspec transactions, see the official documentation.

"},{"location":"guides/transactions/#versioning-operations","title":"Versioning operations","text":"

The lakeFS file system's transaction is the intended place for conducting versioning operations between file transfers. The following is an example of file uploads with commit creations, with a tag being applied at the end.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\") as tx:\n    fs.put_file(\"train-data.txt\", f\"repo/{tx.branch.id}/train-data.txt\")\n    tx.commit(message=\"Add training data\")\n    fs.put_file(\"test-data.txt\", f\"repo/{tx.branch.id}/test-data.txt\")\n    sha = tx.commit(message=\"Add test data\")\n    tx.tag(sha, name=\"My train-test split\")\n

The full list of supported lakeFS versioning operations (by default, these operations target the transaction branch), with a short sketch of the remaining operations following the list:

  • commit, for creating a commit, optionally with attached metadata.
  • merge, for merging a given branch.
  • revert, for reverting a previous commit.
  • rev_parse, for parsing revisions like branch/tag names and SHA fragments into full commit SHAs.
  • tag, for creating a tag pointing to a commit.
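
A short sketch of two operations not shown above; the exact argument names used here are assumptions, so consult the API reference for the precise signatures:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\") as tx:\n    fs.put_file(\"new-data.txt\", f\"repo/{tx.branch.id}/new-data.txt\")\n    sha = tx.commit(message=\"Add new data\")\n\n    # resolve a ref (branch/tag name or SHA fragment) into a full commit SHA\n    full_sha = tx.rev_parse(sha)\n\n    # undo that commit again on the transaction branch\n    tx.revert(tx.branch, full_sha)\n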
"},{"location":"guides/transactions/#lifecycle-of-ephemeral-transaction-branches","title":"Lifecycle of ephemeral transaction branches","text":"

You can control the lifecycle for a transaction branch with the delete argument:

  • By default (delete=\"onsuccess), the branch is deleted after successful completion, and left over in case of failure for debugging purposes.
  • If delete=\"always\", the branch is unconditionally deleted after the transaction regardless of its status.
  • Similarly, if delete=\"never\", the branch is unconditionally left in place after the transaction.

Additionally, the automerge keyword controls whether the transaction branch is merged after successful completion of the transaction. It has no effect if an error occurs over the course of the transaction.
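
A sketch combining both options, assuming they are passed to fs.transaction() alongside the repository and branch (as the delete argument is in the error handling example below):

with fs.transaction(\"repo\", \"main\", automerge=False, delete=\"never\") as tx:\n    fs.put_file(\"my-file.txt\", f\"repo/{tx.branch.id}/my-file.txt\")\n    tx.commit(message=\"Add my-file.txt\")\n\n# the transaction branch is kept and not merged, e.g. for manual review\n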

"},{"location":"guides/transactions/#error-handling","title":"Error handling","text":"

Since all files are uploaded to a short-lived transaction branch, no commit on the target branch happens in case of an exception:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"repo\", \"main\", delete=\"onsuccess\") as tx:\n    fs.put_file(\"my-file.txt\", f\"repo/{tx.branch.id}/my-file.txt\")\n    tx.commit(message=\"Add my-file.txt\")\n    raise ValueError(\"oops!\")\n

The above code will not modify the main branch, since the ValueError prevents the merge of the transaction branch. Note that you can examine the contents of the transaction branch due to delete=\"onsuccess\" (the default behavior), which prevents deletion of the branch in case of failure for debugging purposes.

"},{"location":"reference/SUMMARY/","title":"SUMMARY","text":"
  • lakefs_spec
    • errors
    • spec
    • transaction
    • util
"},{"location":"reference/lakefs_spec/","title":"lakefs_spec","text":"

lakefs-spec is an fsspec file system integration for the lakeFS data lake.

"},{"location":"reference/lakefs_spec/errors/","title":"errors","text":"

Error translation facilities to map lakeFS API errors to Python-native OS errors in the lakeFS file system.

This is important to honor the fsspec API contract, where users only need to expect builtin Python exceptions to avoid complicated error handling setups.

"},{"location":"reference/lakefs_spec/errors/#lakefs_spec.errors.translate_lakefs_error","title":"translate_lakefs_error","text":"
translate_lakefs_error(\n    error: ServerException,\n    rpath: str | None = None,\n    message: str | None = None,\n    set_cause: bool = True,\n) -> OSError\n

Convert a lakeFS server exception to a Python builtin exception.

For some subclasses of lakefs.exceptions.ServerException, a direct Python builtin equivalent exists. In these cases, the suitable equivalent is returned. All other classes are converted to a standard IOError.
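
A short usage sketch of the translation pattern (the file system's wrapped_api_call helper, shown later in this reference, uses the same approach):

from lakefs.exceptions import ServerException\n\nfrom lakefs_spec.errors import translate_lakefs_error\n\ntry:\n    ...  # some lakeFS SDK call\nexcept ServerException as e:\n    raise translate_lakefs_error(e, rpath=\"my-repo/main/file.txt\")\n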

PARAMETER DESCRIPTION error

The exception returned by the lakeFS SDK wrapper.

TYPE: ServerException

rpath

The remote resource path involved in the error.

TYPE: str | None DEFAULT: None

message

An error message to use for the returned exception. If not given, the error message returned by the lakeFS server is used instead.

TYPE: str | None DEFAULT: None

set_cause

Whether to set the __cause__ attribute to the previous exception if the exception is translated.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION OSError

A builtin Python exception ready to be thrown.

Source code in src/lakefs_spec/errors.py
def translate_lakefs_error(\n    error: ServerException,\n    rpath: str | None = None,\n    message: str | None = None,\n    set_cause: bool = True,\n) -> OSError:\n    \"\"\"\n    Convert a lakeFS server exception to a Python builtin exception.\n\n    For some subclasses of ``lakefs.exceptions.ServerException``, a direct Python builtin equivalent exists.\n    In these cases, the suitable equivalent is returned. All other classes are converted to a standard ``IOError``.\n\n    Parameters\n    ----------\n    error: ServerException\n        The exception returned by the lakeFS SDK wrapper.\n    rpath: str | None\n        The remote resource path involved in the error.\n    message: str | None\n        An error message to use for the returned exception.\n         If not given, the error message returned by the lakeFS server is used instead.\n    set_cause: bool\n        Whether to set the ``__cause__`` attribute to the previous exception if the exception is translated.\n\n    Returns\n    -------\n    OSError\n        A builtin Python exception ready to be thrown.\n    \"\"\"\n    status = error.status_code\n\n    if hasattr(error, \"body\"):\n        # error has a JSON response body attached\n        reason = error.body[\"message\"]\n    else:\n        reason = error.reason\n\n    emsg = f\"{status} {reason}\"\n    if rpath:\n        emsg += f\": {rpath!r}\"\n\n    constructor = HTTP_CODE_TO_ERROR.get(status, partial(IOError, errno.EIO))\n    custom_exc = constructor(message or emsg)\n\n    if set_cause:\n        custom_exc.__cause__ = error\n    return custom_exc\n
"},{"location":"reference/lakefs_spec/spec/","title":"spec","text":"

Core interface definitions for file system interaction with lakeFS from Python, namely the LakeFSFileSystem and LakeFSFile classes.

"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem","title":"LakeFSFileSystem","text":"

Bases: AbstractFileSystem

lakeFS file system implementation.

Instances of this class are cached based on their constructor arguments.

For more information, see the fsspec documentation https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching.

PARAMETER DESCRIPTION host

The address of your lakeFS instance.

TYPE: str | None DEFAULT: None

username

The access key name to use in case of access key authentication.

TYPE: str | None DEFAULT: None

password

The access key secret to use in case of access key authentication.

TYPE: str | None DEFAULT: None

api_key

The API key to use in case of authentication with an API key.

TYPE: str | None DEFAULT: None

api_key_prefix

A string prefix to use for the API key in authentication.

TYPE: str | None DEFAULT: None

access_token

An access token to use in case of access token authentication.

TYPE: str | None DEFAULT: None

verify_ssl

Whether to verify SSL certificates in API interactions. Do not disable in production.

TYPE: bool DEFAULT: True

ssl_ca_cert

A custom certificate PEM file to use to verify the peer in SSL connections.

TYPE: str | None DEFAULT: None

proxy

Proxy address to use when connecting to a lakeFS server.

TYPE: str | None DEFAULT: None

create_branch_ok

Whether to create branches implicitly when not-existing branches are referenced on file uploads.

TYPE: bool DEFAULT: True

source_branch

Source branch set as origin when a new branch is implicitly created.

TYPE: str DEFAULT: 'main'

**storage_options

Configuration options to pass to the file system's directory cache.

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
class LakeFSFileSystem(AbstractFileSystem):\n    \"\"\"\n    lakeFS file system implementation.\n\n    Instances of this class are cached based on their constructor arguments.\n\n    For more information, see the fsspec documentation <https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching>.\n\n    Parameters\n    ----------\n    host: str | None\n        The address of your lakeFS instance.\n    username: str | None\n        The access key name to use in case of access key authentication.\n    password: str | None\n        The access key secret to use in case of access key authentication.\n    api_key: str | None\n        The API key to use in case of authentication with an API key.\n    api_key_prefix: str | None\n        A string prefix to use for the API key in authentication.\n    access_token: str | None\n        An access token to use in case of access token authentication.\n    verify_ssl: bool\n        Whether to verify SSL certificates in API interactions. Do not disable in production.\n    ssl_ca_cert: str | None\n        A custom certificate PEM file to use to verify the peer in SSL connections.\n    proxy: str | None\n        Proxy address to use when connecting to a lakeFS server.\n    create_branch_ok: bool\n        Whether to create branches implicitly when not-existing branches are referenced on file uploads.\n    source_branch: str\n        Source branch set as origin when a new branch is implicitly created.\n    **storage_options: Any\n        Configuration options to pass to the file system's directory cache.\n    \"\"\"\n\n    protocol = \"lakefs\"\n\n    def __init__(\n        self,\n        host: str | None = None,\n        username: str | None = None,\n        password: str | None = None,\n        api_key: str | None = None,\n        api_key_prefix: str | None = None,\n        access_token: str | None = None,\n        verify_ssl: bool = True,\n        ssl_ca_cert: str | None = None,\n        proxy: str | None = None,\n        create_branch_ok: bool = True,\n        source_branch: str = \"main\",\n        **storage_options: Any,\n    ):\n        super().__init__(**storage_options)\n\n        # lakeFS client arguments\n        cargs = [host, username, password, api_key, api_key_prefix, access_token, ssl_ca_cert]\n\n        if all(arg is None for arg in cargs):\n            # empty kwargs means envvar and configfile autodiscovery\n            self.client = Client()\n        else:\n            self.client = Client(\n                host=host,\n                username=username,\n                password=password,\n                api_key=api_key,\n                api_key_prefix=api_key_prefix,\n                access_token=access_token,\n                ssl_ca_cert=ssl_ca_cert,\n            )\n\n        # proxy address, not part of the constructor\n        self.client.config.proxy = proxy\n        # whether to verify SSL certs, not part of the constructor\n        self.client.config.verify_ssl = verify_ssl\n\n        self.create_branch_ok = create_branch_ok\n        self.source_branch = source_branch\n\n    @cached_property\n    def _lakefs_server_version(self):\n        with self.wrapped_api_call():\n            return tuple(int(t) for t in self.client.version.split(\".\"))\n\n    @classmethod\n    @overload\n    def _strip_protocol(cls, path: str | os.PathLike[str] | Path) -> str:\n        ...\n\n    @classmethod\n    @overload\n    def _strip_protocol(cls, path: list[str | os.PathLike[str] | Path]) -> list[str]:\n        ...\n\n    
@classmethod\n    def _strip_protocol(cls, path):\n        \"\"\"Copied verbatim from the base class, save for the slash rstrip.\"\"\"\n        if isinstance(path, list):\n            return [cls._strip_protocol(p) for p in path]\n        spath = super()._strip_protocol(path)\n        if stringify_path(path).endswith(\"/\"):\n            return spath + \"/\"\n        return spath\n\n    @property\n    def transaction(self) -> LakeFSTransaction:\n        \"\"\"\n        A context manager within which file uploads and versioning operations are deferred to a\n        queue, and carried out during when exiting the context.\n\n        Requires the file class to implement ``.commit()`` and ``.discard()`` for the normal and exception cases.\n        \"\"\"\n        self._transaction: LakeFSTransaction | None\n        if self._transaction is None:\n            self._transaction = LakeFSTransaction(self)\n        return self._transaction\n\n    def start_transaction(self):\n        raise NotImplementedError(\n            \"lakeFS transactions should only be used as a context manager via\"\n            \" `with LakeFSFileSystem.transaction as tx:`\"\n        )\n\n    @contextmanager\n    def wrapped_api_call(\n        self, rpath: str | None = None, message: str | None = None, set_cause: bool = True\n    ) -> Generator[None, None, None]:\n        \"\"\"\n        A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.\n\n        Meant for internal use.\n\n        Parameters\n        ----------\n        rpath: str | None\n            The remote path involved in the requested API call.\n        message: str | None\n            A custom error message to emit instead of parsing the API error response.\n        set_cause: bool\n            Whether to include the original lakeFS API error in the resulting traceback.\n\n        Yields\n        ------\n        None\n            An empty generator, to be used as a context manager.\n\n        Raises\n        ------\n        OSError\n            Translated error from the lakeFS API call, if any.\n        \"\"\"\n        try:\n            yield\n        except ServerException as e:\n            raise translate_lakefs_error(e, rpath=rpath, message=message, set_cause=set_cause)\n\n    def checksum(self, path: str | os.PathLike[str]) -> str | None:\n        \"\"\"\n        Get a remote lakeFS file object's checksum.\n\n        This is usually its MD5 hash, unless another hash function was used on upload.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path to look up the lakeFS checksum for. Must point to a single file object.\n\n        Returns\n        -------\n        str | None\n            The remote file's checksum, or ``None`` if ``path`` points to a directory or does not exist.\n        \"\"\"\n        path = stringify_path(path)\n        try:\n            return self.info(path).get(\"checksum\")\n        except FileNotFoundError:\n            return None\n\n    def exists(self, path: str | os.PathLike[str], **kwargs: Any) -> bool:\n        \"\"\"\n        Check existence of a remote path in a lakeFS repository.\n\n        Input paths can either be files or directories.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path whose existence to check. 
Must be a fully qualified lakeFS URI.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Returns\n        -------\n        bool\n            ``True`` if the requested path exists, ``False`` if it does not.\n\n        Raises\n        ------\n        PermissionError\n            If the user does not have sufficient permissions to query object existence.\n        \"\"\"\n        path = stringify_path(path)\n        repository, ref, resource = parse(path)\n        try:\n            reference = lakefs.Reference(repository, ref, client=self.client)\n            return reference.object(resource).exists()\n        except ServerException as e:\n            # in case of an error other than \"not found\", existence cannot be\n            # decided, so raise the translated error.\n            raise translate_lakefs_error(e)\n\n    def cp_file(\n        self, path1: str | os.PathLike[str], path2: str | os.PathLike[str], **kwargs: Any\n    ) -> None:\n        \"\"\"\n        Copy a single file from one remote location to another in lakeFS.\n\n        Parameters\n        ----------\n        path1: str | os.PathLike[str]\n            The remote file location to be copied.\n        path2: str | os.PathLike[str]\n            The (remote) target location to which to copy the file.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Raises\n        ------\n        ValueError\n            When attempting to copy objects between repositories.\n        \"\"\"\n        path1 = stringify_path(path1)\n        path2 = stringify_path(path2)\n        if path1 == path2:\n            return\n\n        orig_repo, orig_ref, orig_path = parse(path1)\n        dest_repo, dest_ref, dest_path = parse(path2)\n\n        if orig_repo != dest_repo:\n            raise ValueError(\n                \"can only copy objects within a repository, but got source \"\n                f\"repository {orig_repo!r} and destination repository {dest_repo!r}\"\n            )\n\n        with self.wrapped_api_call():\n            reference = lakefs.Reference(orig_repo, orig_ref, client=self.client)\n            reference.object(orig_path).copy(dest_ref, dest_path)\n\n    def get_file(\n        self,\n        rpath: str | os.PathLike[str],\n        lpath: str | os.PathLike[str],\n        callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n        outfile: Any = None,\n        precheck: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"\n        Download a single file from a remote lakeFS server to local storage.\n\n        Parameters\n        ----------\n        rpath: str | os.PathLike[str]\n            The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.\n        lpath: str | os.PathLike[str]\n            The local path on disk to save the downloaded file to.\n        callback: fsspec.callbacks.Callback\n            An fsspec callback to use during the operation. Can be used to report download progress.\n        outfile: Any\n            A file-like object to save the downloaded content to. 
Can be used in place of ``lpath``.\n        precheck: bool\n            Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n        **kwargs: Any\n            Additional keyword arguments passed to ``AbstractFileSystem.open()``.\n        \"\"\"\n        rpath = stringify_path(rpath)\n        lpath = stringify_path(lpath)\n\n        if precheck and Path(lpath).is_file():\n            local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n            remote_checksum = self.info(rpath).get(\"checksum\")\n            if local_checksum == remote_checksum:\n                logger.info(\n                    f\"Skipping download of resource {rpath!r} to local path {lpath!r}: \"\n                    f\"Resource {lpath!r} exists and checksums match.\"\n                )\n                return\n\n        with self.wrapped_api_call(rpath=rpath):\n            super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs)\n\n    def info(self, path: str | os.PathLike[str], **kwargs: Any) -> dict[str, Any]:\n        \"\"\"\n        Query a remote lakeFS object's metadata.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.ls()`` if ``path`` points to a directory.\n\n        Returns\n        -------\n        dict[str, Any]\n            A dictionary containing metadata on the object, including its full remote path and object type (file or directory).\n\n        Raises\n        ------\n        FileNotFoundError\n            If the ``path`` refers to a non-file path that does not exist in the repository.\n        \"\"\"\n        path = stringify_path(path)\n        repository, ref, resource = parse(path)\n        # first, try with `stat_object` in case of a file.\n        # the condition below checks edge cases of resources that cannot be files.\n        if resource and not resource.endswith(\"/\"):\n            try:\n                reference = lakefs.Reference(repository, ref, client=self.client)\n                res = reference.object(resource).stat()\n                return {\n                    \"checksum\": res.checksum,\n                    \"content-type\": res.content_type,\n                    \"mtime\": res.mtime,\n                    \"name\": f\"{repository}/{ref}/{res.path}\",\n                    \"size\": res.size_bytes,\n                    \"type\": \"file\",\n                }\n            except NotFoundException:\n                # fall through, retry with `ls` if it's a directory.\n                pass\n            except ServerException as e:\n                raise translate_lakefs_error(e, rpath=path)\n\n        out = self.ls(path, detail=True, recursive=True, **kwargs)\n        if not out:\n            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)\n\n        return {\n            \"name\": path.rstrip(\"/\"),\n            \"size\": sum(o.get(\"size\") or 0 for o in out),\n            \"type\": \"directory\",\n        }\n\n    def _update_dircache(self, info: list) -> None:\n        \"\"\"Update logic for dircache (optionally recursive) based on lakeFS API response\"\"\"\n        parents = {self._parent(i[\"name\"].rstrip(\"/\")) for i in info}\n        for pp in parents:\n            # subset of info entries 
which are direct descendants of `parent`\n            dir_info = [i for i in info if self._parent(i[\"name\"].rstrip(\"/\")) == pp]\n            if pp not in self.dircache:\n                self.dircache[pp] = dir_info\n                continue\n\n            # Merge existing dircache entry with updated listing, which contains either:\n            # - files not present in the cache yet\n            # - a fresh listing (if `refresh=True`)\n\n            cache_entry = self.dircache[pp][:]\n\n            old_names = {e[\"name\"] for e in cache_entry}\n            new_names = {e[\"name\"] for e in dir_info}\n\n            to_remove = old_names - new_names\n            to_update = old_names.intersection(new_names)\n\n            # Remove all entries no longer present in the current listing\n            cache_entry = [e for e in cache_entry if e[\"name\"] not in to_remove]\n\n            # Overwrite existing entries in the cache with its updated values\n            for name in to_update:\n                old_idx = next(idx for idx, e in enumerate(cache_entry) if e[\"name\"] == name)\n                new_entry = next(e for e in info if e[\"name\"] == name)\n\n                cache_entry[old_idx] = new_entry\n                dir_info.remove(new_entry)\n\n            # Add the remaining (new) entries to the cache\n            cache_entry.extend(dir_info)\n            self.dircache[pp] = sorted(cache_entry, key=operator.itemgetter(\"name\"))\n\n    def _ls_from_cache(self, path: str, recursive: bool = False) -> list[dict[str, Any]] | None:\n        \"\"\"Override of ``AbstractFileSystem._ls_from_cache`` with support for recursive listings.\"\"\"\n        if not recursive:\n            return super()._ls_from_cache(path)\n\n        result = None\n        for key, files in self.dircache.items():\n            if not (key.startswith(path) or path == key + \"/\"):\n                continue\n            if result is None:\n                result = []\n            result.extend(files)\n        if not result:\n            return result\n        return sorted(result, key=operator.itemgetter(\"name\"))\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: Literal[True] = ...,\n        **kwargs: Any,\n    ) -> list[dict[str, Any]]:\n        ...\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: Literal[False],\n        **kwargs: Any,\n    ) -> list[str]:\n        ...\n\n    @overload\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: bool = True,\n        **kwargs: Any,\n    ) -> list[str] | list[dict[str, Any]]:\n        ...\n\n    def ls(\n        self,\n        path: str | os.PathLike[str],\n        detail: bool = True,\n        **kwargs: Any,\n    ) -> list[str] | list[dict[str, Any]]:\n        \"\"\"\n        List all available objects under a given path in lakeFS.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The path under which to list objects. 
Must be a fully qualified lakeFS URI.\n            Can also point to a file, in which case the file's metadata will be returned.\n        detail: bool\n            Whether to obtain all metadata on the requested objects or just their names.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility.\n\n            In particular:\n                `refresh: bool`: whether to skip the directory listing cache,\n                `recursive: bool`: whether to list subdirectory contents recursively\n\n        Returns\n        -------\n        list[str] | list[dict[str, Any]]\n            A list of all objects' metadata under the given remote path if ``detail=True``, or alternatively only their names if ``detail=False``.\n        \"\"\"\n        path = self._strip_protocol(path)\n        repository, ref, prefix = parse(path)\n\n        recursive = kwargs.pop(\"recursive\", False)\n\n        # Try lookup in dircache unless explicitly disabled by `refresh=True` kwarg\n        use_dircache = not kwargs.pop(\"refresh\", False)\n\n        if use_dircache:\n            cache_entry: list[Any] | None = None\n            try:\n                cache_entry = self._ls_from_cache(path, recursive=recursive)\n            except FileNotFoundError:\n                # we patch files missing from an ls call in the cache entry below,\n                # so this should not be an error.\n                pass\n\n            if cache_entry is not None:\n                if not detail:\n                    return [e[\"name\"] for e in cache_entry]\n                return cache_entry[:]\n\n        kwargs[\"prefix\"] = prefix\n\n        info = []\n        # stat infos are either the path only (`detail=False`) or a dict full of metadata\n        delimiter = \"\" if recursive else \"/\"\n        reference = lakefs.Reference(repository, ref, client=self.client)\n\n        with self.wrapped_api_call(rpath=path):\n            for obj in reference.objects(prefix=prefix, delimiter=delimiter):\n                if isinstance(obj, CommonPrefix):\n                    # prefixes are added below.\n                    info.append(\n                        {\n                            \"name\": f\"{repository}/{ref}/{obj.path}\",\n                            \"size\": 0,\n                            \"type\": \"directory\",\n                        }\n                    )\n                elif isinstance(obj, ObjectInfo):\n                    info.append(\n                        {\n                            \"checksum\": obj.checksum,\n                            \"content-type\": obj.content_type,\n                            \"mtime\": obj.mtime,\n                            \"name\": f\"{repository}/{ref}/{obj.path}\",\n                            \"size\": obj.size_bytes,\n                            \"type\": \"object\",\n                        }\n                    )\n\n        # Retry the API call with appended slash if the current result\n        # is just a single directory entry only (not its contents).\n        # This is useful to allow `ls(\"repo/branch/dir\")` calls without a trailing slash.\n        if len(info) == 1 and info[0][\"type\"] == \"directory\" and info[0][\"name\"] == path + \"/\":\n            return self.ls(\n                path + \"/\",\n                detail=detail,\n                **kwargs | {\"refresh\": not use_dircache, \"recursive\": recursive},\n            )\n\n        if recursive:\n            # To make recursive ls behave identical to the non-recursive case,\n  
          # add back virtual `directory` entries, which are only returned by\n            # the lakeFS API when querying non-recursively.\n            here = self._strip_protocol(path).rstrip(\"/\")\n            subdirs = {parent for o in info if (parent := self._parent(o[\"name\"])) != here}\n            for subdir in subdirs:\n                info.append(\n                    {\n                        \"name\": subdir + \"/\",\n                        \"size\": 0,\n                        \"type\": \"directory\",\n                    }\n                )\n\n        if info:\n            self._update_dircache(info[:])\n\n        if not detail:\n            info = [o[\"name\"] for o in info]  # type: ignore\n\n        return info\n\n    def open(\n        self,\n        path: str | os.PathLike[str],\n        mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n        pre_sign: bool = False,\n        content_type: str | None = None,\n        metadata: dict[str, str] | None = None,\n        autocommit: bool = True,\n        **kwargs: Any,\n    ) -> LakeFSIOBase:\n        \"\"\"\n        Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on ``mode``.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote path for which to open a local ``LakeFSFile``. Must be a fully qualified lakeFS URI.\n        mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"]\n            The file mode indicating its purpose. Use ``r/rb`` for downloads from lakeFS, ``w/wb/x/xb`` for uploads to lakeFS.\n        pre_sign: bool\n            Whether to use a pre-signed URL for the file up-/download.\n        content_type: str | None\n            Content type to use for the file, relevant for uploads only.\n        metadata: dict[str, str] | None\n            Additional metadata to attach to the file, relevant for uploads only.\n        autocommit: bool\n            Whether to process the file immediately instead of queueing it for transaction while in a transaction context.\n        **kwargs: Any\n            Additional keyword arguments for fsspec compatibility, unused.\n\n        Returns\n        -------\n        LakeFSIOBase\n            A local file-like object ready to hold data to be received from / sent to a lakeFS server.\n\n        Raises\n        ------\n        NotImplementedError\n            If ``mode`` is not supported.\n        \"\"\"\n        if mode.endswith(\"t\"):\n            # text modes {r,w,x}t are equivalent to {r,w,x} here respectively.\n            mode = mode[:-1]  # type: ignore\n\n        if mode not in {\"r\", \"rb\", \"w\", \"wb\", \"x\", \"xb\"}:\n            raise NotImplementedError(f\"unsupported mode {mode!r}\")\n\n        path = stringify_path(path)\n        repo, ref, resource = parse(path)\n\n        if mode.startswith(\"r\"):\n            reference = lakefs.Reference(repo, ref, client=self.client)\n            obj = reference.object(resource)\n\n            if not obj.exists():\n                raise FileNotFoundError(path)\n            handler = ObjectReader(obj, mode=mode, pre_sign=pre_sign, client=self.client)\n        else:\n            # for writing ops, ref must be a branch\n            branch = lakefs.Branch(repo, ref, client=self.client)\n            if self.create_branch_ok:\n                branch.create(self.source_branch, exist_ok=True)\n\n            obj = branch.object(resource)\n         
   handler = ObjectWriter(\n                obj,\n                mode=mode,\n                pre_sign=pre_sign,\n                content_type=content_type,\n                metadata=metadata,\n                client=self.client,\n            )\n\n        ac = kwargs.pop(\"autocommit\", not self._intrans)\n        if not ac and \"r\" not in mode:\n            self._transaction.files.append(handler)\n\n        return handler\n\n    def put_file(\n        self,\n        lpath: str | os.PathLike[str],\n        rpath: str | os.PathLike[str],\n        callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n        precheck: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"\n        Upload a local file to a remote location on a lakeFS server.\n\n        Note that depending on the block store type, additional configuration like credentials may need to be configured when ``use_blockstore=True`` and ``presign=False``.\n\n        Parameters\n        ----------\n        lpath: str | os.PathLike[str]\n            The local path on disk to upload to the lakeFS server.\n        rpath: str | os.PathLike[str]\n            The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.\n        callback: fsspec.callbacks.Callback\n            An fsspec callback to use during the operation. Can be used to report download progress.\n        precheck: bool\n            Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n        \"\"\"\n        lpath = stringify_path(lpath)\n        rpath = stringify_path(rpath)\n\n        if precheck and Path(lpath).is_file():\n            remote_checksum = self.checksum(rpath)\n            local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n            if local_checksum == remote_checksum:\n                logger.info(\n                    f\"Skipping upload of resource {lpath!r} to remote path {rpath!r}: \"\n                    f\"Resource {rpath!r} exists and checksums match.\"\n                )\n                return\n\n        with self.wrapped_api_call(rpath=rpath):\n            super().put_file(lpath, rpath, callback=callback, **kwargs)\n\n    def rm_file(self, path: str | os.PathLike[str]) -> None:  # pragma: no cover\n        \"\"\"\n        Stage a remote file for removal on a lakeFS server.\n\n        The file will not actually be removed from the requested branch until a commit is created.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The remote file to delete. 
Must be a fully qualified lakeFS URI.\n        \"\"\"\n        self.rm(path)\n\n    def rm(\n        self, path: str | os.PathLike[str], recursive: bool = False, maxdepth: int | None = None\n    ) -> None:\n        \"\"\"\n        Stage multiple remote files for removal on a lakeFS server.\n\n        The files will not actually be removed from the requested branch until a commit is created.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            File(s) to delete.\n        recursive: bool\n            If file(s) include nested directories, recursively delete their contents.\n        maxdepth: int | None\n            Depth to pass to walk for finding files to delete, if recursive.\n            If None, there will be no limit and infinite recursion may be\n            possible.\n        \"\"\"\n\n        path = stringify_path(path)\n        repository, ref, prefix = parse(path)\n\n        with self.wrapped_api_call(rpath=path):\n            branch = lakefs.Branch(repository, ref, client=self.client)\n            objgen = branch.objects(prefix=prefix, delimiter=\"\" if recursive else \"/\")\n            if maxdepth is None:\n                branch.delete_objects(obj.path for obj in objgen)\n            else:\n                # nesting level is just the amount of \"/\"s in the path, no leading \"/\".\n                branch.delete_objects(obj.path for obj in objgen if obj.path.count(\"/\") <= maxdepth)\n\n            # Directory listing cache for the containing folder must be invalidated\n            self.dircache.pop(self._parent(path), None)\n\n    def touch(self, path: str | os.PathLike[str], truncate: bool = True, **kwargs: Any) -> None:\n        \"\"\"\n        Create an empty file or update an existing file on a lakeFS server.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The file path to create or update. Must be a fully qualified lakeFS URI.\n        truncate: bool\n            Whether to set the file size to 0 (zero) bytes, even if the path already exists.\n        **kwargs: Any\n            Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n\n        Raises\n        ------\n        NotImplementedError\n            If the targeted lakeFS server version does not support `touch()` operations.\n        \"\"\"\n\n        # empty buffer upload errors were fixed in https://github.com/treeverse/lakeFS/issues/7130,\n        # which was first released in lakeFS v1.3.1.\n        if self._lakefs_server_version < (1, 3, 1):\n            version_string = \".\".join(str(v) for v in self._lakefs_server_version)\n            raise NotImplementedError(\n                \"LakeFSFileSystem.touch() is not supported for your lakeFS server version. \"\n                f\"minimum required version: '1.3.1', actual version: {version_string!r}\"\n            )\n\n        super().touch(path=path, truncate=truncate, **kwargs)\n\n    def tail(self, path: str | os.PathLike[str], size: int = 1024) -> bytes:\n        \"\"\"\n        Get the last ``size`` bytes from a remote file.\n\n        Parameters\n        ----------\n        path: str | os.PathLike[str]\n            The file path to read. 
Must be a fully qualified lakeFS URI.\n        size: int\n            The amount of bytes to get.\n\n        Returns\n        -------\n        bytes\n            The bytes at the end of the requested file.\n        \"\"\"\n        f: ObjectReader\n        with self.open(path, \"rb\") as f:\n            f.seek(max(-size, -f._obj.stat().size_bytes), 2)\n            return f.read()\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.transaction","title":"transaction property","text":"
transaction: LakeFSTransaction\n

A context manager within which file uploads and versioning operations are deferred to a queue, and carried out when exiting the context.

Requires the file class to implement .commit() and .discard() for the normal and exception cases.
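A minimal usage sketch of the transaction context manager; the repository, branch, and file names below are placeholders:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Uploads and versioning operations inside the block run on an ephemeral transaction branch.\nwith fs.transaction(\"my-repo\", \"main\") as tx:\n    fs.put(\"data.csv\", f\"my-repo/{tx.branch.id}/data.csv\")\n    tx.commit(message=\"Add data.csv\")\n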

"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.wrapped_api_call","title":"wrapped_api_call","text":"
wrapped_api_call(\n    rpath: str | None = None, message: str | None = None, set_cause: bool = True\n) -> Generator[None, None, None]\n

A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.

Meant for internal use.

PARAMETER DESCRIPTION rpath

The remote path involved in the requested API call.

TYPE: str | None DEFAULT: None

message

A custom error message to emit instead of parsing the API error response.

TYPE: str | None DEFAULT: None

set_cause

Whether to include the original lakeFS API error in the resulting traceback.

TYPE: bool DEFAULT: True

YIELDS DESCRIPTION None

An empty generator, to be used as a context manager.

RAISES DESCRIPTION OSError

Translated error from the lakeFS API call, if any.

Source code in src/lakefs_spec/spec.py
@contextmanager\ndef wrapped_api_call(\n    self, rpath: str | None = None, message: str | None = None, set_cause: bool = True\n) -> Generator[None, None, None]:\n    \"\"\"\n    A context manager to wrap lakeFS API calls, translating any API errors to Python-native OS errors.\n\n    Meant for internal use.\n\n    Parameters\n    ----------\n    rpath: str | None\n        The remote path involved in the requested API call.\n    message: str | None\n        A custom error message to emit instead of parsing the API error response.\n    set_cause: bool\n        Whether to include the original lakeFS API error in the resulting traceback.\n\n    Yields\n    ------\n    None\n        An empty generator, to be used as a context manager.\n\n    Raises\n    ------\n    OSError\n        Translated error from the lakeFS API call, if any.\n    \"\"\"\n    try:\n        yield\n    except ServerException as e:\n        raise translate_lakefs_error(e, rpath=rpath, message=message, set_cause=set_cause)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.checksum","title":"checksum","text":"
checksum(path: str | PathLike[str]) -> str | None\n

Get a remote lakeFS file object's checksum.

This is usually its MD5 hash, unless another hash function was used on upload.

PARAMETER DESCRIPTION path

The remote path to look up the lakeFS checksum for. Must point to a single file object.

TYPE: str | PathLike[str]

RETURNS DESCRIPTION str | None

The remote file's checksum, or None if path points to a directory or does not exist.

Source code in src/lakefs_spec/spec.py
def checksum(self, path: str | os.PathLike[str]) -> str | None:\n    \"\"\"\n    Get a remote lakeFS file object's checksum.\n\n    This is usually its MD5 hash, unless another hash function was used on upload.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path to look up the lakeFS checksum for. Must point to a single file object.\n\n    Returns\n    -------\n    str | None\n        The remote file's checksum, or ``None`` if ``path`` points to a directory or does not exist.\n    \"\"\"\n    path = stringify_path(path)\n    try:\n        return self.info(path).get(\"checksum\")\n    except FileNotFoundError:\n        return None\n
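For example (repository and object names are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# An MD5 hex digest for existing objects, or None if the path is missing or a directory.\nprint(fs.checksum(\"my-repo/main/data/weather-2010.json\"))\n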
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.exists","title":"exists","text":"
exists(path: str | PathLike[str], **kwargs: Any) -> bool\n

Check existence of a remote path in a lakeFS repository.

Input paths can either be files or directories.

PARAMETER DESCRIPTION path

The remote path whose existence to check. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION bool

True if the requested path exists, False if it does not.

RAISES DESCRIPTION PermissionError

If the user does not have sufficient permissions to query object existence.

Source code in src/lakefs_spec/spec.py
def exists(self, path: str | os.PathLike[str], **kwargs: Any) -> bool:\n    \"\"\"\n    Check existence of a remote path in a lakeFS repository.\n\n    Input paths can either be files or directories.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path whose existence to check. Must be a fully qualified lakeFS URI.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Returns\n    -------\n    bool\n        ``True`` if the requested path exists, ``False`` if it does not.\n\n    Raises\n    ------\n    PermissionError\n        If the user does not have sufficient permissions to query object existence.\n    \"\"\"\n    path = stringify_path(path)\n    repository, ref, resource = parse(path)\n    try:\n        reference = lakefs.Reference(repository, ref, client=self.client)\n        return reference.object(resource).exists()\n    except ServerException as e:\n        # in case of an error other than \"not found\", existence cannot be\n        # decided, so raise the translated error.\n        raise translate_lakefs_error(e)\n
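A short sketch, using placeholder names:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Works for both single objects and directory-like prefixes.\nif fs.exists(\"my-repo/main/data/weather-2010.json\"):\n    print(\"object present on main\")\n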
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.cp_file","title":"cp_file","text":"
cp_file(path1: str | PathLike[str], path2: str | PathLike[str], **kwargs: Any) -> None\n

Copy a single file from one remote location to another in lakeFS.

PARAMETER DESCRIPTION path1

The remote file location to be copied.

TYPE: str | PathLike[str]

path2

The (remote) target location to which to copy the file.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RAISES DESCRIPTION ValueError

When attempting to copy objects between repositories.

Source code in src/lakefs_spec/spec.py
def cp_file(\n    self, path1: str | os.PathLike[str], path2: str | os.PathLike[str], **kwargs: Any\n) -> None:\n    \"\"\"\n    Copy a single file from one remote location to another in lakeFS.\n\n    Parameters\n    ----------\n    path1: str | os.PathLike[str]\n        The remote file location to be copied.\n    path2: str | os.PathLike[str]\n        The (remote) target location to which to copy the file.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Raises\n    ------\n    ValueError\n        When attempting to copy objects between repositories.\n    \"\"\"\n    path1 = stringify_path(path1)\n    path2 = stringify_path(path2)\n    if path1 == path2:\n        return\n\n    orig_repo, orig_ref, orig_path = parse(path1)\n    dest_repo, dest_ref, dest_path = parse(path2)\n\n    if orig_repo != dest_repo:\n        raise ValueError(\n            \"can only copy objects within a repository, but got source \"\n            f\"repository {orig_repo!r} and destination repository {dest_repo!r}\"\n        )\n\n    with self.wrapped_api_call():\n        reference = lakefs.Reference(orig_repo, orig_ref, client=self.client)\n        reference.object(orig_path).copy(dest_ref, dest_path)\n
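For example, copying an object between two branches of the same repository (all names are placeholders):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Source and target must live in the same repository; only the ref and path may differ.\nfs.cp_file(\"my-repo/main/data.csv\", \"my-repo/dev/data.csv\")\n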
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.get_file","title":"get_file","text":"
get_file(\n    rpath: str | PathLike[str],\n    lpath: str | PathLike[str],\n    callback: Callback = _DEFAULT_CALLBACK,\n    outfile: Any = None,\n    precheck: bool = True,\n    **kwargs: Any\n) -> None\n

Download a single file from a remote lakeFS server to local storage.

PARAMETER DESCRIPTION rpath

The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.

TYPE: str | PathLike[str]

lpath

The local path on disk to save the downloaded file to.

TYPE: str | PathLike[str]

callback

An fsspec callback to use during the operation. Can be used to report download progress.

TYPE: Callback DEFAULT: _DEFAULT_CALLBACK

outfile

A file-like object to save the downloaded content to. Can be used in place of lpath.

TYPE: Any DEFAULT: None

precheck

Check if lpath already exists and compare its checksum with that of rpath, skipping the download if they match.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments passed to AbstractFileSystem.open().

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
def get_file(\n    self,\n    rpath: str | os.PathLike[str],\n    lpath: str | os.PathLike[str],\n    callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n    outfile: Any = None,\n    precheck: bool = True,\n    **kwargs: Any,\n) -> None:\n    \"\"\"\n    Download a single file from a remote lakeFS server to local storage.\n\n    Parameters\n    ----------\n    rpath: str | os.PathLike[str]\n        The remote path to download to local storage. Must be a fully qualified lakeFS URI, and point to a single file.\n    lpath: str | os.PathLike[str]\n        The local path on disk to save the downloaded file to.\n    callback: fsspec.callbacks.Callback\n        An fsspec callback to use during the operation. Can be used to report download progress.\n    outfile: Any\n        A file-like object to save the downloaded content to. Can be used in place of ``lpath``.\n    precheck: bool\n        Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n    **kwargs: Any\n        Additional keyword arguments passed to ``AbstractFileSystem.open()``.\n    \"\"\"\n    rpath = stringify_path(rpath)\n    lpath = stringify_path(lpath)\n\n    if precheck and Path(lpath).is_file():\n        local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n        remote_checksum = self.info(rpath).get(\"checksum\")\n        if local_checksum == remote_checksum:\n            logger.info(\n                f\"Skipping download of resource {rpath!r} to local path {lpath!r}: \"\n                f\"Resource {lpath!r} exists and checksums match.\"\n            )\n            return\n\n    with self.wrapped_api_call(rpath=rpath):\n        super().get_file(rpath, lpath, callback=callback, outfile=outfile, **kwargs)\n
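A minimal sketch with placeholder paths:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# With precheck=True (the default), the transfer is skipped if a local copy with a matching checksum exists.\nfs.get_file(\"my-repo/main/data.csv\", \"data.csv\")\n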
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.info","title":"info","text":"
info(path: str | PathLike[str], **kwargs: Any) -> dict[str, Any]\n

Query a remote lakeFS object's metadata.

PARAMETER DESCRIPTION path

The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.

TYPE: str | PathLike[str]

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.ls() if path points to a directory.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION dict[str, Any]

A dictionary containing metadata on the object, including its full remote path and object type (file or directory).

RAISES DESCRIPTION FileNotFoundError

If the path refers to a non-file path that does not exist in the repository.

Source code in src/lakefs_spec/spec.py
def info(self, path: str | os.PathLike[str], **kwargs: Any) -> dict[str, Any]:\n    \"\"\"\n    Query a remote lakeFS object's metadata.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The object for which to obtain metadata. Must be a fully qualified lakeFS URI, can either point to a file or a directory.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.ls()`` if ``path`` points to a directory.\n\n    Returns\n    -------\n    dict[str, Any]\n        A dictionary containing metadata on the object, including its full remote path and object type (file or directory).\n\n    Raises\n    ------\n    FileNotFoundError\n        If the ``path`` refers to a non-file path that does not exist in the repository.\n    \"\"\"\n    path = stringify_path(path)\n    repository, ref, resource = parse(path)\n    # first, try with `stat_object` in case of a file.\n    # the condition below checks edge cases of resources that cannot be files.\n    if resource and not resource.endswith(\"/\"):\n        try:\n            reference = lakefs.Reference(repository, ref, client=self.client)\n            res = reference.object(resource).stat()\n            return {\n                \"checksum\": res.checksum,\n                \"content-type\": res.content_type,\n                \"mtime\": res.mtime,\n                \"name\": f\"{repository}/{ref}/{res.path}\",\n                \"size\": res.size_bytes,\n                \"type\": \"file\",\n            }\n        except NotFoundException:\n            # fall through, retry with `ls` if it's a directory.\n            pass\n        except ServerException as e:\n            raise translate_lakefs_error(e, rpath=path)\n\n    out = self.ls(path, detail=True, recursive=True, **kwargs)\n    if not out:\n        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)\n\n    return {\n        \"name\": path.rstrip(\"/\"),\n        \"size\": sum(o.get(\"size\") or 0 for o in out),\n        \"type\": \"directory\",\n    }\n
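For example (placeholder names):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nmeta = fs.info(\"my-repo/main/data.csv\")\nprint(meta[\"type\"], meta[\"size\"])  # e.g. 'file' and the object size in bytes\n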
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.ls","title":"ls","text":"
ls(\n    path: str | PathLike[str], detail: bool = True, **kwargs: Any\n) -> list[str] | list[dict[str, Any]]\n

List all available objects under a given path in lakeFS.

PARAMETER DESCRIPTION path

The path under which to list objects. Must be a fully qualified lakeFS URI. Can also point to a file, in which case the file's metadata will be returned.

TYPE: str | PathLike[str]

detail

Whether to obtain all metadata on the requested objects or just their names.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments for fsspec compatibility.

In particular: refresh (bool): whether to skip the directory listing cache; recursive (bool): whether to list subdirectory contents recursively.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION list[str] | list[dict[str, Any]]

A list of all objects' metadata under the given remote path if detail=True, or alternatively only their names if detail=False.

Source code in src/lakefs_spec/spec.py
def ls(\n    self,\n    path: str | os.PathLike[str],\n    detail: bool = True,\n    **kwargs: Any,\n) -> list[str] | list[dict[str, Any]]:\n    \"\"\"\n    List all available objects under a given path in lakeFS.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The path under which to list objects. Must be a fully qualified lakeFS URI.\n        Can also point to a file, in which case the file's metadata will be returned.\n    detail: bool\n        Whether to obtain all metadata on the requested objects or just their names.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility.\n\n        In particular:\n            `refresh: bool`: whether to skip the directory listing cache,\n            `recursive: bool`: whether to list subdirectory contents recursively\n\n    Returns\n    -------\n    list[str] | list[dict[str, Any]]\n        A list of all objects' metadata under the given remote path if ``detail=True``, or alternatively only their names if ``detail=False``.\n    \"\"\"\n    path = self._strip_protocol(path)\n    repository, ref, prefix = parse(path)\n\n    recursive = kwargs.pop(\"recursive\", False)\n\n    # Try lookup in dircache unless explicitly disabled by `refresh=True` kwarg\n    use_dircache = not kwargs.pop(\"refresh\", False)\n\n    if use_dircache:\n        cache_entry: list[Any] | None = None\n        try:\n            cache_entry = self._ls_from_cache(path, recursive=recursive)\n        except FileNotFoundError:\n            # we patch files missing from an ls call in the cache entry below,\n            # so this should not be an error.\n            pass\n\n        if cache_entry is not None:\n            if not detail:\n                return [e[\"name\"] for e in cache_entry]\n            return cache_entry[:]\n\n    kwargs[\"prefix\"] = prefix\n\n    info = []\n    # stat infos are either the path only (`detail=False`) or a dict full of metadata\n    delimiter = \"\" if recursive else \"/\"\n    reference = lakefs.Reference(repository, ref, client=self.client)\n\n    with self.wrapped_api_call(rpath=path):\n        for obj in reference.objects(prefix=prefix, delimiter=delimiter):\n            if isinstance(obj, CommonPrefix):\n                # prefixes are added below.\n                info.append(\n                    {\n                        \"name\": f\"{repository}/{ref}/{obj.path}\",\n                        \"size\": 0,\n                        \"type\": \"directory\",\n                    }\n                )\n            elif isinstance(obj, ObjectInfo):\n                info.append(\n                    {\n                        \"checksum\": obj.checksum,\n                        \"content-type\": obj.content_type,\n                        \"mtime\": obj.mtime,\n                        \"name\": f\"{repository}/{ref}/{obj.path}\",\n                        \"size\": obj.size_bytes,\n                        \"type\": \"object\",\n                    }\n                )\n\n    # Retry the API call with appended slash if the current result\n    # is just a single directory entry only (not its contents).\n    # This is useful to allow `ls(\"repo/branch/dir\")` calls without a trailing slash.\n    if len(info) == 1 and info[0][\"type\"] == \"directory\" and info[0][\"name\"] == path + \"/\":\n        return self.ls(\n            path + \"/\",\n            detail=detail,\n            **kwargs | {\"refresh\": not use_dircache, \"recursive\": recursive},\n        )\n\n    if recursive:\n        # To make 
recursive ls behave identical to the non-recursive case,\n        # add back virtual `directory` entries, which are only returned by\n        # the lakeFS API when querying non-recursively.\n        here = self._strip_protocol(path).rstrip(\"/\")\n        subdirs = {parent for o in info if (parent := self._parent(o[\"name\"])) != here}\n        for subdir in subdirs:\n            info.append(\n                {\n                    \"name\": subdir + \"/\",\n                    \"size\": 0,\n                    \"type\": \"directory\",\n                }\n            )\n\n    if info:\n        self._update_dircache(info[:])\n\n    if not detail:\n        info = [o[\"name\"] for o in info]  # type: ignore\n\n    return info\n
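For example, listing a placeholder prefix:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# detail=False yields names only; the default detail=True yields metadata dictionaries.\nfor name in fs.ls(\"my-repo/main/data/\", detail=False):\n    print(name)\n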
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.open","title":"open","text":"
open(\n    path: str | PathLike[str],\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n    pre_sign: bool = False,\n    content_type: str | None = None,\n    metadata: dict[str, str] | None = None,\n    autocommit: bool = True,\n    **kwargs: Any\n) -> LakeFSIOBase\n

Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on mode.

PARAMETER DESCRIPTION path

The remote path for which to open a local LakeFSFile. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

mode

The file mode indicating its purpose. Use r/rb for downloads from lakeFS, w/wb/x/xb for uploads to lakeFS.

TYPE: Literal['r', 'rb', 'rt', 'w', 'wb', 'wt', 'x', 'xb', 'xt'] DEFAULT: 'rb'

pre_sign

Whether to use a pre-signed URL for the file up-/download.

TYPE: bool DEFAULT: False

content_type

Content type to use for the file, relevant for uploads only.

TYPE: str | None DEFAULT: None

metadata

Additional metadata to attach to the file, relevant for uploads only.

TYPE: dict[str, str] | None DEFAULT: None

autocommit

Whether to process the file immediately instead of queueing it for the transaction when inside a transaction context.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments for fsspec compatibility, unused.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION LakeFSIOBase

A local file-like object ready to hold data to be received from / sent to a lakeFS server.

RAISES DESCRIPTION NotImplementedError

If mode is not supported.

Source code in src/lakefs_spec/spec.py
def open(\n    self,\n    path: str | os.PathLike[str],\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"] = \"rb\",\n    pre_sign: bool = False,\n    content_type: str | None = None,\n    metadata: dict[str, str] | None = None,\n    autocommit: bool = True,\n    **kwargs: Any,\n) -> LakeFSIOBase:\n    \"\"\"\n    Dispatch a lakeFS file-like object (local buffer on disk) for the given remote path for up- or downloads depending on ``mode``.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote path for which to open a local ``LakeFSFile``. Must be a fully qualified lakeFS URI.\n    mode: Literal[\"r\", \"rb\", \"rt\", \"w\", \"wb\", \"wt\", \"x\", \"xb\", \"xt\"]\n        The file mode indicating its purpose. Use ``r/rb`` for downloads from lakeFS, ``w/wb/x/xb`` for uploads to lakeFS.\n    pre_sign: bool\n        Whether to use a pre-signed URL for the file up-/download.\n    content_type: str | None\n        Content type to use for the file, relevant for uploads only.\n    metadata: dict[str, str] | None\n        Additional metadata to attach to the file, relevant for uploads only.\n    autocommit: bool\n        Whether to process the file immediately instead of queueing it for transaction while in a transaction context.\n    **kwargs: Any\n        Additional keyword arguments for fsspec compatibility, unused.\n\n    Returns\n    -------\n    LakeFSIOBase\n        A local file-like object ready to hold data to be received from / sent to a lakeFS server.\n\n    Raises\n    ------\n    NotImplementedError\n        If ``mode`` is not supported.\n    \"\"\"\n    if mode.endswith(\"t\"):\n        # text modes {r,w,x}t are equivalent to {r,w,x} here respectively.\n        mode = mode[:-1]  # type: ignore\n\n    if mode not in {\"r\", \"rb\", \"w\", \"wb\", \"x\", \"xb\"}:\n        raise NotImplementedError(f\"unsupported mode {mode!r}\")\n\n    path = stringify_path(path)\n    repo, ref, resource = parse(path)\n\n    if mode.startswith(\"r\"):\n        reference = lakefs.Reference(repo, ref, client=self.client)\n        obj = reference.object(resource)\n\n        if not obj.exists():\n            raise FileNotFoundError(path)\n        handler = ObjectReader(obj, mode=mode, pre_sign=pre_sign, client=self.client)\n    else:\n        # for writing ops, ref must be a branch\n        branch = lakefs.Branch(repo, ref, client=self.client)\n        if self.create_branch_ok:\n            branch.create(self.source_branch, exist_ok=True)\n\n        obj = branch.object(resource)\n        handler = ObjectWriter(\n            obj,\n            mode=mode,\n            pre_sign=pre_sign,\n            content_type=content_type,\n            metadata=metadata,\n            client=self.client,\n        )\n\n    ac = kwargs.pop(\"autocommit\", not self._intrans)\n    if not ac and \"r\" not in mode:\n        self._transaction.files.append(handler)\n\n    return handler\n
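A usage sketch for reading and writing (placeholder paths; write modes require the ref to be a branch):

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.open(\"my-repo/main/data.csv\", \"rb\") as f:\n    header = f.read(100)\n\n# The written object is uploaded when the file handle is closed.\nwith fs.open(\"my-repo/main/hello.txt\", \"wb\") as f:\n    f.write(b\"Hello, lakeFS!\")\n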
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.put_file","title":"put_file","text":"
put_file(\n    lpath: str | PathLike[str],\n    rpath: str | PathLike[str],\n    callback: Callback = _DEFAULT_CALLBACK,\n    precheck: bool = True,\n    **kwargs: Any\n) -> None\n

Upload a local file to a remote location on a lakeFS server.

Note that depending on the block store type, additional settings such as credentials may need to be configured when use_blockstore=True and presign=False.

PARAMETER DESCRIPTION lpath

The local path on disk to upload to the lakeFS server.

TYPE: str | PathLike[str]

rpath

The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

callback

An fsspec callback to use during the operation. Can be used to report upload progress.

TYPE: Callback DEFAULT: _DEFAULT_CALLBACK

precheck

Check whether rpath already exists on the server and compare its checksum with that of the local file lpath, skipping the upload if they match.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.open().

TYPE: Any DEFAULT: {}

Source code in src/lakefs_spec/spec.py
def put_file(\n    self,\n    lpath: str | os.PathLike[str],\n    rpath: str | os.PathLike[str],\n    callback: fsspec.callbacks.Callback = _DEFAULT_CALLBACK,\n    precheck: bool = True,\n    **kwargs: Any,\n) -> None:\n    \"\"\"\n    Upload a local file to a remote location on a lakeFS server.\n\n    Note that depending on the block store type, additional configuration like credentials may need to be configured when ``use_blockstore=True`` and ``presign=False``.\n\n    Parameters\n    ----------\n    lpath: str | os.PathLike[str]\n        The local path on disk to upload to the lakeFS server.\n    rpath: str | os.PathLike[str]\n        The remote target path to upload the local file to. Must be a fully qualified lakeFS URI.\n    callback: fsspec.callbacks.Callback\n        An fsspec callback to use during the operation. Can be used to report download progress.\n    precheck: bool\n        Check if ``lpath`` already exists and compare its checksum with that of ``rpath``, skipping the download if they match.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n    \"\"\"\n    lpath = stringify_path(lpath)\n    rpath = stringify_path(rpath)\n\n    if precheck and Path(lpath).is_file():\n        remote_checksum = self.checksum(rpath)\n        local_checksum = md5_checksum(lpath, blocksize=self.blocksize)\n        if local_checksum == remote_checksum:\n            logger.info(\n                f\"Skipping upload of resource {lpath!r} to remote path {rpath!r}: \"\n                f\"Resource {rpath!r} exists and checksums match.\"\n            )\n            return\n\n    with self.wrapped_api_call(rpath=rpath):\n        super().put_file(lpath, rpath, callback=callback, **kwargs)\n
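For example, with placeholder paths:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# With precheck=True (the default), the upload is skipped if the remote checksum already matches.\nfs.put_file(\"data.csv\", \"my-repo/main/data.csv\")\n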
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.rm_file","title":"rm_file","text":"
rm_file(path: str | PathLike[str]) -> None\n

Stage a remote file for removal on a lakeFS server.

The file will not actually be removed from the requested branch until a commit is created.

PARAMETER DESCRIPTION path

The remote file to delete. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

Source code in src/lakefs_spec/spec.py
def rm_file(self, path: str | os.PathLike[str]) -> None:  # pragma: no cover\n    \"\"\"\n    Stage a remote file for removal on a lakeFS server.\n\n    The file will not actually be removed from the requested branch until a commit is created.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The remote file to delete. Must be a fully qualified lakeFS URI.\n    \"\"\"\n    self.rm(path)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.rm","title":"rm","text":"
rm(path: str | PathLike[str], recursive: bool = False, maxdepth: int | None = None) -> None\n

Stage multiple remote files for removal on a lakeFS server.

The files will not actually be removed from the requested branch until a commit is created.

PARAMETER DESCRIPTION path

File(s) to delete.

TYPE: str | PathLike[str]

recursive

If file(s) include nested directories, recursively delete their contents.

TYPE: bool DEFAULT: False

maxdepth

Depth to pass to walk for finding files to delete, if recursive. If None, there will be no limit and infinite recursion may be possible.

TYPE: int | None DEFAULT: None

Source code in src/lakefs_spec/spec.py
def rm(\n    self, path: str | os.PathLike[str], recursive: bool = False, maxdepth: int | None = None\n) -> None:\n    \"\"\"\n    Stage multiple remote files for removal on a lakeFS server.\n\n    The files will not actually be removed from the requested branch until a commit is created.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        File(s) to delete.\n    recursive: bool\n        If file(s) include nested directories, recursively delete their contents.\n    maxdepth: int | None\n        Depth to pass to walk for finding files to delete, if recursive.\n        If None, there will be no limit and infinite recursion may be\n        possible.\n    \"\"\"\n\n    path = stringify_path(path)\n    repository, ref, prefix = parse(path)\n\n    with self.wrapped_api_call(rpath=path):\n        branch = lakefs.Branch(repository, ref, client=self.client)\n        objgen = branch.objects(prefix=prefix, delimiter=\"\" if recursive else \"/\")\n        if maxdepth is None:\n            branch.delete_objects(obj.path for obj in objgen)\n        else:\n            # nesting level is just the amount of \"/\"s in the path, no leading \"/\".\n            branch.delete_objects(obj.path for obj in objgen if obj.path.count(\"/\") <= maxdepth)\n\n        # Directory listing cache for the containing folder must be invalidated\n        self.dircache.pop(self._parent(path), None)\n
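For example, with a placeholder prefix:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Stages every object under the prefix for deletion; a commit is still needed to persist it.\nfs.rm(\"my-repo/main/data/\", recursive=True)\n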
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.touch","title":"touch","text":"
touch(path: str | PathLike[str], truncate: bool = True, **kwargs: Any) -> None\n

Create an empty file or update an existing file on a lakeFS server.

PARAMETER DESCRIPTION path

The file path to create or update. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

truncate

Whether to set the file size to 0 (zero) bytes, even if the path already exists.

TYPE: bool DEFAULT: True

**kwargs

Additional keyword arguments to pass to LakeFSFileSystem.open().

TYPE: Any DEFAULT: {}

RAISES DESCRIPTION NotImplementedError

If the targeted lakeFS server version does not support touch() operations.

Source code in src/lakefs_spec/spec.py
def touch(self, path: str | os.PathLike[str], truncate: bool = True, **kwargs: Any) -> None:\n    \"\"\"\n    Create an empty file or update an existing file on a lakeFS server.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The file path to create or update. Must be a fully qualified lakeFS URI.\n    truncate: bool\n        Whether to set the file size to 0 (zero) bytes, even if the path already exists.\n    **kwargs: Any\n        Additional keyword arguments to pass to ``LakeFSFileSystem.open()``.\n\n    Raises\n    ------\n    NotImplementedError\n        If the targeted lakeFS server version does not support `touch()` operations.\n    \"\"\"\n\n    # empty buffer upload errors were fixed in https://github.com/treeverse/lakeFS/issues/7130,\n    # which was first released in lakeFS v1.3.1.\n    if self._lakefs_server_version < (1, 3, 1):\n        version_string = \".\".join(str(v) for v in self._lakefs_server_version)\n        raise NotImplementedError(\n            \"LakeFSFileSystem.touch() is not supported for your lakeFS server version. \"\n            f\"minimum required version: '1.3.1', actual version: {version_string!r}\"\n        )\n\n    super().touch(path=path, truncate=truncate, **kwargs)\n
"},{"location":"reference/lakefs_spec/spec/#lakefs_spec.spec.LakeFSFileSystem.tail","title":"tail","text":"
tail(path: str | PathLike[str], size: int = 1024) -> bytes\n

Get the last size bytes from a remote file.

PARAMETER DESCRIPTION path

The file path to read. Must be a fully qualified lakeFS URI.

TYPE: str | PathLike[str]

size

The number of bytes to read from the end of the file.

TYPE: int DEFAULT: 1024

RETURNS DESCRIPTION bytes

The bytes at the end of the requested file.

Source code in src/lakefs_spec/spec.py
def tail(self, path: str | os.PathLike[str], size: int = 1024) -> bytes:\n    \"\"\"\n    Get the last ``size`` bytes from a remote file.\n\n    Parameters\n    ----------\n    path: str | os.PathLike[str]\n        The file path to read. Must be a fully qualified lakeFS URI.\n    size: int\n        The amount of bytes to get.\n\n    Returns\n    -------\n    bytes\n        The bytes at the end of the requested file.\n    \"\"\"\n    f: ObjectReader\n    with self.open(path, \"rb\") as f:\n        f.seek(max(-size, -f._obj.stat().size_bytes), 2)\n        return f.read()\n
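For example, with a placeholder path:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\n# Fetch only the last kilobyte of a potentially large object.\nprint(fs.tail(\"my-repo/main/logs/app.log\", size=1024))\n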
"},{"location":"reference/lakefs_spec/transaction/","title":"transaction","text":"

Functionality for extended lakeFS transactions to conduct versioning operations between file uploads.

"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction","title":"LakeFSTransaction","text":"

Bases: Transaction

A lakeFS transaction model capable of versioning operations in between file uploads.

PARAMETER DESCRIPTION fs

The lakeFS file system associated with the transaction.

TYPE: 'LakeFSFileSystem'

Source code in src/lakefs_spec/transaction.py
class LakeFSTransaction(Transaction):\n    \"\"\"\n    A lakeFS transaction model capable of versioning operations in between file uploads.\n\n    Parameters\n    ----------\n    fs: LakeFSFileSystem\n        The lakeFS file system associated with the transaction.\n    \"\"\"\n\n    def __init__(\n        self,\n        fs: \"LakeFSFileSystem\",\n    ):\n        super().__init__(fs=fs)\n        self.fs: \"LakeFSFileSystem\"\n        self.files: deque[ObjectWriter] = deque(self.files)\n\n        self.repository: str | None = None\n        self.base_branch: Branch | None = None\n        self.automerge: bool = False\n        self.delete: Literal[\"onsuccess\", \"always\", \"never\"] = \"onsuccess\"\n        self._ephemeral_branch: Branch | None = None\n\n    def __call__(\n        self,\n        repository: str | Repository,\n        base_branch: str | Branch = \"main\",\n        branch_name: str | None = None,\n        automerge: bool = True,\n        delete: Literal[\"onsuccess\", \"always\", \"never\"] = \"onsuccess\",\n    ) -> \"LakeFSTransaction\":\n        \"\"\"\n        Creates an ephemeral branch, conducts all uploads and operations on that branch,\n        and optionally merges it back into the source branch.\n\n        repository: str | Repository\n            The repository in which to conduct the transaction.\n        base_branch: str | Branch\n            The branch on which the transaction operations should be based.\n        automerge: bool\n            Automatically merge the ephemeral branch into the base branch after successful\n            transaction completion.\n        delete: Literal[\"onsuccess\", \"always\", \"never\"]\n            Cleanup policy / deletion handling for the ephemeral branch after the transaction.\n\n            If ``\"onsuccess\"``, the branch is deleted if the transaction succeeded,\n            or left over if an error occurred.\n\n            If ``\"always\"``, the ephemeral branch is always deleted after transaction regardless of success\n            or failure.\n\n            If ``\"never\"``, the transaction branch is always left in the repository.\n        \"\"\"\n\n        if isinstance(repository, str):\n            self.repository = repository\n        else:\n            self.repository = repository.id\n\n        repo = lakefs.Repository(self.repository, client=self.fs.client)\n        try:\n            _ = repo.metadata\n        except ServerException:\n            raise ValueError(f\"repository {self.repository!r} does not exist\") from None\n\n        # base branch needs to be a lakefs.Branch, since it is being diffed\n        # with the ephemeral branch in __exit__.\n        self.base_branch = _ensurebranch(base_branch, self.repository, self.fs.client)\n\n        self.automerge = automerge\n        self.delete = delete\n\n        ephem_name = branch_name or \"transaction-\" + \"\".join(random.choices(string.digits, k=6))  # nosec: B311\n        self._ephemeral_branch = Branch(self.repository, ephem_name, client=self.fs.client)\n        return self\n\n    def __enter__(self):\n        logger.debug(\n            f\"Creating ephemeral branch {self._ephemeral_branch.id!r} \"\n            f\"from branch {self.base_branch.id!r}.\"\n        )\n        self._ephemeral_branch.create(self.base_branch, exist_ok=False)\n        self.fs._intrans = True\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        success = exc_type is None\n        while self.files:\n            # fsspec base class calls `append` on the file, 
which means we\n            # have to pop from the left to preserve order.\n            f = self.files.popleft()\n            if not success:\n                f.discard()\n\n        self.fs._intrans = False\n        self.fs._transaction = None\n\n        if any(self._ephemeral_branch.uncommitted()):\n            msg = f\"Finished transaction on branch {self._ephemeral_branch.id!r} with uncommitted changes.\"\n            if self.delete != \"never\":\n                msg += \" Objects added but not committed are lost.\"\n            warnings.warn(msg)\n\n        if success and self.automerge:\n            if any(self.base_branch.diff(self._ephemeral_branch)):\n                self._ephemeral_branch.merge_into(self.base_branch)\n        if self.delete == \"always\" or (success and self.delete == \"onsuccess\"):\n            self._ephemeral_branch.delete()\n\n    @property\n    def branch(self):\n        return self._ephemeral_branch\n\n    def commit(self, message: str, metadata: dict[str, str] | None = None) -> Reference:\n        \"\"\"\n        Create a commit on this transaction's ephemeral branch with a commit message\n        and attached metadata.\n\n        Parameters\n        ----------\n        message: str\n            The commit message to attach to the newly created commit.\n        metadata: dict[str, str] | None\n            Optional metadata to enrich the created commit with (author, e-mail, ...).\n\n        Returns\n        -------\n        Reference\n            The created commit.\n        \"\"\"\n\n        diff = list(self.branch.uncommitted())\n\n        if not diff:\n            logger.warning(f\"No changes to commit on branch {self.branch.id!r}.\")\n            return self.branch.head\n\n        return self.branch.commit(message, metadata=metadata)\n\n    def merge(self, source_ref: str | Branch, into: str | Branch) -> Commit:\n        \"\"\"\n        Merge a branch into another branch in a repository.\n\n        In case the branch contains no changes relevant to the target branch,\n        no merge happens, and the tip of the target branch is returned instead.\n\n        Parameters\n        ----------\n        source_ref: str | Branch\n            Source reference containing the changes to merge.\n            Can be a branch name or partial commit SHA.\n        into: str | Branch\n            Target branch into which the changes will be merged.\n\n        Returns\n        -------\n        Commit\n            Either the created merge commit, or the head commit of the target branch.\n        \"\"\"\n        source = _ensurebranch(source_ref, self.repository, self.fs.client)\n        dest = _ensurebranch(into, self.repository, self.fs.client)\n\n        if any(dest.diff(source)):\n            source.merge_into(dest)\n        return dest.head.get_commit()\n\n    def revert(self, branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit:\n        \"\"\"\n        Revert a previous commit on a branch.\n\n        Parameters\n        ----------\n        branch: str | Branch\n            Branch on which the commit should be reverted.\n        ref: ReferenceType\n            The reference to revert.\n        parent_number: int\n            If there are multiple parents to a commit, specify to which parent\n            the commit should be reverted. 
``parent_number = 1`` (the default)\n            refers to the first parent commit of the current ``branch`` tip.\n\n        Returns\n        -------\n        Commit\n            The created revert commit.\n        \"\"\"\n\n        b = _ensurebranch(branch, self.repository, self.fs.client)\n\n        ref_id = ref if isinstance(ref, str) else ref.id\n        b.revert(ref_id, parent_number=parent_number)\n        return b.head.get_commit()\n\n    def rev_parse(self, ref: ReferenceType) -> Commit:\n        \"\"\"\n        Parse a given lakeFS reference expression and obtain its corresponding commit.\n\n        Parameters\n        ----------\n        ref: ReferenceType\n            Reference object to resolve, can be a branch, commit SHA, or tag.\n\n        Returns\n        -------\n        Commit\n            The commit referenced by the expression ``ref``.\n        \"\"\"\n\n        ref_id = ref.id if isinstance(ref, Reference) else ref\n        reference = lakefs.Reference(self.repository, ref_id, client=self.fs.client)\n        return reference.get_commit()\n\n    def tag(self, ref: ReferenceType, name: str) -> Tag:\n        \"\"\"\n        Create a tag referencing a commit in a repository.\n\n        Parameters\n        ----------\n        ref: ReferenceType\n            Commit SHA or placeholder for a reference or commit object\n            to which the new tag will point.\n        name: str\n            Name of the tag to be created.\n\n        Returns\n        -------\n        Tag\n            The requested tag.\n        \"\"\"\n\n        return lakefs.Tag(self.repository, name, client=self.fs.client).create(ref)\n
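A sketch combining a file upload with versioning operations inside a transaction; all repository, branch, file, and tag names are placeholders:

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nwith fs.transaction(\"my-repo\", \"main\", automerge=True) as tx:\n    fs.put_file(\"data.csv\", f\"my-repo/{tx.branch.id}/data.csv\")\n    commit = tx.commit(message=\"Add data.csv\")\n    tx.tag(commit, name=\"v1.0\")  # tag the freshly created commit\n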
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.commit","title":"commit","text":"
commit(message: str, metadata: dict[str, str] | None = None) -> Reference\n

Create a commit on this transaction's ephemeral branch with a commit message and attached metadata.

PARAMETER DESCRIPTION message

The commit message to attach to the newly created commit.

TYPE: str

metadata

Optional metadata to enrich the created commit with (author, e-mail, ...).

TYPE: dict[str, str] | None DEFAULT: None

RETURNS DESCRIPTION Reference

The created commit.

Source code in src/lakefs_spec/transaction.py
def commit(self, message: str, metadata: dict[str, str] | None = None) -> Reference:\n    \"\"\"\n    Create a commit on this transaction's ephemeral branch with a commit message\n    and attached metadata.\n\n    Parameters\n    ----------\n    message: str\n        The commit message to attach to the newly created commit.\n    metadata: dict[str, str] | None\n        Optional metadata to enrich the created commit with (author, e-mail, ...).\n\n    Returns\n    -------\n    Reference\n        The created commit.\n    \"\"\"\n\n    diff = list(self.branch.uncommitted())\n\n    if not diff:\n        logger.warning(f\"No changes to commit on branch {self.branch.id!r}.\")\n        return self.branch.head\n\n    return self.branch.commit(message, metadata=metadata)\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.merge","title":"merge","text":"
merge(source_ref: str | Branch, into: str | Branch) -> Commit\n

Merge a branch into another branch in a repository.

In case the branch contains no changes relevant to the target branch, no merge happens, and the tip of the target branch is returned instead.

PARAMETER DESCRIPTION source_ref

Source reference containing the changes to merge. Can be a branch name or partial commit SHA.

TYPE: str | Branch

into

Target branch into which the changes will be merged.

TYPE: str | Branch

RETURNS DESCRIPTION Commit

Either the created merge commit, or the head commit of the target branch.

Source code in src/lakefs_spec/transaction.py
def merge(self, source_ref: str | Branch, into: str | Branch) -> Commit:\n    \"\"\"\n    Merge a branch into another branch in a repository.\n\n    In case the branch contains no changes relevant to the target branch,\n    no merge happens, and the tip of the target branch is returned instead.\n\n    Parameters\n    ----------\n    source_ref: str | Branch\n        Source reference containing the changes to merge.\n        Can be a branch name or partial commit SHA.\n    into: str | Branch\n        Target branch into which the changes will be merged.\n\n    Returns\n    -------\n    Commit\n        Either the created merge commit, or the head commit of the target branch.\n    \"\"\"\n    source = _ensurebranch(source_ref, self.repository, self.fs.client)\n    dest = _ensurebranch(into, self.repository, self.fs.client)\n\n    if any(dest.diff(source)):\n        source.merge_into(dest)\n    return dest.head.get_commit()\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.revert","title":"revert","text":"
revert(branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit\n

Revert a previous commit on a branch.

PARAMETER DESCRIPTION branch

Branch on which the commit should be reverted.

TYPE: str | Branch

ref

The reference to revert.

TYPE: ReferenceType

parent_number

If there are multiple parents to a commit, specify to which parent the commit should be reverted. parent_number = 1 (the default) refers to the first parent commit of the current branch tip.

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION Commit

The created revert commit.

Source code in src/lakefs_spec/transaction.py
def revert(self, branch: str | Branch, ref: ReferenceType, parent_number: int = 1) -> Commit:\n    \"\"\"\n    Revert a previous commit on a branch.\n\n    Parameters\n    ----------\n    branch: str | Branch\n        Branch on which the commit should be reverted.\n    ref: ReferenceType\n        The reference to revert.\n    parent_number: int\n        If there are multiple parents to a commit, specify to which parent\n        the commit should be reverted. ``parent_number = 1`` (the default)\n        refers to the first parent commit of the current ``branch`` tip.\n\n    Returns\n    -------\n    Commit\n        The created revert commit.\n    \"\"\"\n\n    b = _ensurebranch(branch, self.repository, self.fs.client)\n\n    ref_id = ref if isinstance(ref, str) else ref.id\n    b.revert(ref_id, parent_number=parent_number)\n    return b.head.get_commit()\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.rev_parse","title":"rev_parse","text":"
rev_parse(ref: ReferenceType) -> Commit\n

Parse a given lakeFS reference expression and obtain its corresponding commit.

PARAMETER DESCRIPTION ref

Reference object to resolve, can be a branch, commit SHA, or tag.

TYPE: ReferenceType

RETURNS DESCRIPTION Commit

The commit referenced by the expression ref.

Source code in src/lakefs_spec/transaction.py
def rev_parse(self, ref: ReferenceType) -> Commit:\n    \"\"\"\n    Parse a given lakeFS reference expression and obtain its corresponding commit.\n\n    Parameters\n    ----------\n    ref: ReferenceType\n        Reference object to resolve, can be a branch, commit SHA, or tag.\n\n    Returns\n    -------\n    Commit\n        The commit referenced by the expression ``ref``.\n    \"\"\"\n\n    ref_id = ref.id if isinstance(ref, Reference) else ref\n    reference = lakefs.Reference(self.repository, ref_id, client=self.fs.client)\n    return reference.get_commit()\n
"},{"location":"reference/lakefs_spec/transaction/#lakefs_spec.transaction.LakeFSTransaction.tag","title":"tag","text":"
tag(ref: ReferenceType, name: str) -> Tag\n

Create a tag referencing a commit in a repository.

PARAMETER DESCRIPTION ref

Commit SHA or placeholder for a reference or commit object to which the new tag will point.

TYPE: ReferenceType

name

Name of the tag to be created.

TYPE: str

RETURNS DESCRIPTION Tag

The requested tag.

Source code in src/lakefs_spec/transaction.py
def tag(self, ref: ReferenceType, name: str) -> Tag:\n    \"\"\"\n    Create a tag referencing a commit in a repository.\n\n    Parameters\n    ----------\n    ref: ReferenceType\n        Commit SHA or placeholder for a reference or commit object\n        to which the new tag will point.\n    name: str\n        Name of the tag to be created.\n\n    Returns\n    -------\n    Tag\n        The requested tag.\n    \"\"\"\n\n    return lakefs.Tag(self.repository, name, client=self.fs.client).create(ref)\n
"},{"location":"reference/lakefs_spec/util/","title":"util","text":"

Useful utilities for handling lakeFS URIs and results of lakeFS API calls.

"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.depaginate","title":"depaginate","text":"
depaginate(\n    api: Callable[..., PaginatedApiResponse], *args: Any, **kwargs: Any\n) -> Generator[Any, None, None]\n

Unwrap the responses from a paginated lakeFS API method into a generator.

PARAMETER DESCRIPTION api

The lakeFS client API to call. Must return a paginated response with the pagination and results fields set.

TYPE: Callable[..., PaginatedApiResponse]

*args

Positional arguments to pass to the API call.

TYPE: Any DEFAULT: ()

**kwargs

Keyword arguments to pass to the API call.

TYPE: Any DEFAULT: {}

YIELDS DESCRIPTION Any

The obtained API result objects.

Source code in src/lakefs_spec/util.py
def depaginate(\n    api: Callable[..., PaginatedApiResponse], *args: Any, **kwargs: Any\n) -> Generator[Any, None, None]:\n    \"\"\"\n    Unwrap the responses from a paginated lakeFS API method into a generator.\n\n    Parameters\n    ----------\n    api: Callable[..., PaginatedApiResponse]\n        The lakeFS client API to call. Must return a paginated response with the ``pagination`` and ``results`` fields set.\n    *args: Any\n        Positional arguments to pass to the API call.\n    **kwargs: Any\n        Keyword arguments to pass to the API call.\n\n    Yields\n    ------\n    Any\n        The obtained API result objects.\n    \"\"\"\n    while True:\n        resp = api(*args, **kwargs)\n        yield from resp.results\n        if not resp.pagination.has_more:\n            break\n        kwargs[\"after\"] = resp.pagination.next_offset\n
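A runnable sketch of the expected contract; fake_list_objects below is a stand-in for a real paginated lakeFS client API method and exists only for illustration:

from types import SimpleNamespace\n\nfrom lakefs_spec.util import depaginate\n\n\ndef fake_list_objects(after=\"\"):\n    # Mimics a paginated response: a single page with no further results.\n    return SimpleNamespace(\n        results=[\"a.csv\", \"b.csv\"],\n        pagination=SimpleNamespace(has_more=False, next_offset=\"\"),\n    )\n\n\nprint(list(depaginate(fake_list_objects)))  # ['a.csv', 'b.csv']\n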
"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.md5_checksum","title":"md5_checksum","text":"
md5_checksum(lpath: str | PathLike[str], blocksize: int = 2 ** 22) -> str\n

Calculate a local file's MD5 hash.

PARAMETER DESCRIPTION lpath

The local path whose MD5 hash to calculate. Must be a file.

TYPE: str | PathLike[str]

blocksize

Block size (in bytes) to use while reading in the file.

TYPE: int DEFAULT: 2 ** 22

RETURNS DESCRIPTION str

The file's MD5 hash value, as a string.

Source code in src/lakefs_spec/util.py
def md5_checksum(lpath: str | os.PathLike[str], blocksize: int = 2**22) -> str:\n    \"\"\"\n    Calculate a local file's MD5 hash.\n\n    Parameters\n    ----------\n    lpath: str | os.PathLike[str]\n        The local path whose MD5 hash to calculate. Must be a file.\n    blocksize: int\n        Block size (in bytes) to use while reading in the file.\n\n    Returns\n    -------\n    str\n        The file's MD5 hash value, as a string.\n    \"\"\"\n    with open(lpath, \"rb\") as f:\n        file_hash = hashlib.md5(usedforsecurity=False)\n        chunk = f.read(blocksize)\n        while chunk:\n            file_hash.update(chunk)\n            chunk = f.read(blocksize)\n    return file_hash.hexdigest()\n
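For example (the file name is a placeholder):

from lakefs_spec.util import md5_checksum\n\n# Hex digest of a local file, e.g. for comparison against a remote object's checksum.\nprint(md5_checksum(\"data.csv\"))\n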
"},{"location":"reference/lakefs_spec/util/#lakefs_spec.util.parse","title":"parse","text":"
parse(path: str) -> tuple[str, str, str]\n

Parses a lakeFS URI in the form lakefs://<repo>/<ref>/<resource>.

PARAMETER DESCRIPTION path

String path, needs to conform to the lakeFS URI format described above. The <resource> part can be the empty string; the leading lakefs:// scheme may be omitted.

TYPE: str

RETURNS DESCRIPTION tuple[str, str, str]

A 3-tuple of repository name, reference, and resource name.

RAISES DESCRIPTION ValueError

If the path does not conform to the lakeFS URI format.

Source code in src/lakefs_spec/util.py
def parse(path: str) -> tuple[str, str, str]:\n    \"\"\"\n    Parses a lakeFS URI in the form ``lakefs://<repo>/<ref>/<resource>``.\n\n    Parameters\n    ----------\n    path: str\n        String path, needs to conform to the lakeFS URI format described above.\n        The ``<resource>`` part can be the empty string; the leading ``lakefs://`` scheme may be omitted.\n\n    Returns\n    -------\n    tuple[str, str, str]\n        A 3-tuple of repository name, reference, and resource name.\n\n    Raises\n    ------\n    ValueError\n        If the path does not conform to the lakeFS URI format.\n    \"\"\"\n\n    # First regex reflects the lakeFS repository naming rules:\n    # only lowercase letters, digits and dash, no leading dash, minimum 3, maximum 63 characters\n    # https://docs.lakefs.io/understand/model.html#repository\n    # Second regex is the branch: Only letters, digits, underscores and dash, no leading dash.\n    path_regex = re.compile(r\"(?:lakefs://)?([a-z0-9][a-z0-9\\-]{2,62})/(\\w[\\w\\-]*)/(.*)\")\n    results = path_regex.fullmatch(path)\n    if results is None:\n        raise ValueError(\n            f\"expected path with structure lakefs://<repo>/<ref>/<resource>, got {path!r}\"\n        )\n\n    repo, ref, resource = results.groups()\n    return repo, ref, resource\n
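For example:

from lakefs_spec.util import parse\n\nrepo, ref, resource = parse(\"lakefs://my-repo/main/data/weather-2010.json\")\nprint(repo, ref, resource)  # my-repo main data/weather-2010.json\n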
"},{"location":"tutorials/","title":"Tutorials","text":"

Info

We aim to provide additional tutorials in the future - contributions are welcome!

  • Quickstart example: Using lakeFS-spec as a file system
  • A fully-worked data science example: Using lakeFS-spec together with Pandas to train a classifier based on a public dataset and simulate additional data being collected
"},{"location":"tutorials/demo_data_science_project/","title":"Data Science with lakeFS-spec","text":"
%pip install numpy pandas scikit-learn\n
\nCollecting numpy\n  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n\n
\nCollecting pandas\n\n
\n  Downloading pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)\n\n
\nCollecting scikit-learn\n  Using cached scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n\n
\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from pandas) (2.8.2)\nRequirement already satisfied: pytz>=2020.1 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from pandas) (2024.1)\nCollecting tzdata>=2022.7 (from pandas)\n  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)\n\n
\nCollecting scipy>=1.6.0 (from scikit-learn)\n  Using cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\nCollecting joblib>=1.2.0 (from scikit-learn)\n\n
\n  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\nCollecting threadpoolctl>=2.0.0 (from scikit-learn)\n  Using cached threadpoolctl-3.3.0-py3-none-any.whl.metadata (13 kB)\nRequirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.11.8/x64/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n\n
\nUsing cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)\nDownloading pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 0.0/13.0 MB ? eta -:--:--\n
\n\n   \u2501\u2501\u2501\u2501\u2578\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 1.6/13.0 MB 76.4 MB/s eta 0:00:01\n
\n\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u257a\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 6.2/13.0 MB 90.8 MB/s eta 0:00:01\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u257a\u2501\u2501\u2501 11.8/13.0 MB 123.4 MB/s eta 0:00:01\n
\n\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2578 13.0/13.0 MB 164.6 MB/s eta 0:00:01\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 13.0/13.0 MB 107.9 MB/s eta 0:00:00\nUsing cached scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n\n
\nUsing cached joblib-1.3.2-py3-none-any.whl (302 kB)\nUsing cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)\n\n
\nUsing cached threadpoolctl-3.3.0-py3-none-any.whl (17 kB)\nUsing cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)\n\n
\nInstalling collected packages: tzdata, threadpoolctl, numpy, joblib, scipy, pandas, scikit-learn\n\n
\nSuccessfully installed joblib-1.3.2 numpy-1.26.4 pandas-2.2.1 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0 tzdata-2024.1\n\n
\nNote: you may need to restart the kernel to use updated packages.\n\n

Also install an appropriate lakeFS-spec version, which can be either the latest release from PyPI via pip install --upgrade lakefs-spec, or the development version from GitHub via pip install git+https://github.com/aai-institute/lakefs-spec.git.
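For convenience, the corresponding notebook cell might simply read as follows (shown for the PyPI release, with the development install left as a comment):

%pip install --upgrade lakefs-spec
# or, for the development version:
# %pip install git+https://github.com/aai-institute/lakefs-spec.git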

import os\nimport tempfile\nimport urllib.request\nfrom pathlib import Path\n\nurllib.request.urlretrieve(\n    \"https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/docs/tutorials/.lakectl.yaml\",\n    os.path.expanduser(\"~/.lakectl.yaml\"),\n)\n
\n('/home/runner/.lakectl.yaml', <http.client.HTTPMessage at 0x7f781c8cc090>)\n

We can now instantiate the LakeFSFileSystem with the credentials we just downloaded. Alternatively, we could pass the credentials directly in code (a sketch of this follows the next cell). It is important that the credentials are available at the time of filesystem instantiation.

from lakefs_spec import LakeFSFileSystem\n\nfs = LakeFSFileSystem()\n\nREPO_NAME = \"weather\"\n
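As mentioned above, the credentials can also be passed directly when constructing the filesystem instead of being read from ~/.lakectl.yaml. A minimal sketch, assuming the host, username, and password keyword arguments; the values below are placeholders to replace with your own quickstart credentials:

# sketch: passing credentials explicitly instead of reading ~/.lakectl.yaml
fs = LakeFSFileSystem(
    host="http://localhost:8000",     # assumed local quickstart address
    username="<access-key-id>",       # placeholder, substitute your access key
    password="<secret-access-key>",   # placeholder, substitute your secret key
)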

We will create a repository using the lakefs SDK, reusing the client from our filesystem instance. If you have already created a repository in the UI, make sure to set the REPO_NAME variable in the cell directly above accordingly.

import lakefs\n\nrepo = lakefs.Repository(REPO_NAME, fs.client).create(storage_namespace=f\"local://{REPO_NAME}\")\n
def _maybe_urlretrieve(url: str, filename: str) -&gt; str:\n    # Avoid API rate limit errors by downloading to a fixed local location\n    destination = Path(tempfile.gettempdir()) / \"lakefs-spec-tutorials\" / filename\n    destination.parent.mkdir(exist_ok=True, parents=True)\n    if destination.exists():\n        return str(destination)\n\n    outfile, _ = urllib.request.urlretrieve(url, str(destination))\n    return outfile\n\n\noutfile = _maybe_urlretrieve(\n    \"https://archive-api.open-meteo.com/v1/archive?latitude=52.52&amp;longitude=13.41&amp;start_date=2010-01-01&amp;end_date=2010-12-31&amp;hourly=temperature_2m,relativehumidity_2m,rain,pressure_msl,surface_pressure,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m\",\n    \"weather-2010.json\",\n)\n

The data is in JSON format. Therefore, we need to wrangle the data a bit to make it usable. But first, we will upload it to our lakeFS instance.

NEW_BRANCH = lakefs.Branch(REPO_NAME, \"transform-raw-data\", client=fs.client)\nNEW_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, NEW_BRANCH) as tx:\n    fs.put(outfile, f\"{REPO_NAME}/{tx.branch.id}/weather-2010.json\")\n    tx.commit(message=\"Add 2010 weather data\")\n

You can inspect this commit by selecting the transform-raw-data branch, and navigating to the Commits tab.

import json\n\nimport pandas as pd\n\n\ndef transform_json_weather_data(filepath):\n    if hasattr(filepath, \"close\") and hasattr(filepath, \"tell\"):\n        data = json.load(filepath)\n    else:\n        with open(filepath, \"r\") as f:\n            data = json.load(f)\n\n    df = pd.DataFrame.from_dict(data[\"hourly\"])\n    df.time = pd.to_datetime(df.time)\n    df[\"is_raining\"] = df.rain &gt; 0\n    df[\"is_raining_in_1_day\"] = df.is_raining.shift(24).astype(bool)\n    df = df.dropna()\n    return df\n\n\ndf = transform_json_weather_data(outfile)\ndf.head(5)\n
   time                 temperature_2m  relativehumidity_2m  rain  pressure_msl  surface_pressure  cloudcover  cloudcover_low  cloudcover_mid  cloudcover_high  windspeed_10m  windspeed_100m  winddirection_10m  winddirection_100m  is_raining  is_raining_in_1_day
0  2010-01-01 00:00:00  -2.6            88                    0.0  996.9         992.1             100         100             97              75               16.0           27.2            54                 58                  False       True
1  2010-01-01 01:00:00  -2.7            88                    0.0  996.4         991.6             100         99              96              49               16.3           28.0            55                 58                  False       True
2  2010-01-01 02:00:00  -2.7            88                    0.0  996.2         991.4             100         96              94              60               16.3           27.5            55                 58                  False       True
3  2010-01-01 03:00:00  -2.7            88                    0.0  996.1         991.3             100         97              96              83               15.4           26.6            53                 57                  False       True
4  2010-01-01 04:00:00  -2.7            88                    0.0  996.0         991.2             100         92              98              82               14.8           25.6            47                 52                  False       True

Next, we save this data as a CSV file to the main branch. When the transaction's commit helper is called, the newly written CSV file is committed. You can verify that the save worked in the lakeFS UI in your browser by switching to the Commits tab of the main branch.

with fs.transaction(REPO_NAME, \"main\") as tx:\n    df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/weather_2010.csv\")\n    tx.commit(message=\"Update weather data\")\n
import sklearn.model_selection\n\nmodel_data = df.drop(\"time\", axis=1)\n\ntrain, test = sklearn.model_selection.train_test_split(model_data, random_state=7)\n

We save these train and test datasets to a new training branch. If the branch does not exist yet, as is the case here, it is implicitly created by default. You can control this behaviour with the create_branch_ok flag when initializing the LakeFSFileSystem. Since create_branch_ok defaults to True, a plain fs = LakeFSFileSystem() is all we need to enable implicit branch creation (a rough sketch of this behaviour follows the next cell).

TRAINING_BRANCH = lakefs.Branch(REPO_NAME, \"training\", client=fs.client)\nTRAINING_BRANCH.create(\"main\")\n\nwith fs.transaction(REPO_NAME, TRAINING_BRANCH) as tx:\n    train.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/train_weather.csv\")\n    test.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/test_weather.csv\")\n    tx.commit(message=\"Add train-test split of 2010 weather data\")\n
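As noted above, the explicit TRAINING_BRANCH.create("main") call is not strictly necessary. A rough sketch of the implicit behaviour, assuming the default create_branch_ok=True and a made-up branch name; the new branch is created from the filesystem's configured source branch ("main" by default), and the file lands on it as an uncommitted change:

fs = LakeFSFileSystem(create_branch_ok=True)  # True is also the default
# "another-training-branch" does not exist yet and is created on the fly
with fs.open(f"{REPO_NAME}/another-training-branch/train_weather.csv", "wt") as f:
    train.to_csv(f)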

Let's check the shape of the train and test data. Later on, we will come back to this data version and reproduce the results of the experiment.

print(f\"Initial train data shape: {train.shape}\")\nprint(f\"Initial test data shape: {test.shape}\")\n
\nInitial train data shape: (6570, 15)\nInitial test data shape: (2190, 15)\n\n

We now proceed to train a decision tree classifier and evaluate it on the test set:

from sklearn.tree import DecisionTreeClassifier\n\ndependent_variable = \"is_raining_in_1_day\"\n\nmodel = DecisionTreeClassifier(random_state=7)\n\nx_train, y_train = train.drop(dependent_variable, axis=1), train[dependent_variable].astype(bool)\nx_test, y_test = test.drop(dependent_variable, axis=1), test[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 87.31%\n\n
outfile = _maybe_urlretrieve(\n    \"https://archive-api.open-meteo.com/v1/archive?latitude=52.52&amp;longitude=13.41&amp;start_date=2020-01-01&amp;end_date=2020-12-31&amp;hourly=temperature_2m,relativehumidity_2m,rain,pressure_msl,surface_pressure,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m\",\n    \"weather-2020.json\",\n)\n\nnew_data = transform_json_weather_data(outfile)\n\nwith fs.transaction(REPO_NAME, \"main\") as tx:\n    new_data.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/weather_2020.csv\")\n    tx.commit(message=\"Add 2020 weather data\")\n\n# Remove leftover temporary files from previous `urlretrieve` calls\nurllib.request.urlcleanup()\n

Let's concatenate the old data and the new data, create a new train-test split, and push the updated files to lakeFS:

new_data = new_data.drop(\"time\", axis=1)\nfull_data = pd.concat([new_data, train, test])\n\ntrain_df, test_df = sklearn.model_selection.train_test_split(full_data, random_state=7)\n\nprint(f\"Updated train data shape: {train_df.shape}\")\nprint(f\"Updated test data shape: {test_df.shape}\")\n\nwith fs.transaction(REPO_NAME, TRAINING_BRANCH) as tx:\n    train_df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/train_weather.csv\")\n    test_df.to_csv(f\"lakefs://{REPO_NAME}/{tx.branch.id}/test_weather.csv\")\n    tx.commit(message=\"Add train-test split of 2010 and 2020 data\")\n
\nUpdated train data shape: (13158, 15)\nUpdated test data shape: (4386, 15)\n\n

Now we train the model on the new data and validate it on the new test data.

x_train, y_train = (\n    train_df.drop(dependent_variable, axis=1),\n    train_df[dependent_variable].astype(bool),\n)\nx_test, y_test = test_df.drop(dependent_variable, axis=1), test_df[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\n\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 82.67%\n\n
# access the data of the previous commit with a lakefs ref expression, in this case the same as in git.\nprevious_commit = repo.ref(f\"{TRAINING_BRANCH.id}~\").get_commit()\nfixed_commit_id = previous_commit.id\nprint(fixed_commit_id)\n
\n698964fd36f1fa620e92c786e62fc0d59c2f62b8c82eb721cc1233f95520702d\n\n

Let's check whether this commit SHA indeed gives us back the initial train and test data by comparing it against the data we still have in memory:

orig_train = pd.read_csv(f\"lakefs://{REPO_NAME}/{fixed_commit_id}/train_weather.csv\", index_col=0)\norig_test = pd.read_csv(f\"lakefs://{REPO_NAME}/{fixed_commit_id}/test_weather.csv\", index_col=0)\n\nprint(f\"Is the pulled training data equal to the local training data? {train.equals(orig_train)}\")\nprint(f\"Is the pulled test data equal to the local test data? {test.equals(orig_test)}\")\n
\nIs the pulled training data equal to the local training data? True\nIs the pulled test data equal to the local test data? True\n\n

Let's train and validate the model again based on the redownloaded data and see if we manage to reproduce the initial accuracy.

x_train, y_train = train.drop(dependent_variable, axis=1), train[dependent_variable].astype(bool)\nx_test, y_test = test.drop(dependent_variable, axis=1), test[dependent_variable].astype(bool)\n\nmodel.fit(x_train, y_train)\n\ntest_acc = model.score(x_test, y_test)\n\nprint(f\"Test accuracy: {test_acc:.2%}\")\n
\nTest accuracy: 87.31%\n\n
with fs.transaction(REPO_NAME, \"main\") as tx:\n    # returns the tag as a lakeFS object.\n    tag = tx.tag(fixed_commit_id, name=\"train-test-split-2010\")\n

Now we can access the specific files via the semantic tag. Both fixed_commit_id and tag reference the same data version (ref) in lakeFS, whereas a branch name always points to the latest version on that branch.

train_from_commit = pd.read_csv(\n    f\"lakefs://{REPO_NAME}/{fixed_commit_id}/train_weather.csv\", index_col=0\n)\ntrain_from_tag = pd.read_csv(f\"lakefs://{REPO_NAME}/{tag.id}/train_weather.csv\", index_col=0)\n

We can verify this by comparing the DataFrames: train_from_commit and train_from_tag are equal.

print(\n    f\"Is the data tagged {tag!r} equal to the data in commit {fixed_commit_id[:8]}? {train_from_commit.equals(train_from_tag)}\"\n)\n
\nIs the data tagged Tag(repository=\"weather\", id=\"train-test-split-2010\") equal to the data in commit 698964fd? True\n\n
"},{"location":"tutorials/demo_data_science_project/#data-science-with-lakefs-spec","title":"Data Science with lakeFS-spec","text":"

In this notebook, we will complete a small end-to-end data science tutorial that employs lakeFS-spec for data versioning. We will use versioned weather data to train a decision tree classifier to predict whether it is raining tomorrow given the current weather.

We will do the following:

  • Environment setup
  • lakeFS setup
  • Authenticating with the lakeFS server
  • Data ingestion via transactions
  • Model training
  • Updating data and retraining a model
  • Accessing data versions and reproducing experiments
  • Using tags for semantic versioning

Local Execution

If you want to execute the code in this tutorial as a Jupyter notebook yourself, download the demo_data_science_project.py file from the lakeFS-spec repository.

You can then convert the Python file to a notebook with Jupytext by running the following command: jupytext --to notebook demo_data_science_project.py.

This tutorial assumes that you have installed lakeFS-spec in a virtual environment, and that you have followed the quickstart guide to set up a local lakeFS instance.

"},{"location":"tutorials/demo_data_science_project/#environment-setup","title":"Environment setup","text":"

Install the necessary libraries for this notebook into the environment you have just created.
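The corresponding install cell, executed at the top of this notebook, reads:

%pip install numpy pandas scikit-learn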

"},{"location":"tutorials/demo_data_science_project/#lakefs-setup","title":"lakeFS Setup","text":"

With Docker Desktop or a similar container runtime running, set up lakeFS by executing the following docker run command (from the lakeFS quickstart) in your console:

docker run --name lakefs --pull always --rm --publish 8000:8000 treeverse/lakefs:latest run --quickstart\n

You will find the authentication credentials in the terminal output. The default address for the local lakeFS GUI is http://localhost:8000/.

"},{"location":"tutorials/demo_data_science_project/#authenticating-with-the-lakefs-server","title":"Authenticating with the lakeFS server","text":"

There are multiple ways to authenticate with lakeFS from Python code. In this tutorial, we choose the YAML file configuration. By executing the cell below, you will download a YAML file containing the default lakeFS quickstart credentials and server URL to your user directory.
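The relevant part of that cell (shown in full earlier in this notebook) downloads the prepared configuration file to ~/.lakectl.yaml:

import os
import urllib.request

urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/aai-institute/lakefs-spec/main/docs/tutorials/.lakectl.yaml",
    os.path.expanduser("~/.lakectl.yaml"),
)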

"},{"location":"tutorials/demo_data_science_project/#data-ingestion","title":"Data Ingestion","text":"

Now it's time to get some data. We will use the Open-Meteo API, which lets us pull weather data for free (for non-commercial use) and without an API token. In order to avoid hitting rate limits when repeatedly querying the API (and out of courtesy towards its operators), the _maybe_urlretrieve function provides a simple local cache for the downloaded data.

For training our toy model, we download the full weather data of Berlin (latitude 52.52, longitude 13.41) for the year 2010:

"},{"location":"tutorials/demo_data_science_project/#upload-a-file-using-transactions","title":"Upload a file using transactions","text":"

lakeFS works similarly to Git as a versioning system. You can create commits that contain specific changes to the data. You can also work with branches to create your own isolated view of the data, independently of your colleagues. Every commit (on any branch) is identified by a commit SHA. This SHA can be used to programmatically interact with specific states of your data, and enables logging of the exact data versions used to create a certain model.

To easily carry out versioning operations while uploading files, you can use transactions. A transaction is a context manager that keeps track of all files that were uploaded in its scope, as well as all versioning operations happening between file uploads. All operations are deferred to the end of the transaction, and are executed sequentially on completion.

To create a commit after a file upload, you can run the following transaction:
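(This repeats the transaction cell shown earlier in this notebook, which uploads and commits the raw weather data.)

NEW_BRANCH = lakefs.Branch(REPO_NAME, "transform-raw-data", client=fs.client)
NEW_BRANCH.create("main")

with fs.transaction(REPO_NAME, NEW_BRANCH) as tx:
    fs.put(outfile, f"{REPO_NAME}/{tx.branch.id}/weather-2010.json")
    tx.commit(message="Add 2010 weather data")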

"},{"location":"tutorials/demo_data_science_project/#data-transformation","title":"Data Transformation","text":"

Now let's transform the data for our use case. We put the transformation into a function to be able to reuse it later.

In this notebook, we use a simple toy model to predict whether it is raining at the same time tomorrow given weather data from right now.

We will skip a lot of possible feature engineering and other data science aspects in order to focus more on the application of the LakeFSFileSystem.

"},{"location":"tutorials/demo_data_science_project/#training-the-initial-weather-model","title":"Training the initial weather model","text":"

First we will do a train-test split:
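(This repeats the split cell shown earlier in this notebook.)

import sklearn.model_selection

model_data = df.drop("time", axis=1)

train, test = sklearn.model_selection.train_test_split(model_data, random_state=7)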

"},{"location":"tutorials/demo_data_science_project/#updating-data-and-retraining-the-model","title":"Updating data and retraining the model","text":"

Until now, we have only used data from 2010. Let's download additional data from 2020, transform it, and save it to lakeFS.

"},{"location":"tutorials/demo_data_science_project/#accessing-data-versions-through-commits-and-reproducing-experiments","title":"Accessing data versions through commits and reproducing experiments","text":"

If we need to go back to our initial data and reproduce the first experiment (the model trained on the 2010 data with its initial accuracy), we can go back in the commit history of the training branch and select the appropriate data snapshot (commit). Since we have already created multiple commits on the same branch, we will address the different data versions by their commit SHAs.

To obtain the actual commit SHA from a branch, we have multiple options. Manually, we could go into the lakeFS UI, select the training branch, and navigate to the Commits tab. There, we take the parent of the previous commit, titled Add train-test split of 2010 weather data, and copy its revision SHA (also called ID).

In code, we can obtain commit SHAs for different revisions on the training branch by using lakefs.Reference objects.
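For reference, the cell shown earlier in this notebook does exactly that, using a Git-style ref expression in which the tilde selects the parent of the branch head:

# "~" addresses the parent commit, with the same syntax as in Git
previous_commit = repo.ref(f"{TRAINING_BRANCH.id}~").get_commit()
fixed_commit_id = previous_commit.id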

"},{"location":"tutorials/demo_data_science_project/#using-tags-instead-of-commit-shas-for-semantic-versioning","title":"Using tags instead of commit SHAs for semantic versioning","text":"

The above method of data versioning works great when you have experiment tracking tools that store and retrieve the commit SHA in automated pipelines, but retrieving SHAs by hand during prototyping can be tedious. We can make selected versions of the dataset more accessible through semantic versioning by attaching a human-readable tag to a specific commit SHA.

Creating a tag is easiest when done inside a transaction, just like the file uploads we performed earlier. To do this, simply call tx.tag on the transaction and supply the commit SHA to tag and the intended tag name. Tags are immutable once created, so attempting to tag two different commits with the same name will result in an error.
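For reference, the tagging cell from earlier in this notebook:

with fs.transaction(REPO_NAME, "main") as tx:
    # returns the tag as a lakeFS object
    tag = tx.tag(fixed_commit_id, name="train-test-split-2010")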

"}]} \ No newline at end of file diff --git a/development/sitemap.xml b/development/sitemap.xml index dd53ecd8..eb007fa5 100644 --- a/development/sitemap.xml +++ b/development/sitemap.xml @@ -2,82 +2,82 @@ https://lakefs-spec.org/latest/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/CONTRIBUTING/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/quickstart/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/configuration/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/filesystem-usage/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/integrations/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/guides/transactions/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/SUMMARY/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/errors/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/spec/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/transaction/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/reference/lakefs_spec/util/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/tutorials/ - 2024-02-21 + 2024-02-28 daily https://lakefs-spec.org/latest/tutorials/demo_data_science_project/ - 2024-02-21 + 2024-02-28 daily \ No newline at end of file diff --git a/development/sitemap.xml.gz b/development/sitemap.xml.gz index c0020266..902edf25 100644 Binary files a/development/sitemap.xml.gz and b/development/sitemap.xml.gz differ diff --git a/development/tutorials/demo_data_science_project/index.html b/development/tutorials/demo_data_science_project/index.html index f9d52518..aa5f07eb 100644 --- a/development/tutorials/demo_data_science_project/index.html +++ b/development/tutorials/demo_data_science_project/index.html @@ -112,21 +112,55 @@ - + + + + + +
-
@@ -1064,6 +1112,7 @@

Environment setup
 Collecting numpy
+  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
 
 
@@ -1071,9 +1120,7 @@

Environment setup
-  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/61.0 kB ? eta -:--:--
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.0/61.0 kB 3.6 MB/s eta 0:00:00
+Collecting pandas
 
 
@@ -1081,8 +1128,7 @@

Environment setup
-Collecting pandas
-  Downloading pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
+  Downloading pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
 
 
@@ -1091,7 +1137,7 @@

Environment setup
 Collecting scikit-learn
-  Downloading scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
+  Using cached scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
 
 
@@ -1102,7 +1148,7 @@

Environment setupEnvironment setup
 Collecting scipy>=1.6.0 (from scikit-learn)
-  Downloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/60.4 kB ? eta -:--:--
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.4/60.4 kB 17.5 MB/s eta 0:00:00
+  Using cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
+Collecting joblib>=1.2.0 (from scikit-learn)
 
 
@@ -1121,17 +1166,9 @@

Environment setup
-Collecting joblib>=1.2.0 (from scikit-learn)
-  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
-
-
-
- -
- -
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/18.3 MB 65.1 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.4/18.3 MB 107.5 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.6/18.3 MB 152.4 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.3 MB 163.2 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.3 MB 163.2 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 94.2 MB/s eta 0:00:00
-
-
-
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 154.8 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 102.0 MB/s eta 0:00:00
-
-
-
-
-
-
-
-Downloading scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/12.1 MB ? eta -:--:--
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.5/12.1 MB 166.6 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.3/12.1 MB 151.7 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.1/12.1 MB 151.0 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.1/12.1 MB 103.2 MB/s eta 0:00:00
-Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/302.2 kB ? eta -:--:--
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 kB 59.9 MB/s eta 0:00:00
-Downloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/38.4 MB ? eta -:--:--
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.4/38.4 MB 162.5 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/13.0 MB 76.4 MB/s eta 0:00:01
 
@@ -1243,7 +1195,8 @@

Environment setup
 
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.6/38.4 MB 156.3 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.2/13.0 MB 90.8 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.8/13.0 MB 123.4 MB/s eta 0:00:01
 
@@ -1251,43 +1204,18 @@

Environment setup
 
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.1/38.4 MB 156.9 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.4/38.4 MB 158.4 MB/s eta 0:00:01
-
- - -
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7/38.4 MB 155.5 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 32.2/38.4 MB 157.9 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.2/38.4 MB 169.0 MB/s eta 0:00:01
-
-
-
-
-
-
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.4/38.4 MB 165.0 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 164.6 MB/s eta 0:00:01
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 107.9 MB/s eta 0:00:00
+Using cached scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
+
 
-
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.4/38.4 MB 165.0 MB/s eta 0:00:01
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.4/38.4 MB 66.1 MB/s eta 0:00:00
-Downloading threadpoolctl-3.3.0-py3-none-any.whl (17 kB)
+Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
+Using cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)
 
 
@@ -1295,9 +1223,8 @@

Environment setup
-Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/345.4 kB ? eta -:--:--
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 64.5 MB/s eta 0:00:00
+Using cached threadpoolctl-3.3.0-py3-none-any.whl (17 kB)
+Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
 
 
@@ -1313,7 +1240,7 @@

Environment setup
-Successfully installed joblib-1.3.2 numpy-1.26.4 pandas-2.2.0 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0 tzdata-2024.1
+Successfully installed joblib-1.3.2 numpy-1.26.4 pandas-2.2.1 scikit-learn-1.4.1.post1 scipy-1.12.0 threadpoolctl-3.3.0 tzdata-2024.1
 
 
@@ -1370,7 +1297,7 @@

Authenticating with the lakeFS se
-('/home/runner/.lakectl.yaml', <http.client.HTTPMessage at 0x7fd726d97590>)
+('/home/runner/.lakectl.yaml', <http.client.HTTPMessage at 0x7f781c8cc090>)
 
@@ -1535,21 +1462,6 @@

Data Transformation
-
-
-/tmp/ipykernel_2291/2823322696.py:3: DeprecationWarning: 
-Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
-(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
-but was not found to be installed on your system.
-If this would cause problems for you,
-please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
-
-  import pandas as pd
-
-
-
-
-