From 1d1ed21325c16df9d50287e15fb71d74af22c6f0 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Fri, 24 Nov 2023 12:20:30 +0100
Subject: [PATCH 1/6] docs: Add third-party integrations user guide (WIP)

Section on Pandas done, rest as skeleton.
---
 docs/guides/index.md        |  1 +
 docs/guides/integrations.md | 52 +++++++++++++++++++++++++++++++++++++++
 mkdocs.yml                  |  1 +
 3 files changed, 54 insertions(+)
 create mode 100644 docs/guides/integrations.md

diff --git a/docs/guides/index.md b/docs/guides/index.md
index d847abd3..6f54a575 100644
--- a/docs/guides/index.md
+++ b/docs/guides/index.md
@@ -6,3 +6,4 @@ See the [Quickstart guide](../quickstart.md) for an introductory tutorial.
 - [How to use the lakeFS file system](filesystem-usage.md)
 - [Passing configuration to the file system](configuration.md)
 - [Using file system transactions](transactions.md)
+- [How to use `lakefs-spec` with third-party data science libraries](integrations.md)

diff --git a/docs/guides/integrations.md b/docs/guides/integrations.md
new file mode 100644
index 00000000..0312b4c2
--- /dev/null
+++ b/docs/guides/integrations.md
@@ -0,0 +1,52 @@
+# How to use `lakefs-spec` with third-party data science libraries
+
+`lakefs-spec` is built on top of the `fsspec` library, which allows third-party libraries to make use of its file system abstraction to offer high-level features.
+The [`fsspec` documentation](https://filesystem-spec.readthedocs.io/en/latest/#who-uses-fsspec){: target="_blank" rel="noopener"} lists examples of its users, mostly data science libraries.
+
+This user guide page adds more detail on how `lakefs-spec` can be used with four prominent data science libraries.
+
+!!! tip "Code Examples"
+    The code examples assume access to an existing lakeFS server with a `quickstart` repository containing the sample data set.
+
+    Please see the [Quickstart guide](../quickstart.md) if you need guidance in getting started.
+
+## Pandas
+
+[Pandas](https://pandas.pydata.org){: target="_blank" rel="noopener"} can read data from and write data to remote locations, and uses `fsspec` for all URLs that are not local or HTTP(S).
+
+This means that (almost) all `pd.read_*` and `pd.DataFrame.to_*` operations can benefit from the lakeFS integration offered by our library without any additional configuration.
+See the Pandas documentation on [reading/writing remote files](https://pandas.pydata.org/docs/user_guide/io.html#reading-writing-remote-files){: target="_blank" rel="noopener"} for additional details.
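+
+For example, reading a single object from a lakeFS repository is a one-liner (a minimal sketch, using the sample data set mentioned above):
+
+```python
+import pandas as pd
+
+# The lakefs:// URL is resolved through fsspec to the file system provided by lakefs-spec.
+lakes = pd.read_parquet("lakefs://quickstart/main/lakes.parquet")
+```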
+
+The following code snippet illustrates how to read and write Pandas data frames in various formats from/to a lakeFS repository in the context of a [transaction](transactions.md):
+
+```python
+import pandas as pd
+
+from lakefs_spec import LakeFSFileSystem
+
+fs = LakeFSFileSystem()
+
+with fs.transaction as tx:
+    tx.create_branch("quickstart", "german-lakes", "main")
+
+    lakes = pd.read_parquet("lakefs://quickstart/main/lakes.parquet")
+    german_lakes = lakes.query('Country == "Germany"')
+    german_lakes.to_csv("lakefs://quickstart/german-lakes/german_lakes.csv")
+
+    tx.commit("quickstart", "german-lakes", "Add German lakes")
+```
+
+## DuckDB
+
+## PyArrow
+
+## Polars

diff --git a/mkdocs.yml b/mkdocs.yml
index 65d38374..98671618 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -25,6 +25,7 @@ nav:
   - guides/filesystem-usage.md
   - guides/configuration.md
   - guides/transactions.md
+  - guides/integrations.md
   - Tutorials:
     - tutorials/index.md
     - tutorials/demo_data_science_project.ipynb

From 7fa05c520f5743826efa3eb4241d38675040b1e8 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Fri, 24 Nov 2023 12:52:05 +0100
Subject: [PATCH 2/6] docs: Add DuckDB example to integrations user guide

---
 docs/guides/integrations.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/docs/guides/integrations.md b/docs/guides/integrations.md
index 0312b4c2..5ca0de6f 100644
--- a/docs/guides/integrations.md
+++ b/docs/guides/integrations.md
@@ -47,6 +47,32 @@ with fs.transaction as tx:
 
 ## DuckDB
 
+Similar to the example above, the following code snippet illustrates how to read and write data from/to a lakeFS repository in the context of a [transaction](transactions.md) through the DuckDB Python API:
+
+```python
+import duckdb
+
+from lakefs_spec import LakeFSFileSystem
+
+fs = LakeFSFileSystem()
+duckdb.register_filesystem(fs)  # (1)!
+
+with fs.transaction as tx:
+    tx.create_branch("quickstart", "german-lakes", "main")
+
+    lakes = duckdb.read_parquet("lakefs://quickstart/main/lakes.parquet")
+    german_lakes = duckdb.sql("SELECT * FROM lakes WHERE Country = 'Germany'")
+    german_lakes.to_csv("lakefs://quickstart/german-lakes/german_lakes.csv")
+
+    tx.commit("quickstart", "german-lakes", "Add German lakes")
+```
+
+1. Makes the `lakefs-spec` file system known to DuckDB (`duckdb.register_filesystem(fsspec.filesystem("lakefs"))` can also be used to avoid the direct import of `LakeFSFileSystem`)
+
 ## PyArrow
 
+!!! todo
+
 ## Polars
+
+!!! todo

From a34731263e4a6de1453f4204a06aef9903856985 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Fri, 24 Nov 2023 12:58:22 +0100
Subject: [PATCH 3/6] docs: Add more DuckDB explanation

---
 docs/guides/integrations.md | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/docs/guides/integrations.md b/docs/guides/integrations.md
index 5ca0de6f..69dd967f 100644
--- a/docs/guides/integrations.md
+++ b/docs/guides/integrations.md
@@ -47,7 +47,20 @@ with fs.transaction as tx:
 
 ## DuckDB
 
-Similar to the example above, the following code snippet illustrates how to read and write data from/to a lakeFS repository in the context of a [transaction](transactions.md) through the DuckDB Python API:
+The [DuckDB](https://duckdb.org/){: target="_blank" rel="noopener"} in-memory database management system includes support for `fsspec` file systems as part of its Python API (see the official documentation on [using fsspec filesystems](https://duckdb.org/docs/guides/python/filesystems.html){: target="_blank" rel="noopener"} for details).
+This allows DuckDB to transparently query and store data located in lakeFS repositories through `lakefs-spec`.
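+
+For instance, once the file system is registered, `lakefs://` URIs can be queried directly in SQL (a minimal sketch, reusing the quickstart data set):
+
+```python
+import duckdb
+import fsspec
+
+duckdb.register_filesystem(fsspec.filesystem("lakefs"))  # register the lakefs:// protocol with DuckDB
+print(duckdb.sql("SELECT COUNT(*) FROM read_parquet('lakefs://quickstart/main/lakes.parquet')").fetchone())
+```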
+
+Similar to the Pandas example above, the following code snippet illustrates how to read and write data from/to a lakeFS repository in the context of a [transaction](transactions.md) through the [DuckDB Python API](https://duckdb.org/docs/api/python/overview.html){: target="_blank" rel="noopener"}:
 
 ```python
 import duckdb

From aac96b41f55c5626d5dfd274d6ebe8165567a7e2 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Fri, 24 Nov 2023 14:26:17 +0100
Subject: [PATCH 4/6] docs: Add Polars to integrations user guide

---
 docs/guides/integrations.md | 33 +++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/docs/guides/integrations.md b/docs/guides/integrations.md
index 69dd967f..1bedfce5 100644
--- a/docs/guides/integrations.md
+++ b/docs/guides/integrations.md
@@ -28,7 +28,7 @@ See the Pandas documentation on [reading/writing remote files](https://pandas.py
 
 The following code snippet illustrates how to read and write Pandas data frames in various formats from/to a lakeFS repository in the context of a [transaction](transactions.md):
 
-```python
+```python hl_lines="10 12"
 import pandas as pd
 
 from lakefs_spec import LakeFSFileSystem
@@ -62,7 +62,7 @@ This allows DuckDB to transparently query and store data located in lakeFS repos
 
 Similar to the Pandas example above, the following code snippet illustrates how to read and write data from/to a lakeFS repository in the context of a [transaction](transactions.md) through the [DuckDB Python API](https://duckdb.org/docs/api/python/overview.html){: target="_blank" rel="noopener"}:
 
-```python
+```python hl_lines="6 11 13"
 import duckdb
 
 from lakefs_spec import LakeFSFileSystem
@@ -88,4 +88,31 @@ with fs.transaction as tx:
 
 ## Polars
 
-!!! todo
+!!! warning
+    There is an ongoing discussion in the Polars development team about whether to remove support for `fsspec` file systems, with no clear outcome as of this writing.
+    Please refer to the discussion on the relevant [GitHub issue](https://github.com/pola-rs/polars/issues/11056){: target="_blank" rel="noopener"} in case you encounter any problems.
+
+The Python API wrapper for the Rust-based [Polars](https://pola-rs.github.io/polars/){: target="_blank" rel="noopener"} DataFrame library can access remote storage through `fsspec`, similar to Pandas (see the official [documentation on cloud storage](https://pola-rs.github.io/polars/user-guide/io/cloud-storage/){: target="_blank" rel="noopener"}).
+
+Again, the following code example demonstrates how Polars can read a Parquet file from a lakeFS repository and save a modified version back in CSV format, in the context of a [transaction](transactions.md):
+
+```python hl_lines="10 13-14"
+import polars as pl
+
+from lakefs_spec import LakeFSFileSystem
+
+fs = LakeFSFileSystem()
+
+with fs.transaction as tx:
+    tx.create_branch("quickstart", "german-lakes", "main")
+
+    lakes = pl.read_parquet("lakefs://quickstart/main/lakes.parquet")
+    german_lakes = lakes.filter(pl.col("Country") == "Germany")
+
+    with fs.open("lakefs://quickstart/german-lakes/german_lakes.csv", "wb") as f:  # (1)!
+        german_lakes.write_csv(f)
+
+    tx.commit("quickstart", "german-lakes", "Add German lakes")
+```
+
+1. Polars does not support directly writing to remote storage through the `pl.DataFrame.write_*` API (see [docs](https://pola-rs.github.io/polars/user-guide/io/cloud-storage/#writing-to-cloud-storage))

From 743114283de4995efff16fcba0bf25a2bd7ec5c3 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Fri, 24 Nov 2023 14:49:56 +0100
Subject: [PATCH 5/6] docs: Add PyArrow to integrations user guide

---
 docs/guides/integrations.md | 50 ++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/docs/guides/integrations.md b/docs/guides/integrations.md
index 1bedfce5..402142f2 100644
--- a/docs/guides/integrations.md
+++ b/docs/guides/integrations.md
@@ -10,6 +10,8 @@ This user guide page adds more detail on how `lakefs-spec` can be used with four
 
     Please see the [Quickstart guide](../quickstart.md) if you need guidance in getting started.
 
+    The relevant lines for the `lakefs-spec` integration in these examples are highlighted.
+
 ## Pandas
 
 [Pandas](https://pandas.pydata.org){: target="_blank" rel="noopener"} can read data from and write data to remote locations, and uses `fsspec` for all URLs that are not local or HTTP(S).
@@ -82,10 +84,6 @@ with fs.transaction as tx:
 
 1. Makes the `lakefs-spec` file system known to DuckDB (`duckdb.register_filesystem(fsspec.filesystem("lakefs"))` can also be used to avoid the direct import of `LakeFSFileSystem`)
 
-## PyArrow
-
-!!! todo
-
 ## Polars
 
 !!! warning
@@ -116,3 +114,47 @@ with fs.transaction as tx:
 ```
 
 1. Polars does not support directly writing to remote storage through the `pl.DataFrame.write_*` API (see [docs](https://pola-rs.github.io/polars/user-guide/io/cloud-storage/#writing-to-cloud-storage))
+
+## PyArrow
+
+[Apache Arrow](https://arrow.apache.org/){: target="_blank" rel="noopener"} and its Python API, [PyArrow](https://arrow.apache.org/docs/python/){: target="_blank" rel="noopener"}, can also use `fsspec` file systems to perform I/O operations on data objects. The documentation has additional details on [using fsspec-compatible file systems with Arrow](https://arrow.apache.org/docs/python/filesystems.html#using-fsspec-compatible-filesystems-with-arrow){: target="_blank" rel="noopener"}.
+
+PyArrow `read_*` and `write_*` functions take an explicit `filesystem` parameter, which accepts any `fsspec` file system, such as the `LakeFSFileSystem` provided by this library.
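+
+For PyArrow APIs that expect a native `pyarrow.fs.FileSystem` instead, the `fsspec` file system can be wrapped explicitly (a minimal sketch; see the Arrow documentation linked above):
+
+```python
+from pyarrow.fs import FSSpecHandler, PyFileSystem
+
+from lakefs_spec import LakeFSFileSystem
+
+# Wrap the fsspec file system in a PyArrow-native file system object.
+pa_fs = PyFileSystem(FSSpecHandler(LakeFSFileSystem()))
+```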
+
+The following example code illustrates the use of `lakefs-spec` with PyArrow, reading a Parquet file and writing it back to a lakeFS repository as a partitioned dataset in the context of a [transaction](transactions.md):
+
+```python hl_lines="12 17"
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pyarrow.parquet as pq
+
+from lakefs_spec import LakeFSFileSystem
+
+fs = LakeFSFileSystem()
+
+with fs.transaction as tx:
+    tx.create_branch("quickstart", "partitioned-data", "main")
+
+    lakes_table = pq.read_table("quickstart/main/lakes.parquet", filesystem=fs)
+
+    ds.write_dataset(
+        lakes_table,
+        "quickstart/partitioned-data/lakes",
+        filesystem=fs,
+        format="parquet",
+        partitioning=ds.partitioning(pa.schema([lakes_table.schema.field("Country")])),
+    )
+
+    tx.commit("quickstart", "partitioned-data", "Add partitioned lakes data")
+```

From e52999a6d84b15f6ce3a21057494fc1b83defd23 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Fri, 24 Nov 2023 14:54:22 +0100
Subject: [PATCH 6/6] docs: Make PyArrow example write back partitioned CSV dataset

---
 docs/guides/integrations.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/guides/integrations.md b/docs/guides/integrations.md
index 402142f2..2a90a683 100644
--- a/docs/guides/integrations.md
+++ b/docs/guides/integrations.md
@@ -132,7 +132,7 @@ with fs.transaction as tx:
 pa_fs = PyFileSystem(FSSpecHandler(LakeFSFileSystem()))
 ```
 
-The following example code illustrates the use of `lakefs-spec` with PyArrow, reading a Parquet file and writing it back to a lakeFS repository as a partitioned dataset in the context of a [transaction](transactions.md):
+The following example code illustrates the use of `lakefs-spec` with PyArrow, reading a Parquet file and writing it back to a lakeFS repository as a partitioned CSV dataset in the context of a [transaction](transactions.md):
 
 ```python hl_lines="12 17"
 import pyarrow as pa
@@ -152,7 +152,7 @@ with fs.transaction as tx:
         lakes_table,
         "quickstart/partitioned-data/lakes",
         filesystem=fs,
-        format="parquet",
+        format="csv",
         partitioning=ds.partitioning(pa.schema([lakes_table.schema.field("Country")])),
     )
 