Skip to content

Commit 15c3526

Browse files
authored
Add integration tests for third-party data science libraries (#251)
* Add integration tests for third-party data science libraries. This is another smoke test to detect breakages in the file system APIs. Adds pandas, polars, and duckdb to the test requirements.
* Add `use_pyarrow=True` for polars Parquet reads. Without it, Parquet reads on the newest versions result in weird errors.
* Add an existence check after the upload in the pandas integration test. This is more robust than just checking whether the upload breaks.
* Add a docstring to the polars test.
1 parent a5a04f0 commit 15c3526

File tree

5 files changed

+67
-11
lines changed

5 files changed

+67
-11
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ print(data.head())
8686
# Polars -- see https://pola-rs.github.io/polars/user-guide/io/cloud-storage/
8787
import polars as pl
8888

89-
data = pl.read_parquet("lakefs://quickstart/main/lakes.parquet")
89+
data = pl.read_parquet("lakefs://quickstart/main/lakes.parquet", use_pyarrow=True)
9090
print(data.head())
9191

9292

pyproject.toml

+4
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ dev = [
5050
"pytest>=7.4.0",
5151
"pytest-cov>=4.1.0",
5252
"pydoclint",
53+
# for integration tests.
54+
"pandas[parquet]",
55+
"polars",
56+
"duckdb",
5357
]
5458
docs = [
5559
"mkdocs",

requirements-dev.txt

+8-1
Original file line numberDiff line numberDiff line change
@@ -11,26 +11,33 @@ click==8.1.7
1111
coverage[toml]==7.4.0
1212
distlib==0.3.8
1313
docstring-parser-fork==0.0.5
14+
duckdb==0.9.2
1415
filelock==3.13.1
1516
fsspec==2023.12.2
1617
identify==2.5.33
1718
iniconfig==2.0.0
1819
lakefs==0.2.0
19-
lakefs-sdk==1.5.0
20+
lakefs-sdk==1.7.0
2021
nodeenv==1.8.0
22+
numpy==1.26.3
2123
packaging==23.2
24+
pandas[parquet]==2.1.4
2225
platformdirs==4.1.0
2326
pluggy==1.3.0
27+
polars==0.20.3
2428
pre-commit==3.6.0
29+
pyarrow==14.0.2
2530
pydantic==1.10.13
2631
pydoclint==0.3.8
2732
pyproject-hooks==1.0.0
2833
pytest==7.4.4
2934
pytest-cov==4.1.0
3035
python-dateutil==2.8.2
36+
pytz==2023.3.post1
3137
pyyaml==6.0.1
3238
six==1.16.0
3339
typing-extensions==4.9.0
40+
tzdata==2023.4
3441
urllib3==2.0.7
3542
virtualenv==20.25.0
3643
wrapt==1.16.0

requirements-docs.txt

+9-9
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ fqdn==1.5.1
3434
fsspec==2023.12.2
3535
ghp-import==2.1.0
3636
gitdb==4.0.11
37-
gitpython==3.1.40
37+
gitpython==3.1.41
3838
griffe==0.38.1
3939
h11==0.14.0
4040
httpcore==1.0.2
@@ -43,7 +43,7 @@ idna==3.6
4343
importlib-metadata==7.0.1
4444
importlib-resources==6.1.1
4545
ipykernel==6.28.0
46-
ipython==8.19.0
46+
ipython==8.20.0
4747
ipywidgets==8.1.1
4848
isoduration==20.11.0
4949
jedi==0.19.1
@@ -55,18 +55,18 @@ jsonschema-specifications==2023.12.1
5555
jupyter==1.0.0
5656
jupyter-client==8.6.0
5757
jupyter-console==6.6.3
58-
jupyter-core==5.6.1
58+
jupyter-core==5.7.1
5959
jupyter-events==0.9.0
6060
jupyter-lsp==2.2.1
61-
jupyter-server==2.12.1
61+
jupyter-server==2.12.3
6262
jupyter-server-terminals==0.5.1
6363
jupyterlab==4.0.10
6464
jupyterlab-pygments==0.3.0
6565
jupyterlab-server==2.25.2
6666
jupyterlab-widgets==3.0.9
6767
jupytext==1.16.0
6868
lakefs==0.2.0
69-
lakefs-sdk==1.5.0
69+
lakefs-sdk==1.7.0
7070
markdown==3.5.1
7171
markdown-it-py==3.0.0
7272
markupsafe==2.1.3
@@ -87,7 +87,7 @@ mkdocs-material==9.5.3
8787
mkdocs-material-extensions==1.3.1
8888
mkdocs-section-index==0.3.8
8989
mkdocstrings[python]==0.24.0
90-
mkdocstrings-python==1.7.5
90+
mkdocstrings-python==1.8.0
9191
mknotebooks==0.8.0
9292
mypy-extensions==1.0.0
9393
nbclient==0.9.0
@@ -123,7 +123,7 @@ pyyaml-env-tag==0.1
123123
pyzmq==25.1.2
124124
qtconsole==5.5.1
125125
qtpy==2.4.1
126-
referencing==0.32.0
126+
referencing==0.32.1
127127
regex==2023.12.25
128128
requests==2.31.0
129129
rfc3339-validator==0.1.4
@@ -141,13 +141,13 @@ tinycss2==1.2.1
141141
toml==0.10.2
142142
tornado==6.4
143143
traitlets==5.14.1
144-
types-python-dateutil==2.8.19.14
144+
types-python-dateutil==2.8.19.20240106
145145
typing-extensions==4.9.0
146146
uri-template==1.3.0
147147
urllib3==2.0.7
148148
verspec==0.1.0
149149
watchdog==3.0.0
150-
wcwidth==0.2.12
150+
wcwidth==0.2.13
151151
webcolors==1.13
152152
webencodings==0.5.1
153153
websocket-client==1.7.0
+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import duckdb
2+
import numpy as np
3+
import pandas as pd
4+
import polars as pl
5+
from lakefs.branch import Branch
6+
from lakefs.repository import Repository
7+
8+
from lakefs_spec.spec import LakeFSFileSystem
9+
10+
# Connection settings passed as `storage_options` by the pandas and polars tests.
storage_options = {
    "host": "localhost:8000",
    "username": "AKIAIOSFOLQUICKSTART",
    "password": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
}
15+
16+
17+
def test_pandas_integration(
    fs: LakeFSFileSystem, repository: Repository, temp_branch: Branch
) -> None:
    """Round-trip a Parquet file through pandas on a temporary branch.

    Reads ``lakes.parquet`` from the branch, appends a column of random values, writes the
    frame back as ``lakes_new.parquet``, and asserts that the new object exists. The pandas
    reads and writes are expected to go through `fs.open()`.
    """
    source_uri = f"lakefs://{repository.id}/{temp_branch.id}/lakes.parquet"
    frame = pd.read_parquet(source_uri, storage_options=storage_options)

    # One random value per row, so the written file differs from the one that was read.
    row_count = len(frame.index)
    frame["randomcol"] = np.random.randn(row_count)

    target_uri = f"lakefs://{repository.id}/{temp_branch.id}/lakes_new.parquet"
    frame.to_parquet(target_uri, storage_options=storage_options)

    # Check for the uploaded object itself rather than relying on the absence of an error.
    assert fs.exists(f"lakefs://{repository.id}/{temp_branch.id}/lakes_new.parquet")
30+
31+
32+
def test_polars_integration(repository: Repository) -> None:
    """Smoke-test loading a Parquet file from the ``main`` branch into a polars DataFrame.

    The read is requested with ``use_pyarrow=True`` and the result is discarded, so the
    test passes as long as `pl.read_parquet` raises no exception.
    """
    parquet_uri = f"lakefs://{repository.id}/main/lakes.parquet"
    read_kwargs = {"use_pyarrow": True, "storage_options": storage_options}
    pl.read_parquet(parquet_uri, **read_kwargs)
39+
40+
41+
def test_duckdb_integration(fs: LakeFSFileSystem, repository: Repository) -> None:
    """Smoke-test reading a Parquet file through duckDB after registering `fs` with it.

    The query result is discarded; the test passes as long as neither call raises.
    """
    # Registration follows https://duckdb.org/docs/guides/python/filesystems.html
    duckdb.register_filesystem(fs)

    parquet_uri = f"lakefs://{repository.id}/main/lakes.parquet"
    duckdb.read_parquet(parquet_uri)

0 commit comments

Comments
 (0)