-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
272 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,7 +27,7 @@ py-prep: | |
|
||
.PHONY: sync | ||
sync: | ||
uv sync | ||
uv sync --extra all | ||
|
||
.PHONY: fix | ||
fix: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from typing import Any, Iterable, Union | ||
|
||
|
||
try: | ||
import polars as pl | ||
|
||
ENABLED = True | ||
except ImportError: | ||
ENABLED = False | ||
|
||
from airtrain.core import CreationArgs, DatasetMetadata, Unpack, upload_from_arrow_tables | ||
|
||
|
||
# In case polars is not installed | ||
DataFrame = Any | ||
|
||
|
||
def upload_from_polars(
    data: Union[Iterable[DataFrame], DataFrame],
    **kwargs: Unpack[CreationArgs],
) -> DatasetMetadata:
    """Create an Airtrain dataset out of one or more polars DataFrames.

    Parameters
    ----------
    data:
        A single polars DataFrame, or an iterable yielding DataFrames.
        Every frame is converted to a pyarrow table before being uploaded.
    kwargs:
        Forwarded unchanged to `upload_from_arrow_tables`; see that function
        for the accepted arguments.

    Returns
    -------
    A DatasetMetadata object summarizing the created dataset.

    Raises
    ------
    ImportError
        If polars could not be imported when this module was loaded.
    """
    if not ENABLED:
        raise ImportError(
            "Polars integration not enabled. Please install Airtrain package as "
            "`airtrain-py[polars]`"
        )

    # Normalize the single-frame case so the conversion below is uniform.
    frames = [data] if isinstance(data, pl.DataFrame) else data

    # Lazy conversion: each frame becomes an arrow table only when consumed.
    arrow_tables = (frame.to_arrow() for frame in frames)
    return upload_from_arrow_tables(arrow_tables, **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import polars as pl | ||
import pytest | ||
|
||
from airtrain.core import DatasetMetadata | ||
from airtrain.integrations.polars import upload_from_polars | ||
from tests.fixtures import MockAirtrainClient, mock_client # noqa: F401 | ||
|
||
|
||
def test_upload_from_polars(mock_client: MockAirtrainClient):  # noqa: F811
    """Uploading one DataFrame yields matching metadata and ingested rows."""
    rows = [
        {"foo": 42, "bar": "a"},
        {"foo": 43, "bar": "b"},
        {"foo": 44, "bar": "c"},
        {"foo": 45, "bar": "d"},
    ]
    frame = pl.DataFrame(rows)
    dataset_name = "Foo dataset"

    metadata = upload_from_polars(frame, name=dataset_name)
    expected_rows = frame.shape[0]

    # The returned metadata reflects the uploaded frame.
    assert isinstance(metadata, DatasetMetadata)
    assert metadata.size == expected_rows
    assert metadata.name == dataset_name

    # The mock client recorded the same dataset and its contents.
    stored = mock_client.get_fake_dataset(metadata.id)
    assert stored.name == dataset_name
    ingested = stored.ingested
    assert ingested is not None
    assert ingested.shape[0] == expected_rows
    assert ingested["foo"].to_pylist() == [42, 43, 44, 45]
    assert ingested["bar"].to_pylist() == ["a", "b", "c", "d"]
|
||
|
||
def test_upload_from_polars_multiple(mock_client: MockAirtrainClient):  # noqa: F811
    """Uploading several DataFrames concatenates their rows in order."""
    first = pl.DataFrame(
        [
            {"foo": 42, "bar": "a"},
            {"foo": 43, "bar": "b"},
            {"foo": 44, "bar": "c"},
            {"foo": 45, "bar": "d"},
        ]
    )
    second = pl.DataFrame(
        [
            {"foo": 46, "bar": "e"},
            {"foo": 47, "bar": "f"},
            {"foo": 48, "bar": "g"},
            {"foo": 49, "bar": "h"},
        ]
    )
    frames = (first, second)

    metadata = upload_from_polars(frames)

    assert isinstance(metadata, DatasetMetadata)
    assert metadata.size == sum(frame.shape[0] for frame in frames)

    ingested = mock_client.get_fake_dataset(metadata.id).ingested
    assert ingested is not None
    assert ingested.shape[0] == metadata.size
    assert ingested["foo"].to_pylist() == [42, 43, 44, 45, 46, 47, 48, 49]
    assert ingested["bar"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g", "h"]
|
||
|
||
def test_upload_from_polars_embeddings(mock_client: MockAirtrainClient):  # noqa: F811
    """Embedding columns upload when uniform and are rejected when ragged."""
    good_frame = pl.DataFrame(
        [
            {"foo": 42, "bar": [1.0, 0.0, 0.0, 0.0]},
            {"foo": 43, "bar": [0.0, 1.0, 0.0, 0.0]},
            {"foo": 44, "bar": [0.0, 0.0, 1.0, 0.0]},
            {"foo": 45, "bar": [0.0, 0.0, 0.0, 1.0]},
        ]
    )

    metadata = upload_from_polars(good_frame, embedding_column="bar")
    expected_rows = good_frame.shape[0]

    assert isinstance(metadata, DatasetMetadata)
    assert metadata.size == expected_rows

    ingested = mock_client.get_fake_dataset(metadata.id).ingested
    assert ingested is not None
    assert ingested.shape[0] == expected_rows
    assert ingested["foo"].to_pylist() == [42, 43, 44, 45]
    embeddings = ingested["bar"].to_pylist()
    assert embeddings[1] == [0.0, 1.0, 0.0, 0.0]

    # The third row carries only three dimensions instead of four.
    ragged_frame = pl.DataFrame(
        [
            {"foo": 42, "bar": [1.0, 0.0, 0.0, 0.0]},
            {"foo": 43, "bar": [0.0, 1.0, 0.0, 0.0]},
            {"foo": 44, "bar": [0.0, 0.0, 1.0]},
            {"foo": 45, "bar": [0.0, 0.0, 0.0, 1.0]},
        ]
    )
    with pytest.raises(ValueError):
        upload_from_polars(ragged_frame, embedding_column="bar")
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.