-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Some integrations are with libs that are widely adopted, and essentially come "for free." This PR adds those and associated docs/packaging changes.
- Loading branch information
Showing
10 changed files
with
534 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from typing import Any, Iterable, Union | ||
|
||
|
||
try: | ||
import pandas as pd | ||
|
||
ENABLED = True | ||
except ImportError: | ||
ENABLED = False | ||
import pyarrow as pa | ||
|
||
from airtrain.core import CreationArgs, DatasetMetadata, Unpack, upload_from_arrow_tables | ||
|
||
|
||
# In case pandas is not installed | ||
DataFrame = Any | ||
|
||
|
||
def upload_from_pandas(
    data: Union[Iterable[DataFrame], DataFrame],
    **kwargs: Unpack[CreationArgs],
) -> DatasetMetadata:
    """Create an Airtrain dataset out of one or more pandas DataFrames.

    Parameters
    ----------
    data:
        A single pandas DataFrame, or an iterable of them. Each frame is
        converted to a pyarrow Table as an intermediate representation.
    kwargs:
        Forwarded to `upload_from_arrow_tables`; see that function for the
        accepted creation arguments.

    Returns
    -------
    A DatasetMetadata object summarizing the created dataset.
    """
    # Bail out early when the pandas extra was not installed.
    if not ENABLED:
        raise ImportError(
            "Pandas integration not enabled. Please install Airtrain package as "
            "`airtrain-py[pandas]`"
        )
    frames = [data] if isinstance(data, pd.DataFrame) else data
    # Lazily convert each frame so large inputs are not all held at once.
    tables = (pa.Table.from_pandas(frame) for frame in frames)  # type: ignore
    return upload_from_arrow_tables(tables, **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from typing import Any, Iterable, Union | ||
|
||
|
||
try: | ||
import polars as pl | ||
|
||
ENABLED = True | ||
except ImportError: | ||
ENABLED = False | ||
|
||
from airtrain.core import CreationArgs, DatasetMetadata, Unpack, upload_from_arrow_tables | ||
|
||
|
||
# In case polars is not installed | ||
DataFrame = Any | ||
|
||
|
||
def upload_from_polars(
    data: Union[Iterable[DataFrame], DataFrame],
    **kwargs: Unpack[CreationArgs],
) -> DatasetMetadata:
    """Create an Airtrain dataset out of one or more polars DataFrames.

    Parameters
    ----------
    data:
        A single polars DataFrame, or an iterable of them. Each frame is
        converted to a pyarrow Table as an intermediate representation.
    kwargs:
        Forwarded to `upload_from_arrow_tables`; see that function for the
        accepted creation arguments.

    Returns
    -------
    A DatasetMetadata object summarizing the created dataset.
    """
    # Bail out early when the polars extra was not installed.
    if not ENABLED:
        raise ImportError(
            "Polars integration not enabled. Please install Airtrain package as "
            "`airtrain-py[polars]`"
        )
    frames = [data] if isinstance(data, pl.DataFrame) else data
    # Lazily convert each frame so large inputs are not all held at once.
    tables = (frame.to_arrow() for frame in frames)  # type: ignore
    return upload_from_arrow_tables(tables, **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
|
||
from airtrain.core import DatasetMetadata | ||
from airtrain.integrations.pandas import upload_from_pandas | ||
from tests.fixtures import MockAirtrainClient, mock_client # noqa: F401 | ||
|
||
|
||
def test_upload_from_pandas(mock_client: MockAirtrainClient):  # noqa: F811
    """A single DataFrame round-trips through upload with rows and name intact."""
    frame = pd.DataFrame(
        {
            "foo": [42, 43, 44, 45],
            "bar": ["a", "b", "c", "d"],
        }
    )
    name = "Foo dataset"
    result = upload_from_pandas(frame, name=name)
    assert isinstance(result, DatasetMetadata)
    assert result.size == frame.shape[0]
    assert result.name == name
    fake_dataset = mock_client.get_fake_dataset(result.id)
    assert fake_dataset.name == name
    ingested = fake_dataset.ingested
    assert ingested is not None
    assert ingested.shape[0] == frame.shape[0]
    assert ingested["foo"].to_pylist() == [42, 43, 44, 45]
    assert ingested["bar"].to_pylist() == ["a", "b", "c", "d"]
|
||
|
||
def test_upload_from_pandas_multiple(mock_client: MockAirtrainClient):  # noqa: F811
    """Several DataFrames are concatenated into one dataset in order."""
    first = pd.DataFrame(
        {
            "foo": [42, 43, 44, 45],
            "bar": ["a", "b", "c", "d"],
        }
    )
    second = pd.DataFrame(
        {
            "foo": [46, 47, 48, 49],
            "bar": ["e", "f", "g", "h"],
        }
    )
    result = upload_from_pandas((first, second))
    assert isinstance(result, DatasetMetadata)
    assert result.size == first.shape[0] + second.shape[0]
    fake_dataset = mock_client.get_fake_dataset(result.id)
    ingested = fake_dataset.ingested
    assert ingested is not None
    assert ingested.shape[0] == result.size
    assert ingested["foo"].to_pylist() == [42, 43, 44, 45, 46, 47, 48, 49]
    assert ingested["bar"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g", "h"]
|
||
|
||
def test_upload_from_pandas_embeddings(mock_client: MockAirtrainClient):  # noqa: F811
    """An embedding column of equal-length vectors uploads; ragged vectors raise."""
    # Rows of the 4x4 identity serve as four distinct unit embeddings.
    frame = pd.DataFrame(
        {
            "foo": [42, 43, 44, 45],
            "bar": [np.eye(4)[i] for i in range(4)],
        }
    )
    result = upload_from_pandas(frame, embedding_column="bar")
    assert isinstance(result, DatasetMetadata)
    assert result.size == frame.shape[0]
    fake_dataset = mock_client.get_fake_dataset(result.id)
    ingested = fake_dataset.ingested
    assert ingested is not None
    assert ingested.shape[0] == frame.shape[0]
    assert ingested["foo"].to_pylist() == [42, 43, 44, 45]
    assert ingested["bar"].to_pylist()[1] == [0.0, 1.0, 0.0, 0.0]

    # Third row has only three dimensions while the rest have four.
    ragged = pd.DataFrame(
        {
            "foo": [42, 43, 44, 45],
            "bar": [
                np.array([1.0, 0.0, 0.0, 0.0]),
                np.array([0.0, 1.0, 0.0, 0.0]),
                np.array([0.0, 0.0, 1.0]),
                np.array([0.0, 0.0, 0.0, 1.0]),
            ],
        }
    )
    with pytest.raises(ValueError):
        # one row has a different number of embedding dimensions.
        upload_from_pandas(ragged, embedding_column="bar")
Oops, something went wrong.