diff --git a/Makefile b/Makefile index b92db7f..f55729a 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ py-prep: .PHONY: sync sync: - uv sync + uv sync --extra all .PHONY: fix fix: diff --git a/README.md b/README.md index ad490a7..bfab223 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,26 @@ This repository holds the SDK for interacting with the tool for improving your AI apps, RAG pipelines, and models by curating high-quality training and eval datasets. +## Installation + +To install the core package without any integrations, simply run: + +`pip install airtrain-py` + +You may install integrations by using pip extras. As an example, to +install the pandas integration: + +`pip install airtrain-py[pandas]` + +If you want to install all integrations, you may do the following: + +`pip install airtrain-py[all]` + +The following are available extras: + +- `pandas` +- `polars` + ## Usage Obtain your API key by going to your user settings on @@ -54,9 +74,85 @@ url = at.upload_from_dicts( [ {"foo": "some text", "bar": "more text"}, {"foo": "even more text", "bar": "so much text"}, - ] + ], + name="My Dataset name", # name is Optional ).url -# You may view your dataset at this URL +# You may view your dataset in the Airtrain dashboard at this URL +# It may take some time to complete ingestion and generation of +# automated insights. You will receive an email when it is complete. print(f"Dataset URL: {url}") ``` + +The data may be any iterable of dictionaries that can be represented using +automatically inferred [Apache Arrow](https://arrow.apache.org/docs/python/index.html) +types. If you would like to give a hint as to the Arrow schema of the data being +uploaded, you may provide one using the `schema` parameter to `upload_from_dicts`. + +### Custom Embeddings + +Airtrain produces a variety of insights into your data automatically. Some of +these insights (ex: automatic clustering) rely on embeddings of the data. 
Airtrain +will also embed your data automatically, but if you wish to provide your own embeddings +you may do so by adding the `embedding_column` parameter when you upload: + +```python +url = at.upload_from_dicts( + [ + {"foo": "some text", "bar": [0.0, 0.707, 0.707, 0.0]}, + {"foo": "even more text", "bar": [0.577, 0.577, 0.0, 0.577]}, + ], + embedding_column="bar", +).url +``` + +If you provide this argument, the embeddings must all be lists of floating point +numbers with the same length. + +### Integrations + +Airtrain provides integrations to allow for uploading data from a variety of +sources. In general most integrations take the form of an `upload_from_x(...)` +function with a signature matching that of `upload_from_dicts` except for +the first parameter specifying the data to be uploaded. Integrations may require +installing the Airtrain SDK [with extras](#installation). + +#### Pandas + +```python +import pandas as pd + +# ... + +df = pd.DataFrame( + { + "foo": ["some text", "more text", "even more"], + "bar": [1, 2, 3], + } +) + + +url = at.upload_from_pandas(df, name="My Pandas Dataset").url +``` + +You may also provide an iterable of dataframes instead of a single one. + +#### Polars + +```python +import polars as pl + +# ... + +df = pl.DataFrame( + { + "foo": ["some text", "more text", "even more"], + "bar": [1, 2, 3], + } +) + + +url = at.upload_from_polars(df, name="My Polars Dataset").url +``` + +You may also provide an iterable of dataframes instead of a single one. 
diff --git a/pyproject.toml b/pyproject.toml index eef8d3f..a24ece9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "airtrain" +name = "airtrain-py" description = "SDK for interacting with https://airtrain.ai" version = "0.0.1" requires-python = ">=3.8" @@ -37,6 +37,10 @@ classifiers = [ pandas = [ "pandas>=1.0.0", ] +polars = [ + "polars>=0.19.0", +] +all = ["airtrain-py[pandas,polars]"] [tool.uv] dev-dependencies = [ @@ -90,6 +94,10 @@ ignore_missing_imports = true module = "pandas.*" ignore_missing_imports = true +[[tool.mypy.overrides]] +module = "polars.*" +ignore_missing_imports = true + [[tool.mypy.overrides]] module = "tests.*" ignore_missing_imports = true diff --git a/src/airtrain/__init__.py b/src/airtrain/__init__.py index 8498f8f..6b568bd 100644 --- a/src/airtrain/__init__.py +++ b/src/airtrain/__init__.py @@ -5,3 +5,4 @@ upload_from_dicts, ) from airtrain.integrations.pandas import upload_from_pandas # noqa: F401 +from airtrain.integrations.polars import upload_from_polars # noqa: F401 diff --git a/src/airtrain/integrations/pandas.py b/src/airtrain/integrations/pandas.py index 5f2007c..70d30c2 100644 --- a/src/airtrain/integrations/pandas.py +++ b/src/airtrain/integrations/pandas.py @@ -37,7 +37,7 @@ def upload_from_pandas( if not ENABLED: raise ImportError( "Pandas integration not enabled. 
Please install Airtrain package as " - "`airtrain[pandas]`" + "`airtrain-py[pandas]`" ) if isinstance(data, pd.DataFrame): data = [data] diff --git a/src/airtrain/integrations/polars.py b/src/airtrain/integrations/polars.py new file mode 100644 index 0000000..f152e1a --- /dev/null +++ b/src/airtrain/integrations/polars.py @@ -0,0 +1,46 @@ +from typing import Any, Iterable, Union + + +try: + import polars as pl + + ENABLED = True +except ImportError: + ENABLED = False + +from airtrain.core import CreationArgs, DatasetMetadata, Unpack, upload_from_arrow_tables + + +# In case polars is not installed +DataFrame = Any + + +def upload_from_polars( + data: Union[Iterable[DataFrame], DataFrame], + **kwargs: Unpack[CreationArgs], +) -> DatasetMetadata: + """Upload an Airtrain dataset from the provided polars DataFrame(s). + + Parameters + ---------- + data: + Either an individual polars DataFrame or an iterable of DataFrames. + Data will be intermediately represented as pyarrow tables. + kwargs: + See `upload_from_arrow_tables` for other arguments. + + Returns + ------- + A DatasetMetadata object summarizing the created dataset. + """ + if not ENABLED: + raise ImportError( + "Polars integration not enabled. 
Please install Airtrain package as " + "`airtrain-py[polars]`" + ) + if isinstance(data, pl.DataFrame): + data = [data] + + data = (df.to_arrow() for df in data) # type: ignore + + return upload_from_arrow_tables(data, **kwargs) diff --git a/src/airtrain/integrations/tests/test_polars.py b/src/airtrain/integrations/tests/test_polars.py new file mode 100644 index 0000000..1df378c --- /dev/null +++ b/src/airtrain/integrations/tests/test_polars.py @@ -0,0 +1,89 @@ +import polars as pl +import pytest + +from airtrain.core import DatasetMetadata +from airtrain.integrations.polars import upload_from_polars +from tests.fixtures import MockAirtrainClient, mock_client # noqa: F401 + + +def test_upload_from_polars(mock_client: MockAirtrainClient): # noqa: F811 + df = pl.DataFrame( + [ + {"foo": 42, "bar": "a"}, + {"foo": 43, "bar": "b"}, + {"foo": 44, "bar": "c"}, + {"foo": 45, "bar": "d"}, + ] + ) + name = "Foo dataset" + result = upload_from_polars(df, name=name) + assert isinstance(result, DatasetMetadata) + assert result.size == df.shape[0] + assert result.name == name + fake_dataset = mock_client.get_fake_dataset(result.id) + assert fake_dataset.name == name + table = fake_dataset.ingested + assert table is not None + assert table.shape[0] == df.shape[0] + assert table["foo"].to_pylist() == [42, 43, 44, 45] + assert table["bar"].to_pylist() == ["a", "b", "c", "d"] + + +def test_upload_from_polars_multiple(mock_client: MockAirtrainClient): # noqa: F811 + df_1 = pl.DataFrame( + [ + {"foo": 42, "bar": "a"}, + {"foo": 43, "bar": "b"}, + {"foo": 44, "bar": "c"}, + {"foo": 45, "bar": "d"}, + ] + ) + df_2 = pl.DataFrame( + [ + {"foo": 46, "bar": "e"}, + {"foo": 47, "bar": "f"}, + {"foo": 48, "bar": "g"}, + {"foo": 49, "bar": "h"}, + ] + ) + result = upload_from_polars((df_1, df_2)) + assert isinstance(result, DatasetMetadata) + assert result.size == df_1.shape[0] + df_2.shape[0] + fake_dataset = mock_client.get_fake_dataset(result.id) + table = fake_dataset.ingested + assert 
table is not None + assert table.shape[0] == result.size + assert table["foo"].to_pylist() == [42, 43, 44, 45, 46, 47, 48, 49] + assert table["bar"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g", "h"] + + +def test_upload_from_polars_embeddings(mock_client: MockAirtrainClient): # noqa: F811 + df = pl.DataFrame( + [ + {"foo": 42, "bar": [1.0, 0.0, 0.0, 0.0]}, + {"foo": 43, "bar": [0.0, 1.0, 0.0, 0.0]}, + {"foo": 44, "bar": [0.0, 0.0, 1.0, 0.0]}, + {"foo": 45, "bar": [0.0, 0.0, 0.0, 1.0]}, + ] + ) + result = upload_from_polars(df, embedding_column="bar") + assert isinstance(result, DatasetMetadata) + assert result.size == df.shape[0] + fake_dataset = mock_client.get_fake_dataset(result.id) + table = fake_dataset.ingested + assert table is not None + assert table.shape[0] == df.shape[0] + assert table["foo"].to_pylist() == [42, 43, 44, 45] + assert table["bar"].to_pylist()[1] == [0.0, 1.0, 0.0, 0.0] + + df_bad = pl.DataFrame( + [ + {"foo": 42, "bar": [1.0, 0.0, 0.0, 0.0]}, + {"foo": 43, "bar": [0.0, 1.0, 0.0, 0.0]}, + {"foo": 44, "bar": [0.0, 0.0, 1.0]}, + {"foo": 45, "bar": [0.0, 0.0, 0.0, 1.0]}, + ] + ) + with pytest.raises(ValueError): + # one row has a different number of embedding dimensions. + upload_from_polars(df_bad, embedding_column="bar") diff --git a/uv.lock b/uv.lock index af47227..ef6c9b5 100644 --- a/uv.lock +++ b/uv.lock @@ -2,14 +2,15 @@ version = 1 requires-python = ">=3.8" resolution-markers = [ "python_full_version < '3.9'", - "python_full_version < '3.10'", + "python_full_version < '3.9'", + "python_full_version == '3.9.*'", "python_full_version == '3.10.*'", "python_full_version == '3.11.*'", "python_full_version >= '3.12'", ] [[package]] -name = "airtrain" +name = "airtrain-py" version = "0.0.1" source = { editable = "." 
} dependencies = [ @@ -20,9 +21,16 @@ dependencies = [ ] [package.optional-dependencies] +all = [ + { name = "pandas" }, + { name = "polars" }, +] pandas = [ { name = "pandas" }, ] +polars = [ + { name = "polars" }, +] [package.dev-dependencies] dev = [ @@ -35,11 +43,13 @@ dev = [ [package.metadata] requires-dist = [ + { name = "airtrain-py", extras = ["pandas", "polars"], marker = "extra == 'all'" }, { name = "httpx", specifier = ">=0.25.0" }, { name = "numpy", marker = "python_full_version == '3.8.*'", specifier = "<=1.24.4" }, { name = "numpy", marker = "python_full_version >= '3.9'", specifier = ">=1.19.3" }, { name = "numpy", marker = "python_full_version >= '3.12'", specifier = ">=1.26.0" }, { name = "pandas", marker = "extra == 'pandas'", specifier = ">=1.0.0" }, + { name = "polars", marker = "extra == 'polars'", specifier = ">=0.19.0" }, { name = "pyarrow", specifier = ">=13.0.0" }, ] @@ -273,7 +283,8 @@ version = "1.24.4" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version < '3.9'", - "python_full_version < '3.10'", + "python_full_version < '3.9'", + "python_full_version == '3.9.*'", "python_full_version == '3.10.*'", "python_full_version == '3.11.*'", ] @@ -436,6 +447,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] +[[package]] +name = "polars" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/a9/cf169ce361224d4b397f52d6fcceb191452ecdc50813ce2aa6c60ff46e04/polars-1.6.0.tar.gz", hash = "sha256:d7e8d5e577883a9755bc3be92ecbf6f20bced68267bdb8bdb440120e905cc19c", size = 3929590 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/51/a6/00e9c0cc08d8b279ee576dca105fb5b6c3f812f56ce6bbefdf127773641b/polars-1.6.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6d1665c23e3574ebd47a26a5d7b619e6e73e53718c3b0bfd7d08b6a0a4ae7daa", size = 30510442 }, + { url = "https://files.pythonhosted.org/packages/95/0d/7665314925d774236404919678c197abe4818d1820387017a23f21e27815/polars-1.6.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d7f3abf085adf034720b358119c4c8e144bcc2d96010b7e7d0afa11b80da383c", size = 26758515 }, + { url = "https://files.pythonhosted.org/packages/04/1c/1a0a0a2c076bec8501ada9496afe5486c9e994558b0c80057f7e3ee6ec16/polars-1.6.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a166adb429f8ee099c9d803e7470a80c76368437a8b272c67cef9eef6d5e9da1", size = 31869680 }, + { url = "https://files.pythonhosted.org/packages/c1/95/224139dbd93ce450f194233f643f08e759f369c10c5bd62a13d615dd886c/polars-1.6.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:1c811b772c9476f7f0bb4445a8387d2ab6d86f5e79140b1bfba914a32788d261", size = 28441792 }, + { url = "https://files.pythonhosted.org/packages/fa/cb/8f97ea9bbe41f862cc685b1f223ee8508c60f6510918de75637b3539e62d/polars-1.6.0-cp38-abi3-win_amd64.whl", hash = "sha256:ffae15ffa80fda5cc3af44a340b565bcf7f2ab6d7854d3f967baf505710c78e2", size = 31424668 }, +] + [[package]] name = "pyarrow" version = "17.0.0"