diff --git a/docs/requirements.txt b/docs/requirements.txt
index 8eb74496..d384f0f5 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -19,4 +19,5 @@ sphinx==5.3.0
 pydata-sphinx-theme==0.8.0
 myst-parser
 maturin
-jinja2
\ No newline at end of file
+jinja2
+ipython
\ No newline at end of file
diff --git a/docs/source/api.rst b/docs/source/api.rst
index a5d65433..d9f4a09d 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -24,7 +24,6 @@ API Reference
 .. toctree::
    :maxdepth: 2
 
-   api/config
    api/dataframe
    api/execution_context
    api/expression
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 929c2493..0822e0ab 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -52,6 +52,7 @@
     "sphinx.ext.viewcode",
     "sphinx.ext.napoleon",
     "myst_parser",
+    "IPython.sphinxext.ipython_directive",
 ]
 
 source_suffix = {
diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst
new file mode 100644
index 00000000..dd61ad8f
--- /dev/null
+++ b/docs/source/contributor-guide/introduction.rst
@@ -0,0 +1,85 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Introduction
============
We welcome and encourage contributions of all kinds, such as:

1. Tickets with issue reports or feature requests
2. Documentation improvements
3. Code, both PRs and (especially) PR reviews

In addition to submitting new PRs, we have a healthy tradition of community members reviewing each other’s PRs.
Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases.

How to develop
--------------

This assumes that you have Rust and Cargo installed. We use the workflow recommended by `pyo3 <https://github.com/PyO3/pyo3>`_ and `maturin <https://github.com/PyO3/maturin>`_.

Bootstrap:

.. code-block:: shell

    # fetch this repo
    git clone git@github.com:apache/arrow-datafusion-python.git
    # prepare development environment (used to build wheel / install in development)
    python3 -m venv venv
    # activate the venv
    source venv/bin/activate
    # update pip itself if necessary
    python -m pip install -U pip
    # install dependencies (the pinned requirements file targets Python 3.10)
    python -m pip install -r requirements-310.txt

The tests rely on test data in git submodules.

.. code-block:: shell

    git submodule init
    git submodule update


Whenever Rust code changes (your changes or via `git pull`):

.. code-block:: shell

    # make sure you activate the venv using "source venv/bin/activate" first
    maturin develop
    python -m pytest


Update Dependencies
-------------------

To change test dependencies, change `requirements.in` and run:

.. 
code-block:: shell + + # install pip-tools (this can be done only once), also consider running in venv + python -m pip install pip-tools + python -m piptools compile --generate-hashes -o requirements-310.txt + + +To update dependencies, run with `-U` + +.. code-block:: shell + + python -m piptools compile -U --generate-hashes -o requirements-310.txt + + +More details about pip-tools `here `_ diff --git a/docs/source/index.rst b/docs/source/index.rst index 78f44ea1..db0b5c56 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,12 +31,17 @@ Its query engine, DataFusion, is written in `Rust `_, Technically, zero-copy is achieved via the `c data interface `_. -How to use it -============= +Install +------- -Simple usage: +.. code-block:: shell + + pip install datafusion -.. code-block:: python +Example +------- + +.. ipython:: python import datafusion from datafusion import col @@ -58,234 +63,44 @@ Simple usage: col("a") - col("b"), ) - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pyarrow.array([5, 7, 9]) - assert result.column(1) == pyarrow.array([-3, -3, -3]) - - -We can also execute a query against data stored in CSV - -.. code-block:: bash - - echo "a,b\n1,4\n2,5\n3,6" > example.csv - - -.. code-block:: python - - import datafusion - from datafusion import col - import pyarrow - - # create a context - ctx = datafusion.SessionContext() - - # register a CSV - ctx.register_csv('example', 'example.csv') - - # create a new statement - df = ctx.table('example').select( - col("a") + col("b"), - col("a") - col("b"), - ) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pyarrow.array([5, 7, 9]) - assert result.column(1) == pyarrow.array([-3, -3, -3]) - - -And how to execute a query against a CSV using SQL: - - -.. code-block:: python - - import datafusion - from datafusion import col - import pyarrow - - # create a context - ctx = datafusion.SessionContext() - - # register a CSV - ctx.register_csv('example', 'example.csv') - - # create a new statement via SQL - df = ctx.sql("SELECT a+b, a-b FROM example") - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pyarrow.array([5, 7, 9]) - assert result.column(1) == pyarrow.array([-3, -3, -3]) - - - -UDFs ----- - -.. code-block:: python - - import pyarrow - from datafusion import udf - - def is_null(array: pyarrow.Array) -> pyarrow.Array: - return array.is_null() - - is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), 'stable') - - # create a context - ctx = datafusion.SessionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]]) - - df = df.select(is_null_arr(col("a"))) - - result = df.collect()[0] - - assert result.column(0) == pyarrow.array([False] * 3) - + df -UDAF ----- - -.. code-block:: python - - import pyarrow - import pyarrow.compute - import datafusion - from datafusion import udaf, Accumulator - from datafusion import col - - - class MyAccumulator(Accumulator): - """ - Interface of a user-defined accumulation. - """ - def __init__(self): - self._sum = pyarrow.scalar(0.0) - - def update(self, values: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) - - def merge(self, states: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) - - def state(self) -> pyarrow.Array: - return pyarrow.array([self._sum.as_py()]) - - def evaluate(self) -> pyarrow.Scalar: - return self._sum - - # create a context - ctx = datafusion.SessionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]]) - - my_udaf = udaf(MyAccumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()], 'stable') - - df = df.aggregate( - [], - [my_udaf(col("a"))] - ) - - result = df.collect()[0] - - assert result.column(0) == pyarrow.array([6.0]) - -How to install (from pip) -========================= - -.. code-block:: shell - - pip install datafusion - -You can verify the installation by running: - -.. code-block:: python - - >>> import datafusion - >>> datafusion.__version__ - '0.6.0' - - -How to develop -============== - -This assumes that you have rust and cargo installed. We use the workflow recommended by `pyo3 `_ and `maturin `_. - -Bootstrap: - -.. code-block:: shell - - # fetch this repo - git clone git@github.com:apache/arrow-datafusion-python.git - # prepare development environment (used to build wheel / install in development) - python3 -m venv venv - # activate the venv - source venv/bin/activate - # update pip itself if necessary - python -m pip install -U pip - # install dependencies (for Python 3.8+) - python -m pip install -r requirements-310.txt - -The tests rely on test data in git submodules. - -.. code-block:: shell - - git submodule init - git submodule update - - -Whenever rust code changes (your changes or via `git pull`): - -.. code-block:: shell - - # make sure you activate the venv using "source venv/bin/activate" first - maturin develop - python -m pytest - - -How to update dependencies -========================== - -To change test dependencies, change the `requirements.in` and run - -.. code-block:: shell - - # install pip-tools (this can be done only once), also consider running in venv - python -m pip install pip-tools - python -m piptools compile --generate-hashes -o requirements-310.txt +.. _toc.links: +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: LINKS -To update dependencies, run with `-U` + Github and Issue Tracker + Rust's API Docs + Code of conduct -.. code-block:: shell - - python -m piptools compile -U --generate-hashes -o requirements-310.txt +.. _toc.guide: +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: USER GUIDE + user-guide/introduction + user-guide/basics + user-guide/common-operations/index + user-guide/io/index + user-guide/sql -More details about pip-tools `here `_ +.. _toc.contributor_guide: +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: CONTRIBUTOR GUIDE -API reference -============= + contributor-guide/introduction +.. _toc.api: .. toctree:: - :maxdepth: 2 + :hidden: + :maxdepth: 1 + :caption: API api diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst new file mode 100644 index 00000000..0409c2e3 --- /dev/null +++ b/docs/source/user-guide/basics.rst @@ -0,0 +1,89 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. 
or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Concepts
========

In this section, we will cover a basic example to introduce a few key concepts.

.. code-block:: python

    import datafusion
    from datafusion import col
    import pyarrow

    # create a context
    ctx = datafusion.SessionContext()

    # create a RecordBatch and a new DataFrame from it
    batch = pyarrow.RecordBatch.from_arrays(
        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
        names=["a", "b"],
    )
    df = ctx.create_dataframe([[batch]])

    # create a new statement
    df = df.select(
        col("a") + col("b"),
        col("a") - col("b"),
    )

    # execute and collect the first (and only) batch
    result = df.collect()[0]

The first statement group:

.. code-block:: python

    # create a context
    ctx = datafusion.SessionContext()

creates a :code:`SessionContext`, that is, the main interface for executing queries with DataFusion. It maintains the state
of the connection between a user and an instance of the DataFusion engine. Additionally, it provides the following functionality:

- Create a DataFrame from a CSV or Parquet data source.
- Register a CSV or Parquet data source as a table that can be referenced from a SQL query.
- Register a custom data source that can be referenced from a SQL query.
- Execute a SQL query.

The second statement group creates a :code:`DataFrame`,

.. code-block:: python

    # create a RecordBatch and a new DataFrame from it
    batch = pyarrow.RecordBatch.from_arrays(
        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
        names=["a", "b"],
    )
    df = ctx.create_dataframe([[batch]])

A DataFrame refers to a (logical) set of rows that share the same column names, similar to a `Pandas DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_.
DataFrames are typically created by calling a method on :code:`SessionContext`, such as :code:`read_csv`, and can then be modified by
calling transformation methods, such as :meth:`.DataFrame.filter`, :meth:`.DataFrame.select`, :meth:`.DataFrame.aggregate`,
and :meth:`.DataFrame.limit`, to build up a query definition.

The third statement uses :code:`Expressions` to build up a query definition.

.. code-block:: python

    df = df.select(
        col("a") + col("b"),
        col("a") - col("b"),
    )

Finally, the :code:`collect` method converts the logical plan represented by the DataFrame into a physical plan and executes it,
collecting all results into a list of `RecordBatch <https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html>`_.
\ No newline at end of file
diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst
new file mode 100644
index 00000000..8315677c
--- /dev/null
+++ b/docs/source/user-guide/common-operations/aggregations.rst
@@ -0,0 +1,59 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. 
distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Aggregation
===========

An aggregate or aggregation is a function where the values of multiple rows are processed together to form a single summary value.
For performing an aggregation, DataFusion provides the :meth:`.DataFrame.aggregate` method.

.. ipython:: python

    from datafusion import SessionContext
    from datafusion import column, lit
    from datafusion import functions as f
    import random

    ctx = SessionContext()
    df = ctx.from_pydict(
        {
            "a": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "b": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "c": [random.randint(0, 100) for _ in range(8)],
            "d": [random.random() for _ in range(8)],
        }
    )

    col_a = column("a")
    col_b = column("b")
    col_c = column("c")
    col_d = column("d")

    df.aggregate([], [f.approx_distinct(col_c), f.approx_median(col_d), f.approx_percentile_cont(col_d, lit(0.5))])

When the :code:`group_by` list is empty, the aggregation is done over the whole :class:`.DataFrame`. For grouping,
the :code:`group_by` list must contain at least one column.

.. ipython:: python

    df.aggregate([col_a], [f.sum(col_c), f.max(col_d), f.min(col_d)])

More than one column can be used for grouping.

.. ipython:: python

    df.aggregate([col_a, col_b], [f.sum(col_c), f.max(col_d), f.min(col_d)])
diff --git a/docs/source/user-guide/common-operations/basic-info.rst b/docs/source/user-guide/common-operations/basic-info.rst
new file mode 100644
index 00000000..424e1cc9
--- /dev/null
+++ b/docs/source/user-guide/common-operations/basic-info.rst
@@ -0,0 +1,61 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Basic Operations
================

In this section, you will learn how to display essential details of DataFrames using specific functions.

.. 
ipython:: python

    from datafusion import SessionContext
    import random

    ctx = SessionContext()
    df = ctx.from_pydict({
        "nrs": [1, 2, 3, 4, 5],
        "names": ["python", "ruby", "java", "haskell", "go"],
        "random": random.sample(range(1000), 5),
        "groups": ["A", "A", "B", "C", "B"],
    })
    df

Use :meth:`.DataFrame.limit` to view the top rows of the frame:

.. ipython:: python

    df.limit(2)

Display the columns of the DataFrame using :meth:`.DataFrame.schema`:

.. ipython:: python

    df.schema()

The method :meth:`.DataFrame.to_pandas` uses pyarrow to convert to a pandas DataFrame by collecting the batches,
combining them into an Arrow table, and then converting that to a pandas DataFrame.

.. ipython:: python

    df.to_pandas()

:meth:`.DataFrame.describe` shows a quick statistic summary of your data:

.. ipython:: python

    df.describe()

diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst
new file mode 100644
index 00000000..7e5c592d
--- /dev/null
+++ b/docs/source/user-guide/common-operations/functions.rst
@@ -0,0 +1,117 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Functions
=========

DataFusion provides a large number of built-in functions for performing complex queries without requiring user-defined functions.
Here we will cover some of the more popular use cases. If you want to view all the functions, go to the :ref:`Functions` API Reference.

We'll use the pokemon dataset in the following examples.

.. ipython:: python

    import urllib.request
    from datafusion import SessionContext

    urllib.request.urlretrieve(
        "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv",
        "pokemon.csv",
    )

    ctx = SessionContext()
    ctx.register_csv("pokemon", "pokemon.csv")
    df = ctx.table("pokemon")

Mathematical
------------

DataFusion offers mathematical functions such as :func:`.pow` or :func:`.log`.

.. ipython:: python

    from datafusion import col, literal
    from datafusion import functions as f

    df.select(
        f.pow(col('"Attack"'), literal(2)) - f.pow(col('"Defense"'), literal(2))
    ).limit(10)


Conditional
-----------

There are 3 conditional functions in DataFusion: :func:`.coalesce`, :func:`.nullif` and :func:`.case` (not available in Python).

.. ipython:: python

    df.select(
        f.coalesce(col('"Type 1"'), col('"Type 2"')).alias("dominant_type")
    ).limit(10)

Temporal
--------

For selecting the current time, use :func:`.now`.

.. ipython:: python

    df.select(f.now())

Convert to timestamps using :func:`.to_timestamp`.

.. 
ipython:: python

    df.select(f.to_timestamp(col('"Total"')).alias("timestamp"))

String
------

In the field of data science, working with textual data is a common task. To make string manipulation easier,
DataFusion offers a range of helpful options.

.. ipython:: python

    df.select(
        f.char_length(col('"Name"')).alias("len"),
        f.lower(col('"Name"')).alias("lower"),
        f.left(col('"Name"'), literal(4)).alias("code")
    )

This also includes the regular expression functions :func:`.regexp_replace` and :func:`.regexp_match`.

.. ipython:: python

    df.select(
        f.regexp_match(col('"Name"'), literal("Char")).alias("dragons"),
        f.regexp_replace(col('"Name"'), literal("saur"), literal("fleur")).alias("flowers")
    )


Other
-----

The function :func:`.in_list` allows checking a column for the presence of multiple values:

.. ipython:: python

    types = [literal("Grass"), literal("Fire"), literal("Water")]
    (
        df.select(f.in_list(col('"Type 1"'), types, negated=False).alias("basic_types"))
        .limit(20)
        .to_pandas()
    )
diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst
new file mode 100644
index 00000000..950afb93
--- /dev/null
+++ b/docs/source/user-guide/common-operations/index.rst
@@ -0,0 +1,30 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Common Operations
=================

.. toctree::
   :maxdepth: 2

   basic-info
   select-and-filter
   joins
   functions
   aggregations
   windows
   udf-and-udfa
diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst
new file mode 100644
index 00000000..12820311
--- /dev/null
+++ b/docs/source/user-guide/common-operations/joins.rst
@@ -0,0 +1,104 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Joins
=====

DataFusion supports the following join variants via the method :meth:`.DataFrame.join`:

- Inner Join
- Left Join
- Right Join
- Full Join
- Left Semi Join
- Left Anti Join

For the examples in this section, we'll use the following two DataFrames:

.. ipython:: python

    from datafusion import SessionContext

    ctx = SessionContext()

    left = ctx.from_pydict(
        {
            "customer_id": [1, 2, 3],
            "customer": ["Alice", "Bob", "Charlie"],
        }
    )

    right = ctx.from_pylist([
        {"id": 1, "name": "CityCabs"},
        {"id": 2, "name": "MetroRide"},
        {"id": 5, "name": "UrbanGo"},
    ])

Inner Join
----------

When using an inner join, only rows whose values in the two join columns are present in both DataFrames
will be included in the resulting DataFrame.

.. ipython:: python

    left.join(right, join_keys=(["customer_id"], ["id"]), how="inner")

The parameter :code:`join_keys` specifies the columns from the left DataFrame and right DataFrame that contain the values
that should match.

Left Join
---------

A left join combines rows from two DataFrames using the key columns. It returns all rows from the left DataFrame and
matching rows from the right DataFrame. If there's no match in the right DataFrame, it returns null
values for the corresponding columns.

.. ipython:: python

    left.join(right, join_keys=(["customer_id"], ["id"]), how="left")

Full Join
---------

A full join merges rows from two tables based on a related column, returning all rows from both tables, even if there
is no match. Unmatched rows will have null values.

.. ipython:: python

    left.join(right, join_keys=(["customer_id"], ["id"]), how="full")

Left Semi Join
--------------

A left semi join returns the rows from the left table that have at least one match in the right table, without
duplicating a left row when it has multiple matches.

.. ipython:: python

    left.join(right, join_keys=(["customer_id"], ["id"]), how="semi")

Left Anti Join
--------------

A left anti join shows all rows from the left table without any matching rows in the right table,
based on the specified matching columns. It excludes rows from the left table that have at least one matching row in
the right table.

.. ipython:: python

    left.join(right, join_keys=(["customer_id"], ["id"]), how="anti")
\ No newline at end of file
diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst
new file mode 100644
index 00000000..8ede230e
--- /dev/null
+++ b/docs/source/user-guide/common-operations/select-and-filter.rst
@@ -0,0 +1,67 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Column Selections
=================

Use :meth:`.DataFrame.select_columns` for basic column selection.

DataFusion can work with several file types; to start simple, we can use a subset of the
`TLC Trip Record Data <https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page>`_.

.. ipython:: python

    import urllib.request
    from datafusion import SessionContext

    urllib.request.urlretrieve("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
                               "yellow_trip_data.parquet")

    ctx = SessionContext()
    df = ctx.read_parquet("yellow_trip_data.parquet")
    df.select_columns("trip_distance", "passenger_count")

For mathematical or logical operations, use :func:`.col` to select columns, and give meaningful names to the resulting
operations using :func:`.alias`.


.. ipython:: python

    from datafusion import col, lit
    df.select((col("tip_amount") + col("tolls_amount")).alias("tips_plus_tolls"))

.. warning::

    Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters
    (e.g. Name) you must put your column name in double quotes or the selection won’t work. As an alternative, for simple
    column selection use :meth:`.DataFrame.select_columns` without double quotes.

For selecting columns with capital letters, use ``'"VendorID"'``.

.. ipython:: python

    df.select(col('"VendorID"'))


To combine columns with literal values, use :func:`.lit`.

.. ipython:: python

    large_trip_distance = col("trip_distance") > lit(5.0)
    low_passenger_count = col("passenger_count") < lit(4)
    df.select((large_trip_distance & low_passenger_count).alias("lonely_trips"))

diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst
new file mode 100644
index 00000000..23ccf225
--- /dev/null
+++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst
@@ -0,0 +1,85 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

User Defined Functions
======================

DataFusion provides powerful expressions and functions, reducing the need for custom Python functions.
However, you can still incorporate your own functions, i.e. User-Defined Functions (UDFs), with the :func:`.udf` function.

.. 
ipython:: python

    import pyarrow
    import datafusion
    from datafusion import udf, col

    def is_null(array: pyarrow.Array) -> pyarrow.Array:
        return array.is_null()

    is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), 'stable')

    ctx = datafusion.SessionContext()

    batch = pyarrow.RecordBatch.from_arrays(
        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
        names=["a", "b"],
    )
    df = ctx.create_dataframe([[batch]])

    df.select(is_null_arr(col("a"))).to_pandas()

Additionally, the :func:`.udaf` function allows you to define User-Defined Aggregate Functions (UDAFs).

.. code-block:: python

    import pyarrow
    import pyarrow.compute
    import datafusion
    from datafusion import col, udaf, Accumulator

    class MyAccumulator(Accumulator):
        """
        Interface of a user-defined accumulation.
        """
        def __init__(self):
            self._sum = pyarrow.scalar(0.0)

        def update(self, values: pyarrow.Array) -> None:
            # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
            self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py())

        def merge(self, states: pyarrow.Array) -> None:
            # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
            self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py())

        def state(self) -> pyarrow.Array:
            return pyarrow.array([self._sum.as_py()])

        def evaluate(self) -> pyarrow.Scalar:
            return self._sum

    ctx = datafusion.SessionContext()
    df = ctx.from_pydict(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6],
        }
    )

    my_udaf = udaf(MyAccumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()], 'stable')

    df.aggregate([], [my_udaf(col("a"))])
diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst
new file mode 100644
index 00000000..f884c7e0
--- /dev/null
+++ b/docs/source/user-guide/common-operations/windows.rst
@@ -0,0 +1,93 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Window Functions
================

In this section you will learn about window functions. A window function utilizes values from one or multiple rows to
produce a result for each individual row, unlike an aggregate function that provides a single value for multiple rows.

The functionality of window functions in DataFusion is supported by the dedicated :func:`.window` function.

We'll use the pokemon dataset (from Ritchie Vink) in the following examples.

.. 
ipython:: python

    import urllib.request
    from datafusion import SessionContext
    from datafusion import col
    from datafusion import functions as f

    urllib.request.urlretrieve(
        "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv",
        "pokemon.csv",
    )

    ctx = SessionContext()
    df = ctx.read_csv("pokemon.csv")

Here is an example that shows how to compare each pokemon’s attack power with the average attack power in its :code:`"Type 1"`.

.. ipython:: python

    df.select(
        col('"Name"'),
        col('"Attack"'),
        f.alias(
            f.window("avg", [col('"Attack"')], partition_by=[col('"Type 1"')]),
            "Average Attack",
        )
    )

You can also control the order in which rows are processed by window functions by providing
a list of :func:`.order_by` functions for the :code:`order_by` parameter.

.. ipython:: python

    df.select(
        col('"Name"'),
        col('"Attack"'),
        f.alias(
            f.window(
                "rank",
                [],
                partition_by=[col('"Type 1"')],
                order_by=[f.order_by(col('"Attack"'))],
            ),
            "rank",
        ),
    )

The possible window functions are:

1. Rank Functions
   - rank
   - dense_rank
   - row_number
   - ntile

2. Analytical Functions
   - cume_dist
   - percent_rank
   - lag
   - lead
   - first_value
   - last_value
   - nth_value

3. Aggregate Functions
   - All aggregate functions can be used as window functions.
diff --git a/docs/source/user-guide/introduction.rst b/docs/source/user-guide/introduction.rst
new file mode 100644
index 00000000..c74ca5ab
--- /dev/null
+++ b/docs/source/user-guide/introduction.rst
@@ -0,0 +1,44 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

.. _guide:

Introduction
============

Welcome to the User Guide for the Python bindings of Arrow DataFusion. This guide aims to provide an introduction to
DataFusion through various examples and highlight the most effective ways of using it.

Installation
------------

DataFusion is a Python library and, as such, can be installed via pip from `PyPI <https://pypi.org/project/datafusion/>`__.

.. code-block:: shell

    pip install datafusion

You can verify the installation by running:

.. code-block:: python

    >>> import datafusion
    >>> datafusion.__version__
    '0.6.0'

diff --git a/docs/source/user-guide/io/avro.rst b/docs/source/user-guide/io/avro.rst
new file mode 100644
index 00000000..4be223a6
--- /dev/null
+++ b/docs/source/user-guide/io/avro.rst
@@ -0,0 +1,13 @@
Avro
====

`Avro <https://avro.apache.org/>`_ is a serialization format for record data. Reading an Avro file is very straightforward
with :meth:`.SessionContext.read_avro`.

.. 
code-block:: python

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.read_avro("file.avro")
\ No newline at end of file
diff --git a/docs/source/user-guide/io/csv.rst b/docs/source/user-guide/io/csv.rst
new file mode 100644
index 00000000..cde17466
--- /dev/null
+++ b/docs/source/user-guide/io/csv.rst
@@ -0,0 +1,19 @@
CSV
===

Reading a CSV is very straightforward with :meth:`.SessionContext.read_csv`.

.. code-block:: python

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.read_csv("file.csv")

An alternative is to use :meth:`.SessionContext.register_csv`.

.. code-block:: python

    ctx.register_csv("file", "file.csv")
    df = ctx.table("file")
\ No newline at end of file
diff --git a/docs/source/user-guide/io/index.rst b/docs/source/user-guide/io/index.rst
new file mode 100644
index 00000000..af08240f
--- /dev/null
+++ b/docs/source/user-guide/io/index.rst
@@ -0,0 +1,28 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

IO
==

.. toctree::
   :maxdepth: 2

   csv
   parquet
   json
   avro

diff --git a/docs/source/user-guide/io/json.rst b/docs/source/user-guide/io/json.rst
new file mode 100644
index 00000000..7c7976e7
--- /dev/null
+++ b/docs/source/user-guide/io/json.rst
@@ -0,0 +1,12 @@
JSON
====
`JSON <https://www.json.org>`_ (JavaScript Object Notation) is a lightweight data-interchange format.
When it comes to reading a JSON file, using :meth:`.SessionContext.read_json` is simple and easy.

.. code-block:: python

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.read_json("file.json")
\ No newline at end of file
diff --git a/docs/source/user-guide/io/parquet.rst b/docs/source/user-guide/io/parquet.rst
new file mode 100644
index 00000000..b3e88dce
--- /dev/null
+++ b/docs/source/user-guide/io/parquet.rst
@@ -0,0 +1,19 @@
Parquet
=======

It is quite simple to read a Parquet file using the :meth:`.SessionContext.read_parquet` function.

.. code-block:: python

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.read_parquet("file.parquet")

An alternative is to use :meth:`.SessionContext.register_parquet`.

.. code-block:: python

    ctx.register_parquet("file", "file.parquet")
    df = ctx.table("file")
\ No newline at end of file
diff --git a/docs/source/user-guide/sql.rst b/docs/source/user-guide/sql.rst
new file mode 100644
index 00000000..f54dca9c
--- /dev/null
+++ b/docs/source/user-guide/sql.rst
@@ -0,0 +1,22 @@
SQL
===

DataFusion also offers a SQL API; read the full SQL reference `here <https://arrow.apache.org/datafusion/user-guide/sql/index.html>`_.

.. 
ipython:: python

    import datafusion

    # create a context
    ctx = datafusion.SessionContext()

    # register a CSV
    ctx.register_csv('pokemon', 'pokemon.csv')

    # create a new statement via SQL
    df = ctx.sql('SELECT "Attack"+"Defense", "Attack"-"Defense" FROM pokemon')

    # collect and convert to pandas DataFrame
    df.to_pandas()
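
Since :code:`ctx.sql` returns a regular :code:`DataFrame`, the SQL and DataFrame APIs can be mixed. A minimal sketch, reusing the :code:`pokemon` table registered above:

.. ipython:: python

    # the SQL result is a DataFrame, so it can be refined further
    # with DataFrame methods before collecting
    df = ctx.sql('SELECT "Name", "Attack" FROM pokemon')
    df.limit(5).to_pandas()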