From b418622c947cfffef968e831ba83c33522cd4f00 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 14 Sep 2023 10:39:33 +0200 Subject: [PATCH] docs: Integrate user guide (#11089) --- .github/workflows/docs-global.yml | 85 +++++ .github/workflows/test-python.yml | 11 +- .gitignore | 38 +- CONTRIBUTING.md | 6 +- Makefile | 5 + README.md | 4 +- _typos.toml | 1 + crates/polars/src/docs/lazy.rs | 2 +- crates/polars/src/lib.rs | 4 +- docs/_build/API_REFERENCE_LINKS.yml | 264 ++++++++++++++ docs/_build/assets/logo.png | Bin 0 -> 10725 bytes docs/_build/css/extra.css | 64 ++++ docs/_build/overrides/404.html | 222 ++++++++++++ docs/_build/scripts/macro.py | 156 +++++++++ docs/_build/scripts/people.py | 38 ++ docs/_build/snippets/under_construction.md | 4 + docs/data/apple_stock.csv | 101 ++++++ docs/data/iris.csv | 151 ++++++++ docs/data/reddit.csv | 100 ++++++ docs/getting-started/expressions.md | 130 +++++++ docs/getting-started/installation.md | 31 ++ docs/getting-started/intro.md | 16 + docs/getting-started/joins.md | 26 ++ docs/getting-started/reading-writing.md | 45 +++ docs/getting-started/series-dataframes.md | 102 ++++++ docs/images/.gitignore | 1 + docs/index.md | 71 ++++ docs/requirements.txt | 9 + .../src/python/getting-started/expressions.py | 91 +++++ docs/src/python/getting-started/joins.py | 29 ++ .../python/getting-started/reading-writing.py | 41 +++ .../getting-started/series-dataframes.py | 64 ++++ docs/src/python/home/example.py | 12 + .../python/user-guide/concepts/contexts.py | 55 +++ .../python/user-guide/concepts/expressions.py | 16 + .../user-guide/concepts/lazy-vs-eager.py | 20 ++ .../python/user-guide/concepts/streaming.py | 12 + .../user-guide/expressions/aggregation.py | 169 +++++++++ .../python/user-guide/expressions/casting.py | 129 +++++++ .../expressions/column-selections.py | 91 +++++ .../python/user-guide/expressions/folds.py | 50 +++ .../user-guide/expressions/functions.py | 60 ++++ .../python/user-guide/expressions/lists.py | 111 ++++++ .../src/python/user-guide/expressions/null.py | 88 +++++ .../user-guide/expressions/numpy-example.py | 7 + .../user-guide/expressions/operators.py | 44 +++ .../python/user-guide/expressions/strings.py | 61 ++++ .../python/user-guide/expressions/structs.py | 66 ++++ .../expressions/user-defined-functions.py | 56 +++ .../python/user-guide/expressions/window.py | 84 +++++ docs/src/python/user-guide/io/aws.py | 14 + docs/src/python/user-guide/io/bigquery.py | 38 ++ docs/src/python/user-guide/io/csv.py | 19 + docs/src/python/user-guide/io/database.py | 32 ++ docs/src/python/user-guide/io/multiple.py | 41 +++ docs/src/python/user-guide/io/parquet.py | 19 + docs/src/python/user-guide/lazy/execution.py | 36 ++ docs/src/python/user-guide/lazy/query_plan.py | 48 +++ docs/src/python/user-guide/lazy/schema.py | 38 ++ docs/src/python/user-guide/lazy/using.py | 15 + .../python/user-guide/misc/multiprocess.py | 84 +++++ docs/src/python/user-guide/sql/create.py | 21 ++ docs/src/python/user-guide/sql/cte.py | 24 ++ docs/src/python/user-guide/sql/intro.py | 100 ++++++ docs/src/python/user-guide/sql/show.py | 26 ++ docs/src/python/user-guide/sql/sql_select.py | 106 ++++++ .../transformations/concatenation.py | 76 ++++ .../user-guide/transformations/joins.py | 150 ++++++++ .../python/user-guide/transformations/melt.py | 18 + .../user-guide/transformations/pivot.py | 31 ++ .../transformations/time-series/filter.py | 30 ++ .../transformations/time-series/parsing.py | 43 +++ .../transformations/time-series/resampling.py | 36 ++ 
.../transformations/time-series/rolling.py | 75 ++++ .../transformations/time-series/timezones.py | 27 ++ docs/src/rust/getting-started/expressions.rs | 144 ++++++++ docs/src/rust/getting-started/joins.rs | 29 ++ .../rust/getting-started/reading-writing.rs | 67 ++++ .../rust/getting-started/series-dataframes.rs | 59 ++++ docs/src/rust/home/example.rs | 16 + docs/src/rust/user-guide/concepts/contexts.rs | 69 ++++ .../rust/user-guide/concepts/expressions.rs | 24 ++ .../rust/user-guide/concepts/lazy-vs-eager.rs | 30 ++ .../src/rust/user-guide/concepts/streaming.rs | 19 + .../user-guide/expressions/aggregation.rs | 204 +++++++++++ .../rust/user-guide/expressions/casting.rs | 201 +++++++++++ .../expressions/column-selections.rs | 99 ++++++ docs/src/rust/user-guide/expressions/folds.rs | 49 +++ .../rust/user-guide/expressions/functions.rs | 79 +++++ docs/src/rust/user-guide/expressions/lists.rs | 162 +++++++++ docs/src/rust/user-guide/expressions/null.rs | 89 +++++ .../rust/user-guide/expressions/operators.rs | 54 +++ .../rust/user-guide/expressions/strings.rs | 93 +++++ .../rust/user-guide/expressions/structs.rs | 99 ++++++ .../expressions/user-defined-functions.rs | 84 +++++ .../src/rust/user-guide/expressions/window.rs | 131 +++++++ docs/src/rust/user-guide/io/aws.rs | 32 ++ docs/src/rust/user-guide/io/csv.rs | 29 ++ docs/src/rust/user-guide/io/json-file.rs | 47 +++ docs/src/rust/user-guide/io/parquet.rs | 30 ++ .../transformations/concatenation.rs | 49 +++ .../rust/user-guide/transformations/joins.rs | 205 +++++++++++ .../rust/user-guide/transformations/melt.rs | 21 ++ .../rust/user-guide/transformations/pivot.rs | 28 ++ .../transformations/time-series/filter.rs | 61 ++++ .../transformations/time-series/parsing.rs | 75 ++++ .../transformations/time-series/resampling.rs | 43 +++ .../transformations/time-series/rolling.rs | 130 +++++++ .../transformations/time-series/timezones.rs | 46 +++ docs/user-guide/concepts/contexts.md | 64 ++++ docs/user-guide/concepts/data-structures.md | 68 ++++ docs/user-guide/concepts/data-types.md | 31 ++ docs/user-guide/concepts/expressions.md | 49 +++ docs/user-guide/concepts/lazy-vs-eager.md | 28 ++ docs/user-guide/concepts/streaming.md | 21 ++ docs/user-guide/expressions/aggregation.md | 122 +++++++ docs/user-guide/expressions/casting.md | 100 ++++++ .../expressions/column-selections.md | 134 +++++++ docs/user-guide/expressions/folds.md | 43 +++ docs/user-guide/expressions/functions.md | 71 ++++ docs/user-guide/expressions/lists.md | 119 +++++++ docs/user-guide/expressions/null.md | 140 ++++++++ docs/user-guide/expressions/numpy.md | 22 ++ docs/user-guide/expressions/operators.md | 30 ++ docs/user-guide/expressions/strings.md | 62 ++++ docs/user-guide/expressions/structs.md | 99 ++++++ .../expressions/user-defined-functions.md | 187 ++++++++++ docs/user-guide/expressions/window.md | 91 +++++ docs/user-guide/index.md | 31 ++ docs/user-guide/installation.md | 174 ++++++++++ docs/user-guide/io/aws.md | 20 ++ docs/user-guide/io/bigquery.md | 19 + docs/user-guide/io/csv.md | 21 ++ docs/user-guide/io/database.md | 70 ++++ docs/user-guide/io/json_file.md | 26 ++ docs/user-guide/io/multiple.md | 40 +++ docs/user-guide/io/parquet.md | 24 ++ docs/user-guide/lazy/execution.md | 79 +++++ docs/user-guide/lazy/optimizations.md | 17 + docs/user-guide/lazy/query_plan.md | 96 +++++ docs/user-guide/lazy/schemas.md | 60 ++++ docs/user-guide/lazy/streaming.md | 3 + docs/user-guide/lazy/using.md | 37 ++ docs/user-guide/migration/pandas.md | 328 ++++++++++++++++++ 
docs/user-guide/migration/spark.md | 158 +++++++++ docs/user-guide/misc/alternatives.md | 66 ++++ docs/user-guide/misc/contributing.md | 11 + docs/user-guide/misc/multiprocessing.md | 104 ++++++ docs/user-guide/misc/reference-guides.md | 6 + docs/user-guide/sql/create.md | 28 ++ docs/user-guide/sql/cte.md | 27 ++ docs/user-guide/sql/intro.md | 106 ++++++ docs/user-guide/sql/select.md | 72 ++++ docs/user-guide/sql/show.md | 22 ++ .../transformations/concatenation.md | 51 +++ docs/user-guide/transformations/joins.md | 183 ++++++++++ docs/user-guide/transformations/melt.md | 21 ++ docs/user-guide/transformations/pivot.md | 46 +++ .../transformations/time-series/filter.md | 48 +++ .../transformations/time-series/parsing.md | 58 ++++ .../transformations/time-series/resampling.md | 42 +++ .../transformations/time-series/rolling.md | 148 ++++++++ .../transformations/time-series/timezones.md | 46 +++ mkdocs.yml | 163 +++++++++ py-polars/Makefile | 1 + py-polars/docs/source/conf.py | 2 +- py-polars/polars/expr/expr.py | 2 +- py-polars/pyproject.toml | 2 + py-polars/requirements-dev.txt | 2 + py-polars/tests/docs/test_user_guide.py | 32 ++ 170 files changed, 10733 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/docs-global.yml create mode 100644 docs/_build/API_REFERENCE_LINKS.yml create mode 100644 docs/_build/assets/logo.png create mode 100644 docs/_build/css/extra.css create mode 100644 docs/_build/overrides/404.html create mode 100644 docs/_build/scripts/macro.py create mode 100644 docs/_build/scripts/people.py create mode 100644 docs/_build/snippets/under_construction.md create mode 100644 docs/data/apple_stock.csv create mode 100644 docs/data/iris.csv create mode 100644 docs/data/reddit.csv create mode 100644 docs/getting-started/expressions.md create mode 100644 docs/getting-started/installation.md create mode 100644 docs/getting-started/intro.md create mode 100644 docs/getting-started/joins.md create mode 100644 docs/getting-started/reading-writing.md create mode 100644 docs/getting-started/series-dataframes.md create mode 100644 docs/images/.gitignore create mode 100644 docs/index.md create mode 100644 docs/requirements.txt create mode 100644 docs/src/python/getting-started/expressions.py create mode 100644 docs/src/python/getting-started/joins.py create mode 100644 docs/src/python/getting-started/reading-writing.py create mode 100644 docs/src/python/getting-started/series-dataframes.py create mode 100644 docs/src/python/home/example.py create mode 100644 docs/src/python/user-guide/concepts/contexts.py create mode 100644 docs/src/python/user-guide/concepts/expressions.py create mode 100644 docs/src/python/user-guide/concepts/lazy-vs-eager.py create mode 100644 docs/src/python/user-guide/concepts/streaming.py create mode 100644 docs/src/python/user-guide/expressions/aggregation.py create mode 100644 docs/src/python/user-guide/expressions/casting.py create mode 100644 docs/src/python/user-guide/expressions/column-selections.py create mode 100644 docs/src/python/user-guide/expressions/folds.py create mode 100644 docs/src/python/user-guide/expressions/functions.py create mode 100644 docs/src/python/user-guide/expressions/lists.py create mode 100644 docs/src/python/user-guide/expressions/null.py create mode 100644 docs/src/python/user-guide/expressions/numpy-example.py create mode 100644 docs/src/python/user-guide/expressions/operators.py create mode 100644 docs/src/python/user-guide/expressions/strings.py create mode 100644 docs/src/python/user-guide/expressions/structs.py 
create mode 100644 docs/src/python/user-guide/expressions/user-defined-functions.py create mode 100644 docs/src/python/user-guide/expressions/window.py create mode 100644 docs/src/python/user-guide/io/aws.py create mode 100644 docs/src/python/user-guide/io/bigquery.py create mode 100644 docs/src/python/user-guide/io/csv.py create mode 100644 docs/src/python/user-guide/io/database.py create mode 100644 docs/src/python/user-guide/io/multiple.py create mode 100644 docs/src/python/user-guide/io/parquet.py create mode 100644 docs/src/python/user-guide/lazy/execution.py create mode 100644 docs/src/python/user-guide/lazy/query_plan.py create mode 100644 docs/src/python/user-guide/lazy/schema.py create mode 100644 docs/src/python/user-guide/lazy/using.py create mode 100644 docs/src/python/user-guide/misc/multiprocess.py create mode 100644 docs/src/python/user-guide/sql/create.py create mode 100644 docs/src/python/user-guide/sql/cte.py create mode 100644 docs/src/python/user-guide/sql/intro.py create mode 100644 docs/src/python/user-guide/sql/show.py create mode 100644 docs/src/python/user-guide/sql/sql_select.py create mode 100644 docs/src/python/user-guide/transformations/concatenation.py create mode 100644 docs/src/python/user-guide/transformations/joins.py create mode 100644 docs/src/python/user-guide/transformations/melt.py create mode 100644 docs/src/python/user-guide/transformations/pivot.py create mode 100644 docs/src/python/user-guide/transformations/time-series/filter.py create mode 100644 docs/src/python/user-guide/transformations/time-series/parsing.py create mode 100644 docs/src/python/user-guide/transformations/time-series/resampling.py create mode 100644 docs/src/python/user-guide/transformations/time-series/rolling.py create mode 100644 docs/src/python/user-guide/transformations/time-series/timezones.py create mode 100644 docs/src/rust/getting-started/expressions.rs create mode 100644 docs/src/rust/getting-started/joins.rs create mode 100644 docs/src/rust/getting-started/reading-writing.rs create mode 100644 docs/src/rust/getting-started/series-dataframes.rs create mode 100644 docs/src/rust/home/example.rs create mode 100644 docs/src/rust/user-guide/concepts/contexts.rs create mode 100644 docs/src/rust/user-guide/concepts/expressions.rs create mode 100644 docs/src/rust/user-guide/concepts/lazy-vs-eager.rs create mode 100644 docs/src/rust/user-guide/concepts/streaming.rs create mode 100644 docs/src/rust/user-guide/expressions/aggregation.rs create mode 100644 docs/src/rust/user-guide/expressions/casting.rs create mode 100644 docs/src/rust/user-guide/expressions/column-selections.rs create mode 100644 docs/src/rust/user-guide/expressions/folds.rs create mode 100644 docs/src/rust/user-guide/expressions/functions.rs create mode 100644 docs/src/rust/user-guide/expressions/lists.rs create mode 100644 docs/src/rust/user-guide/expressions/null.rs create mode 100644 docs/src/rust/user-guide/expressions/operators.rs create mode 100644 docs/src/rust/user-guide/expressions/strings.rs create mode 100644 docs/src/rust/user-guide/expressions/structs.rs create mode 100644 docs/src/rust/user-guide/expressions/user-defined-functions.rs create mode 100644 docs/src/rust/user-guide/expressions/window.rs create mode 100644 docs/src/rust/user-guide/io/aws.rs create mode 100644 docs/src/rust/user-guide/io/csv.rs create mode 100644 docs/src/rust/user-guide/io/json-file.rs create mode 100644 docs/src/rust/user-guide/io/parquet.rs create mode 100644 docs/src/rust/user-guide/transformations/concatenation.rs 
create mode 100644 docs/src/rust/user-guide/transformations/joins.rs create mode 100644 docs/src/rust/user-guide/transformations/melt.rs create mode 100644 docs/src/rust/user-guide/transformations/pivot.rs create mode 100644 docs/src/rust/user-guide/transformations/time-series/filter.rs create mode 100644 docs/src/rust/user-guide/transformations/time-series/parsing.rs create mode 100644 docs/src/rust/user-guide/transformations/time-series/resampling.rs create mode 100644 docs/src/rust/user-guide/transformations/time-series/rolling.rs create mode 100644 docs/src/rust/user-guide/transformations/time-series/timezones.rs create mode 100644 docs/user-guide/concepts/contexts.md create mode 100644 docs/user-guide/concepts/data-structures.md create mode 100644 docs/user-guide/concepts/data-types.md create mode 100644 docs/user-guide/concepts/expressions.md create mode 100644 docs/user-guide/concepts/lazy-vs-eager.md create mode 100644 docs/user-guide/concepts/streaming.md create mode 100644 docs/user-guide/expressions/aggregation.md create mode 100644 docs/user-guide/expressions/casting.md create mode 100644 docs/user-guide/expressions/column-selections.md create mode 100644 docs/user-guide/expressions/folds.md create mode 100644 docs/user-guide/expressions/functions.md create mode 100644 docs/user-guide/expressions/lists.md create mode 100644 docs/user-guide/expressions/null.md create mode 100644 docs/user-guide/expressions/numpy.md create mode 100644 docs/user-guide/expressions/operators.md create mode 100644 docs/user-guide/expressions/strings.md create mode 100644 docs/user-guide/expressions/structs.md create mode 100644 docs/user-guide/expressions/user-defined-functions.md create mode 100644 docs/user-guide/expressions/window.md create mode 100644 docs/user-guide/index.md create mode 100644 docs/user-guide/installation.md create mode 100644 docs/user-guide/io/aws.md create mode 100644 docs/user-guide/io/bigquery.md create mode 100644 docs/user-guide/io/csv.md create mode 100644 docs/user-guide/io/database.md create mode 100644 docs/user-guide/io/json_file.md create mode 100644 docs/user-guide/io/multiple.md create mode 100644 docs/user-guide/io/parquet.md create mode 100644 docs/user-guide/lazy/execution.md create mode 100644 docs/user-guide/lazy/optimizations.md create mode 100644 docs/user-guide/lazy/query_plan.md create mode 100644 docs/user-guide/lazy/schemas.md create mode 100644 docs/user-guide/lazy/streaming.md create mode 100644 docs/user-guide/lazy/using.md create mode 100644 docs/user-guide/migration/pandas.md create mode 100644 docs/user-guide/migration/spark.md create mode 100644 docs/user-guide/misc/alternatives.md create mode 100644 docs/user-guide/misc/contributing.md create mode 100644 docs/user-guide/misc/multiprocessing.md create mode 100644 docs/user-guide/misc/reference-guides.md create mode 100644 docs/user-guide/sql/create.md create mode 100644 docs/user-guide/sql/cte.md create mode 100644 docs/user-guide/sql/intro.md create mode 100644 docs/user-guide/sql/select.md create mode 100644 docs/user-guide/sql/show.md create mode 100644 docs/user-guide/transformations/concatenation.md create mode 100644 docs/user-guide/transformations/joins.md create mode 100644 docs/user-guide/transformations/melt.md create mode 100644 docs/user-guide/transformations/pivot.md create mode 100644 docs/user-guide/transformations/time-series/filter.md create mode 100644 docs/user-guide/transformations/time-series/parsing.md create mode 100644 
docs/user-guide/transformations/time-series/resampling.md create mode 100644 docs/user-guide/transformations/time-series/rolling.md create mode 100644 docs/user-guide/transformations/time-series/timezones.md create mode 100644 mkdocs.yml create mode 100644 py-polars/tests/docs/test_user_guide.py diff --git a/.github/workflows/docs-global.yml b/.github/workflows/docs-global.yml new file mode 100644 index 000000000000..801449af6e02 --- /dev/null +++ b/.github/workflows/docs-global.yml @@ -0,0 +1,85 @@ +name: Build documentation + +on: + pull_request: + paths: + - docs/** + - mkdocs.yml + - .github/workflows/docs-global.yml + push: + tags: + - py-** + +jobs: + markdown-link-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: gaurav-nelson/github-action-markdown-link-check@v1 + with: + folder-path: docs + + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: psf/black@stable + with: + src: docs/src/python + version: "23.7.0" + + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Create virtual environment + run: | + python -m venv .venv + echo "$GITHUB_WORKSPACE/.venv/bin" >> $GITHUB_PATH + + - name: Install dependencies + run: | + pip install -r py-polars/requirements-dev.txt + pip install -r docs/requirements.txt + + - name: Set up Rust + run: rustup show + + - name: Cache Rust + uses: Swatinem/rust-cache@v2 + with: + workspaces: py-polars + save-if: ${{ github.ref_name == 'main' }} + + - name: Install Polars + working-directory: py-polars + run: | + source activate + maturin develop + + - name: Set up Graphviz + uses: ts-graphviz/setup-graphviz@v1 + + - name: Build documentation + run: mkdocs build + + - name: Add .nojekyll + if: ${{ github.ref_type == 'tag' }} + working-directory: site + run: touch .nojekyll + + - name: Deploy docs + if: ${{ github.ref_type == 'tag' }} + uses: JamesIves/github-pages-deploy-action@v4 + with: + folder: site + clean-exclude: | + docs/ + py-polars/ + single-commit: true diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml index 1d9517d0ed5c..eb17b0b6ed3b 100644 --- a/.github/workflows/test-python.yml +++ b/.github/workflows/test-python.yml @@ -41,6 +41,9 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Set up Graphviz + uses: ts-graphviz/setup-graphviz@v1 + - name: Create virtual environment run: | python -m venv .venv @@ -65,11 +68,13 @@ jobs: - name: Run tests and report coverage if: github.ref_name != 'main' - run: pytest --cov -n auto --dist loadgroup -m "not benchmark" + run: pytest --cov -n auto --dist loadgroup -m "not benchmark and not docs" - name: Run doctests if: github.ref_name != 'main' - run: python tests/docs/run_doctest.py + run: | + python tests/docs/run_doctest.py + pytest tests/docs/test_user_guide.py -m docs - name: Check import without optional dependencies if: github.ref_name != 'main' @@ -125,7 +130,7 @@ jobs: - name: Run tests if: github.ref_name != 'main' - run: pytest -n auto --dist loadgroup -m "not benchmark" + run: pytest -n auto --dist loadgroup -m "not benchmark and not docs" - name: Check import without optional dependencies if: github.ref_name != 'main' diff --git a/.gitignore b/.gitignore index 1dd5ecb4236f..5eb602ae7f52 100644 --- a/.gitignore +++ b/.gitignore @@ -1,27 +1,37 @@ *.iml *.so *.ipynb -.DS_Store .ENV -.coverage .env -.hypothesis/ -.idea/ .ipynb_checkpoints/ -.mypy_cache/ -.pytest_cache/ 
.python-version .yarn/ -.vscode/ -__pycache__/ -AUTO_CHANGELOG.md -Cargo.lock coverage.lcov coverage.xml data/ -node_modules/ polars/vendor -target/ -venv*/ -.venv*/ + +# OS +.DS_Store + +# IDE +.idea/ +.vscode/ .vim + +# Python +.hypothesis/ +.mypy_cache/ +.pytest_cache/ +.venv/ +__pycache__/ +.coverage + +# Rust +target/ +Cargo.lock + +# Project +/docs/data/ +/docs/images/ +/docs/people.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 315ac4c8acd8..cc0993ff47e3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -148,12 +148,12 @@ If you are stuck or unsure about your solution, feel free to open a draft pull r ## Contributing to documentation -The most important components of Polars documentation are the [user guide](https://pola-rs.github.io/polars-book/user-guide/), the API references, and the database of questions on [StackOverflow](https://stackoverflow.com/). +The most important components of Polars documentation are the [user guide](https://pola-rs.github.io/polars/user-guide/), the API references, and the database of questions on [StackOverflow](https://stackoverflow.com/). ### User guide -The user guide is maintained in the [polars-book](https://github.com/pola-rs/polars-book) repository. -For contributing to the user guide, please refer to the [contributing guide](https://github.com/pola-rs/polars-book/blob/master/CONTRIBUTING.md) in that repository. +The user guide is maintained in the `docs` folder. +Further contributing information will be added shortly. ### API reference diff --git a/Makefile b/Makefile index 532342913f97..54d1bd6d4404 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,11 @@ requirements: .venv ## Install/refresh Python project requirements $(VENV_BIN)/pip install --upgrade -r py-polars/requirements-dev.txt $(VENV_BIN)/pip install --upgrade -r py-polars/requirements-lint.txt $(VENV_BIN)/pip install --upgrade -r py-polars/docs/requirements-docs.txt + $(VENV_BIN)/pip install --upgrade -r docs/requirements.txt + +.PHONY: build-python +build-python: .venv ## Compile and install Python Polars for development + @$(MAKE) -s -C py-polars build .PHONY: clean clean: ## Clean up caches and build artifacts diff --git a/README.md b/README.md index 80857d303ab3..01ba31396c62 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ - R | - User Guide + User Guide | Discord

@@ -58,7 +58,7 @@ Polars is a DataFrame interface on top of an OLAP Query Engine implemented in Ru - Hybrid Streaming (larger than RAM datasets) - Rust | Python | NodeJS | R | ... -To learn more, read the [User Guide](https://pola-rs.github.io/polars-book/). +To learn more, read the [User Guide](https://pola-rs.github.io/polars/). ## Python diff --git a/_typos.toml b/_typos.toml index 12406b2f4ea8..4d9ec510b278 100644 --- a/_typos.toml +++ b/_typos.toml @@ -7,6 +7,7 @@ extend-ignore-identifiers-re = [ ba = "ba" Fo = "Fo" nd = "nd" +ND = "ND" opt_nd = "opt_nd" ser = "ser" strat = "strat" diff --git a/crates/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs index fb0c7dfd2d9e..44b536914ce1 100644 --- a/crates/polars/src/docs/lazy.rs +++ b/crates/polars/src/docs/lazy.rs @@ -106,7 +106,7 @@ //! //! ## Groupby //! -//! This example is from the polars [user guide](https://pola-rs.github.io/polars-book/user-guide/concepts/contexts/#group_by-aggregation). +//! This example is from the polars [user guide](https://pola-rs.github.io/polars/user-guide/concepts/contexts/#group_by-aggregation). //! //! ``` //! use polars::prelude::*; diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index bd6affefb10e..a9c1aeb879b9 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -147,7 +147,7 @@ //! (Note that within an expression there may be more parallelization going on). //! //! Understanding polars expressions is most important when starting with the polars library. Read more -//! about them in the [User Guide](https://pola-rs.github.io/polars-book/user-guide/concepts/expressions). +//! about them in the [User Guide](https://pola-rs.github.io/polars/user-guide/concepts/expressions). //! Though the examples given there are in python. The expressions API is almost identical and the //! the read should certainly be valuable to rust users as well. //! @@ -397,7 +397,7 @@ //! * `POLARS_NO_CHUNKED_JOIN` -> force rechunk before joins. //! //! ## User Guide -//! If you want to read more, [check the User Guide](https://pola-rs.github.io/polars-book/). +//! If you want to read more, [check the User Guide](https://pola-rs.github.io/polars/). 
#![cfg_attr(docsrs, feature(doc_auto_cfg))] #![allow(ambiguous_glob_reexports)] pub mod docs; diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml new file mode 100644 index 000000000000..4e028d99a8b2 --- /dev/null +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -0,0 +1,264 @@ +python: + DataFrame: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html + Categorical: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.Categorical.html + Series: https://pola-rs.github.io/polars/py-polars/html/reference/series/index.html + select: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.select.html + filter: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.filter.html + with_columns: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.with_columns.html + group_by: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.group_by.html + join: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join.html + hstack: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.hstack.html + read_csv: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_csv.html + write_csv: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_csv.html + read_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_json.html + write_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_json.html + read_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html + write_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_parquet.html + min: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.min.html + max: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.max.html + value_counts: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.value_counts.html + unnest: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.unnest.html + field: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.field.html + struct: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.struct.html + rename_fields: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.rename_fields.html + is_duplicated: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.is_duplicated.html + replace: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.str.replace.html + sample: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.sample.html + day: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.dt.day.html + head: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.head.html + tail: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.tail.html + describe: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.describe.html + col: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.col.html + sort: 
https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.sort.html + scan_csv: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_csv.html + collect: https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/api/polars.LazyFrame.collect.html + fold: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.fold.html + concat_str: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.concat_str.html + str.split: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.split.html + Expr.List: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/list.html + element: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.element.html + all: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.all.html + exclude: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.exclude.html + alias: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.alias.html + prefix: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.prefix.html + suffix: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.suffix.html + map_alias: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_alias.html + n_unique: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.n_unique.html + approx_n_unique: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.approx_n_unique.html + when: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.when.html + concat_list: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.concat_list.html + list.eval: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.list.eval.html + null_count: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.null_count.html + is_null: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.is_null.html + fill_null: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.fill_null.html + interpolate: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.interpolate.html + fill_nan: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.fill_nan.html + operators: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/operators.html + map: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.map.html + apply: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.apply.html + over: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.over.html + implode: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.implode.html + dt_to_string: + link: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.dt.to_string.html + name: dt.to_string + selectors: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html + cs_numeric: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.numeric + name: cs.numeric + cs_by_name: + link: 
https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.by_name + name: cs.by_name + cs_first: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.first + name: cs.first + cs_temporal: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.temporal + name: cs.temporal + cs_contains: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.contains + name: cs.contains + cs_matches: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.matches + name: cs.matches + is_selector: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.is_selector + name: is_selector + selector_column_names: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.selector_column_names + name: selector_column_names + DataFrame.explode: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.explode.html + read_database_connectorx: + name: read_database + link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_database.html + feature_flags: ['connectorx'] + read_database: + name: read_database + link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_database.html + write_database: + name: write_database + link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_database.html + read_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html + write_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_parquet.html + scan_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_parquet.html + read_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_json.html + read_ndjson: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_ndjson.html + write_ndjson: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_ndjson.html + write_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_json.html + scan_ndjson: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_ndjson.html + from_arrow: + name: from_arrow + link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.from_arrow.html + feature_flags: ['fsspec','pyarrow'] + show_graph: https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/api/polars.LazyFrame.show_graph.html + lazy: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.lazy.html + explain: https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/api/polars.LazyFrame.explain.html + fetch: https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/api/polars.LazyFrame.fetch.html + SQLContext: https://pola-rs.github.io/polars/py-polars/html/reference/sql + SQLregister: + name: register + link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.SQLContext.register.html#polars.SQLContext.register + SQLregister_many: + name: register_many + link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.SQLContext.register_many.html + SQLquery: + name: query + link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.SQLContext.query.html + 
SQLexecute: + name: execute + link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.SQLContext.execute.html + join_asof: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html + concat: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.concat.html + pivot: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.pivot.html + melt: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html + is_between: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.is_between.html + strftime: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.dt.strftime.html + strptime: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.strptime.html + year: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.dt.year.html + convert_time_zone: + name: convert_time_zone + link: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.dt.convert_time_zone.html + feature_flags: ['timezone'] + replace_time_zone: + name: replace_time_zone + link: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.dt.replace_time_zone.html + feature_flags: ['timezone'] + date_range: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.date_range.html + upsample: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.upsample.html + group_by_dynamic: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.group_by_dynamic.html + explode: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.explode.html + cast: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.cast.html + np.log: + name: log + link: https://numpy.org/doc/stable/reference/generated/numpy.log.html + feature_flags: ['numpy'] + lengths: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.lengths.html + n_chars: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.n_chars.html + str.contains: + name: str.contains + link: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.contains.html + starts_with: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.starts_with.html + ends_with: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.ends_with.html + extract: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.extract.html + extract_all: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.extract_all.html + replace: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.replace.html + replace_all: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.replace_all.html + Array: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.Array.html + arr: https://pola-rs.github.io/polars/py-polars/html/reference/series/array.html + +rust: + DataFrame: https://pola-rs.github.io/polars/docs/rust/dev/polars/frame/struct.DataFrame.html + Series: https://pola-rs.github.io/polars/docs/rust/dev/polars/series/struct.Series.html + Categorical: 
+ name: Categorical + link: https://pola-rs.github.io/polars/docs/rust/dev/polars/prelude/enum.DataType.html#variant.Categorical + feature_flags: ['dtype-categorical'] + select: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.select + filter: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.filter + with_columns: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.with_columns + group_by: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by + join: https://pola-rs.github.io/polars/docs/rust/dev/polars_core/frame/hash_join/index.html + hstack: https://pola-rs.github.io/polars/docs/rust/dev/polars_core/frame/struct.DataFrame.html#method.hstack + SQLContext: https://pola-rs.github.io/polars/py-polars/html/reference/sql.html + read_csv: + name: CsvReader + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/csv/struct.CsvReader.html + feature_flags: ['csv'] + scan_csv: + name: LazyCsvReader + link: https://pola-rs.github.io/polars/docs/rust/dev/polars/prelude/struct.LazyCsvReader.html + feature_flags: ['csv'] + write_csv: + name: CsvWriter + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/csv/struct.CsvWriter.html + feature_flags: ['csv'] + read_json: + name: JsonReader + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/json/struct.JsonReader.html + feature_flags: ['json'] + read_ndjson: + name: JsonLineReader + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/ndjson_core/ndjson/struct.JsonLineReader.html + feature_flags: ['json'] + write_json: + name: JsonWriter + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/json/struct.JsonWriter.html + feature_flags: ['json'] + write_ndjson: + name: JsonWriter + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/json/struct.JsonWriter.html + feature_flags: ['json'] + scan_ndjson: + name: LazyJsonLineReader + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/frame/struct.LazyJsonLineReader.html + feature_flags: ['json'] + read_parquet: + name: ParquetReader + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/parquet/struct.ParquetReader.html + feature_flags: ['parquet'] + write_parquet: + name: ParquetWriter + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/parquet/struct.ParquetWriter.html + feature_flags: ['parquet'] + scan_parquet: + name: scan_parquet + link: https://pola-rs.github.io/polars/docs/rust/dev/polars/prelude/struct.LazyFrame.html#method.scan_parquet + feature_flags: ['parquet'] + min: https://pola-rs.github.io/polars/docs/rust/dev/polars/series/struct.Series.html#method.min + max: https://pola-rs.github.io/polars/docs/rust/dev/polars/series/struct.Series.html#method.max + struct: + name: Struct + link: https://pola-rs.github.io/polars/docs/rust/dev/polars/datatypes/enum.DataType.html#variant.Struct + feature_flags: ['dtype-struct'] + implode: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.implode + sample: + name: sample_n + link: https://pola-rs.github.io/polars/docs/rust/dev/polars/frame/struct.DataFrame.html#method.sample_n + head: https://pola-rs.github.io/polars/docs/rust/dev/polars/frame/struct.DataFrame.html#method.head + tail: https://pola-rs.github.io/polars/docs/rust/dev/polars/frame/struct.DataFrame.html#method.tail + describe: + name: describe + link: 
https://pola-rs.github.io/polars/docs/rust/dev/polars/frame/struct.DataFrame.html#method.describe
+    feature_flags: ['describe']
+  collect:
+    name: collect
+    link: https://pola-rs.github.io/polars/docs/rust/dev/polars/prelude/struct.LazyFrame.html#method.collect
+    feature_flags: ['streaming']
+  col: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/fn.col.html
+  sort: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.sort
+  arr.eval:
+    name: arr
+    link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.arr
+    feature_flags: ['list_eval','rank']
+  fold:
+    name: fold_exprs
+    link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/fn.fold_exprs.html
+  concat_str:
+    name: concat_str
+    link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/fn.concat_str.html
+    feature_flags: ['concat_str']
+  concat_list:
+    name: concat_lst
+    link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/fn.concat_lst.html
+  map: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.map
+  apply: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.apply
+  over: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.over
diff --git a/docs/_build/assets/logo.png b/docs/_build/assets/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b5486edce3bf6483fc211621bddbb2cb99290b6
GIT binary patch
literal 10725
[binary image data omitted]
diff --git a/docs/_build/css/extra.css b/docs/_build/css/extra.css
new file mode 100644
index 000000000000..420db3966780
--- /dev/null
+++ b/docs/_build/css/extra.css
@@ -0,0 +1,64 @@
+:root {
+    --md-primary-fg-color: #0B7189 ;
+    --md-primary-fg-color--light: #C2CCD6;
+    --md-primary-fg-color--dark: #103547;
+    --md-text-font: 'Proxima Nova', sans-serif;
+}
+
+
+span .md-typeset .emojione, .md-typeset .gemoji, .md-typeset .twemoji {
+    vertical-align: text-bottom;
+}
+
+@font-face {
+    font-family: 'Proxima Nova', sans-serif;
+    src: 'https://fonts.cdnfonts.com/css/proxima-nova-2'
+}
+
+:root {
+    --md-code-font: "Source Code Pro" !important;
+}
+
+.contributor_icon {
+    height:40px;
+    width:40px;
+    border-radius: 20px;
+    margin: 0 5px;
+}
+
+.feature-flag{
+    background-color: rgba(255, 245, 214,.5);
+    border: none;
+    padding: 0px 5px;
+    text-align: center;
+    text-decoration: none;
+    display: inline-block;
+    margin: 4px 2px;
+    cursor: pointer;
+    font-size: .85em;
+}
+
+[data-md-color-scheme=slate] .feature-flag{
+    background-color:var(--md-code-bg-color);
+}
+.md-typeset ol li, .md-typeset ul li{
+    margin-bottom: 0em !important;
+}
+
+:root {
+    --md-admonition-icon--rust: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'%3E%3C!--!
Font Awesome Free 6.4.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--%3E%3Cpath d='m508.52 249.75-21.82-13.51c-.17-2-.34-3.93-.55-5.88l18.72-17.5a7.35 7.35 0 0 0-2.44-12.25l-24-9c-.54-1.88-1.08-3.78-1.67-5.64l15-20.83a7.35 7.35 0 0 0-4.79-11.54l-25.42-4.15c-.9-1.73-1.79-3.45-2.73-5.15l10.68-23.42a7.35 7.35 0 0 0-6.95-10.39l-25.82.91q-1.79-2.22-3.61-4.4L439 81.84a7.36 7.36 0 0 0-8.84-8.84L405 78.93q-2.17-1.83-4.4-3.61l.91-25.82a7.35 7.35 0 0 0-10.39-7L367.7 53.23c-1.7-.94-3.43-1.84-5.15-2.73l-4.15-25.42a7.35 7.35 0 0 0-11.54-4.79L326 35.26c-1.86-.59-3.75-1.13-5.64-1.67l-9-24a7.35 7.35 0 0 0-12.25-2.44l-17.5 18.72c-1.95-.21-3.91-.38-5.88-.55L262.25 3.48a7.35 7.35 0 0 0-12.5 0L236.24 25.3c-2 .17-3.93.34-5.88.55l-17.5-18.72a7.35 7.35 0 0 0-12.25 2.44l-9 24c-1.89.55-3.79 1.08-5.66 1.68l-20.82-15a7.35 7.35 0 0 0-11.54 4.79l-4.15 25.41c-1.73.9-3.45 1.79-5.16 2.73l-23.4-10.63a7.35 7.35 0 0 0-10.39 7l.92 25.81c-1.49 1.19-3 2.39-4.42 3.61L81.84 73A7.36 7.36 0 0 0 73 81.84L78.93 107c-1.23 1.45-2.43 2.93-3.62 4.41l-25.81-.91a7.42 7.42 0 0 0-6.37 3.26 7.35 7.35 0 0 0-.57 7.13l10.66 23.41c-.94 1.7-1.83 3.43-2.73 5.16l-25.41 4.14a7.35 7.35 0 0 0-4.79 11.54l15 20.82c-.59 1.87-1.13 3.77-1.68 5.66l-24 9a7.35 7.35 0 0 0-2.44 12.25l18.72 17.5c-.21 1.95-.38 3.91-.55 5.88l-21.86 13.5a7.35 7.35 0 0 0 0 12.5l21.82 13.51c.17 2 .34 3.92.55 5.87l-18.72 17.5a7.35 7.35 0 0 0 2.44 12.25l24 9c.55 1.89 1.08 3.78 1.68 5.65l-15 20.83a7.35 7.35 0 0 0 4.79 11.54l25.42 4.15c.9 1.72 1.79 3.45 2.73 5.14l-10.63 23.43a7.35 7.35 0 0 0 .57 7.13 7.13 7.13 0 0 0 6.37 3.26l25.83-.91q1.77 2.22 3.6 4.4L73 430.16a7.36 7.36 0 0 0 8.84 8.84l25.16-5.93q2.18 1.83 4.41 3.61l-.92 25.82a7.35 7.35 0 0 0 10.39 6.95l23.43-10.68c1.69.94 3.42 1.83 5.14 2.73l4.15 25.42a7.34 7.34 0 0 0 11.54 4.78l20.83-15c1.86.6 3.76 1.13 5.65 1.68l9 24a7.36 7.36 0 0 0 12.25 2.44l17.5-18.72c1.95.21 3.92.38 5.88.55l13.51 21.82a7.35 7.35 0 0 0 12.5 0l13.51-21.82c2-.17 3.93-.34 5.88-.56l17.5 18.73a7.36 7.36 0 0 0 12.25-2.44l9-24c1.89-.55 3.78-1.08 5.65-1.68l20.82 15a7.34 7.34 0 0 0 11.54-4.78l4.15-25.42c1.72-.9 3.45-1.79 5.15-2.73l23.42 10.68a7.35 7.35 0 0 0 10.39-6.95l-.91-25.82q2.22-1.79 4.4-3.61l25.15 5.93a7.36 7.36 0 0 0 8.84-8.84L433.07 405q1.83-2.17 3.61-4.4l25.82.91a7.23 7.23 0 0 0 6.37-3.26 7.35 7.35 0 0 0 .58-7.13l-10.68-23.42c.94-1.7 1.83-3.43 2.73-5.15l25.42-4.15a7.35 7.35 0 0 0 4.79-11.54l-15-20.83c.59-1.87 1.13-3.76 1.67-5.65l24-9a7.35 7.35 0 0 0 2.44-12.25l-18.72-17.5c.21-1.95.38-3.91.55-5.87l21.82-13.51a7.35 7.35 0 0 0 0-12.5Zm-151 129.08A13.91 13.91 0 0 0 341 389.51l-7.64 35.67a187.51 187.51 0 0 1-156.36-.74l-7.64-35.66a13.87 13.87 0 0 0-16.46-10.68l-31.51 6.76a187.38 187.38 0 0 1-16.26-19.21H258.3c1.72 0 2.89-.29 2.89-1.91v-54.19c0-1.57-1.17-1.91-2.89-1.91h-44.83l.05-34.35H262c4.41 0 23.66 1.28 29.79 25.87 1.91 7.55 6.17 32.14 9.06 40 2.89 8.82 14.6 26.46 27.1 26.46H407a187.3 187.3 0 0 1-17.34 20.09Zm25.77 34.49A15.24 15.24 0 1 1 368 398.08h.44a15.23 15.23 0 0 1 14.8 15.24Zm-225.62-.68a15.24 15.24 0 1 1-15.25-15.25h.45a15.25 15.25 0 0 1 14.75 15.25Zm-88.1-178.49 32.83-14.6a13.88 13.88 0 0 0 7.06-18.33L102.69 186h26.56v119.73h-53.6a187.65 187.65 0 0 1-6.08-71.58Zm-11.26-36.06a15.24 15.24 0 0 1 15.23-15.25H74a15.24 15.24 0 1 1-15.67 15.24Zm155.16 24.49.05-35.32h63.26c3.28 0 23.07 3.77 23.07 18.62 0 12.29-15.19 16.7-27.68 16.7ZM399 306.71c-9.8 1.13-20.63-4.12-22-10.09-5.78-32.49-15.39-39.4-30.57-51.4 18.86-11.95 38.46-29.64 
38.46-53.26 0-25.52-17.49-41.59-29.4-49.48-16.76-11-35.28-13.23-40.27-13.23h-198.9a187.49 187.49 0 0 1 104.89-59.19l23.47 24.6a13.82 13.82 0 0 0 19.6.44l26.26-25a187.51 187.51 0 0 1 128.37 91.43l-18 40.57a14 14 0 0 0 7.09 18.33l34.59 15.33a187.12 187.12 0 0 1 .4 32.54h-19.28c-1.91 0-2.69 1.27-2.69 3.13v8.82C421 301 409.31 305.58 399 306.71ZM240 60.21A15.24 15.24 0 0 1 255.21 45h.45A15.24 15.24 0 1 1 240 60.21ZM436.84 214a15.24 15.24 0 1 1 0-30.48h.44a15.24 15.24 0 0 1-.44 30.48Z'/%3E%3C/svg%3E"); + } + .md-typeset .admonition.rust, + .md-typeset details.rust { + border-color: rgb(205, 121, 44); + } + .md-typeset .rust > .admonition-title, + .md-typeset .rust > summary { + background-color: rgb(205, 121, 44,.1); + } + .md-typeset .rust > .admonition-title::before, + .md-typeset .rust > summary::before { + background-color:rgb(205, 121, 44); + -webkit-mask-image: var(--md-admonition-icon--rust); + mask-image: var(--md-admonition-icon--rust); + } \ No newline at end of file diff --git a/docs/_build/overrides/404.html b/docs/_build/overrides/404.html new file mode 100644 index 000000000000..ee9b8faa2aba --- /dev/null +++ b/docs/_build/overrides/404.html @@ -0,0 +1,222 @@ +{% extends "main.html" %} +{% block content %} +
+ [non-text page layout markup omitted]
+ 404 - You're lost.
The page you are looking for no longer exists. + How you got here is a mystery. But you can click the button below + to go back to the homepage or use the search bar in the navigation menu to find what you are looking for.

+ Home +
+{% endblock %} diff --git a/docs/_build/scripts/macro.py b/docs/_build/scripts/macro.py new file mode 100644 index 000000000000..d93d5170adec --- /dev/null +++ b/docs/_build/scripts/macro.py @@ -0,0 +1,156 @@ +from collections import OrderedDict +import os +from typing import List, Optional, Set +import yaml +import logging + + +# Supported Languages and their metadata +LANGUAGES = OrderedDict( + python={ + "extension": ".py", + "display_name": "Python", + "icon_name": "python", + "code_name": "python", + }, + rust={ + "extension": ".rs", + "display_name": "Rust", + "icon_name": "rust", + "code_name": "rust", + }, +) + +# Load all links to reference docs +with open("docs/_build/API_REFERENCE_LINKS.yml", "r") as f: + API_REFERENCE_LINKS = yaml.load(f, Loader=yaml.CLoader) + + +def create_feature_flag_link(feature_name: str) -> str: + """Create a feature flag warning telling the user to activate a certain feature before running the code + + Args: + feature_name (str): name of the feature + + Returns: + str: Markdown formatted string with a link and the feature flag message + """ + return f'[:material-flag-plus: Available on feature {feature_name}](/polars/user-guide/installation/#feature-flags "To use this functionality enable the feature flag {feature_name}"){{.feature-flag}}' + + +def create_feature_flag_links(language: str, api_functions: List[str]) -> List[str]: + """Generate markdown feature flags for the code tas based on the api_functions. + It checks for the key feature_flag in the configuration yaml for the function and if it exists print out markdown + + Args: + language (str): programming languages + api_functions (List[str]): Api functions that are called + + Returns: + List[str]: Per unique feature flag a markdown formatted string for the feature flag + """ + api_functions_info = [ + info + for f in api_functions + if (info := API_REFERENCE_LINKS.get(language).get(f)) + ] + feature_flags: Set[str] = { + flag + for info in api_functions_info + if type(info) == dict and info.get("feature_flags") + for flag in info.get("feature_flags") + } + + return [create_feature_flag_link(flag) for flag in feature_flags] + + +def create_api_function_link(language: str, function_key: str) -> Optional[str]: + """Create an API link in markdown with an icon of the YAML file + + Args: + language (str): programming language + function_key (str): Key to the specific function + + Returns: + str: If the function is found than the link else None + """ + info = API_REFERENCE_LINKS.get(language, {}).get(function_key) + + if info is None: + logging.warning(f"Could not find {function_key} for language {language}") + return None + else: + # Either be a direct link + if type(info) == str: + return f"[:material-api: `{function_key}`]({info})" + else: + function_name = info["name"] + link = info["link"] + return f"[:material-api: `{function_name}`]({link})" + + +def code_tab( + base_path: str, + section: Optional[str], + language_info: dict, + api_functions: List[str], +) -> str: + """Generate a single tab for the code block corresponding to a specific language. + It gets the code at base_path and possible section and pretty prints markdown for it + + Args: + base_path (str): path where the code is located + section (str, optional): section in the code that should be displayed + language_info (dict): Language specific information (icon name, display name, ...) 
+ api_functions (List[str]): List of api functions which should be linked + + Returns: + str: A markdown formatted string represented a single tab + """ + language = language_info["code_name"] + + # Create feature flags + feature_flags_links = create_feature_flag_links(language, api_functions) + + # Create API Links if they are defined in the YAML + api_functions = [ + link for f in api_functions if (link := create_api_function_link(language, f)) + ] + language_headers = " ·".join(api_functions + feature_flags_links) + + # Create path for Snippets extension + snippets_file_name = f"{base_path}:{section}" if section else f"{base_path}" + + # See Content Tabs for details https://squidfunk.github.io/mkdocs-material/reference/content-tabs/ + return f"""=== \":fontawesome-brands-{language_info['icon_name']}: {language_info['display_name']}\" + {language_headers} + ```{language} + --8<-- \"{snippets_file_name}\" + ``` + """ + + +def define_env(env): + @env.macro + def code_block( + path: str, section: str = None, api_functions: List[str] = None + ) -> str: + """Dynamically generate a code block for the code located under {language}/path + + Args: + path (str): base_path for each language + section (str, optional): Optional segment within the code file. Defaults to None. + api_functions (List[str], optional): API functions that should be linked. Defaults to None. + Returns: + str: Markdown tabbed code block with possible links to api functions and feature flags + """ + result = [] + + for language, info in LANGUAGES.items(): + base_path = f"{language}/{path}{info['extension']}" + full_path = "docs/src/" + base_path + # Check if file exists for the language + if os.path.exists(full_path): + result.append(code_tab(base_path, section, info, api_functions)) + + return "\n".join(result) diff --git a/docs/_build/scripts/people.py b/docs/_build/scripts/people.py new file mode 100644 index 000000000000..81ba1982f132 --- /dev/null +++ b/docs/_build/scripts/people.py @@ -0,0 +1,38 @@ +import itertools +from github import Github + +g = Github(None) + +ICON_TEMPLATE = "[![{login}]({avatar_url}){{.contributor_icon}}]({html_url})" + + +def get_people_md(): + repo = g.get_repo("pola-rs/polars") + contributors = repo.get_contributors() + with open("./docs/people.md", "w") as f: + for c in itertools.islice(contributors, 50): + # We love dependabot, but he doesn't need a spot on our website + if c.login == "dependabot[bot]": + continue + + f.write( + ICON_TEMPLATE.format( + login=c.login, + avatar_url=c.avatar_url, + html_url=c.html_url, + ) + + "\n" + ) + + +def on_startup(command, dirty): + """Mkdocs hook to autogenerate docs/people.md on startup""" + try: + get_people_md() + except Exception as e: + msg = f"WARNING:{__file__}: Could not generate docs/people.md. Got error: {str(e)}" + print(msg) + + +if __name__ == "__main__": + get_people_md() diff --git a/docs/_build/snippets/under_construction.md b/docs/_build/snippets/under_construction.md new file mode 100644 index 000000000000..c4bb56a735af --- /dev/null +++ b/docs/_build/snippets/under_construction.md @@ -0,0 +1,4 @@ +!!! warning ":construction: Under Construction :construction: " + + This section is still under development. Want to help out? Consider contributing and making a [pull request](https://github.com/pola-rs/polars) to our repository. + Please read our [Contribution Guidelines](https://github.com/pola-rs/polars/blob/main/CONTRIBUTING.md) on how to proceed. 
diff --git a/docs/data/apple_stock.csv b/docs/data/apple_stock.csv new file mode 100644 index 000000000000..6c3f9752d587 --- /dev/null +++ b/docs/data/apple_stock.csv @@ -0,0 +1,101 @@ +Date,Close +1981-02-23,24.62 +1981-05-06,27.38 +1981-05-18,28.0 +1981-09-25,14.25 +1982-07-08,11.0 +1983-01-03,28.5 +1983-04-06,40.0 +1983-10-03,23.13 +1984-07-27,27.13 +1984-08-17,27.5 +1984-08-24,28.12 +1985-05-07,20.0 +1985-09-03,14.75 +1985-12-06,19.75 +1986-03-12,24.75 +1986-04-09,27.13 +1986-04-17,29.0 +1986-09-17,34.25 +1986-11-26,40.5 +1987-02-25,69.13 +1987-04-15,71.0 +1988-02-23,42.75 +1988-03-07,46.88 +1988-03-23,42.5 +1988-12-12,38.5 +1988-12-19,40.75 +1989-04-17,39.25 +1989-11-13,46.5 +1990-11-23,36.38 +1991-03-22,63.25 +1991-05-17,47.0 +1991-06-03,49.25 +1991-06-18,42.12 +1992-06-25,45.62 +1992-10-12,44.0 +1993-07-06,37.75 +1993-09-15,24.5 +1993-09-30,23.37 +1993-11-09,30.12 +1994-01-24,35.0 +1994-03-15,37.62 +1994-06-27,26.25 +1994-07-08,27.06 +1994-12-21,38.38 +1995-07-06,47.0 +1995-10-16,36.13 +1995-11-17,40.13 +1995-12-12,38.0 +1996-01-31,27.63 +1996-02-05,29.25 +1996-07-15,17.19 +1996-09-20,22.87 +1996-12-23,23.25 +1997-03-17,16.5 +1997-05-09,17.06 +1997-08-06,26.31 +1997-09-30,21.69 +1998-02-09,19.19 +1998-03-12,27.0 +1998-05-07,30.19 +1998-05-12,30.12 +1999-07-09,55.63 +1999-12-08,110.06 +2000-01-14,100.44 +2000-06-27,51.75 +2000-07-05,51.62 +2000-07-19,52.69 +2000-08-07,47.94 +2000-08-28,58.06 +2000-09-26,51.44 +2001-03-02,19.25 +2001-12-10,22.54 +2002-01-25,23.25 +2002-03-07,24.38 +2002-08-16,15.81 +2002-10-03,14.3 +2003-11-18,20.41 +2004-02-26,23.04 +2004-03-08,26.0 +2004-09-22,36.92 +2005-06-24,37.76 +2005-12-07,73.95 +2005-12-22,74.02 +2006-06-22,59.58 +2006-11-28,91.81 +2007-08-13,127.79 +2007-12-04,179.81 +2007-12-31,198.08 +2008-05-09,183.45 +2008-06-27,170.09 +2009-08-03,166.43 +2010-04-01,235.97 +2010-12-10,320.56 +2011-04-28,346.75 +2011-12-02,389.7 +2012-05-16,546.08 +2012-12-04,575.85 +2013-07-05,417.42 +2013-11-07,512.49 +2014-02-25,522.06 \ No newline at end of file diff --git a/docs/data/iris.csv b/docs/data/iris.csv new file mode 100644 index 000000000000..d6b466b31892 --- /dev/null +++ b/docs/data/iris.csv @@ -0,0 +1,151 @@ +sepal_length,sepal_width,petal_length,petal_width,species +5.1,3.5,1.4,.2,Setosa +4.9,3,1.4,.2,Setosa +4.7,3.2,1.3,.2,Setosa +4.6,3.1,1.5,.2,Setosa +5,3.6,1.4,.2,Setosa +5.4,3.9,1.7,.4,Setosa +4.6,3.4,1.4,.3,Setosa +5,3.4,1.5,.2,Setosa +4.4,2.9,1.4,.2,Setosa +4.9,3.1,1.5,.1,Setosa +5.4,3.7,1.5,.2,Setosa +4.8,3.4,1.6,.2,Setosa +4.8,3,1.4,.1,Setosa +4.3,3,1.1,.1,Setosa +5.8,4,1.2,.2,Setosa +5.7,4.4,1.5,.4,Setosa +5.4,3.9,1.3,.4,Setosa +5.1,3.5,1.4,.3,Setosa +5.7,3.8,1.7,.3,Setosa +5.1,3.8,1.5,.3,Setosa +5.4,3.4,1.7,.2,Setosa +5.1,3.7,1.5,.4,Setosa +4.6,3.6,1,.2,Setosa +5.1,3.3,1.7,.5,Setosa +4.8,3.4,1.9,.2,Setosa +5,3,1.6,.2,Setosa +5,3.4,1.6,.4,Setosa +5.2,3.5,1.5,.2,Setosa +5.2,3.4,1.4,.2,Setosa +4.7,3.2,1.6,.2,Setosa +4.8,3.1,1.6,.2,Setosa +5.4,3.4,1.5,.4,Setosa +5.2,4.1,1.5,.1,Setosa +5.5,4.2,1.4,.2,Setosa +4.9,3.1,1.5,.2,Setosa +5,3.2,1.2,.2,Setosa +5.5,3.5,1.3,.2,Setosa +4.9,3.6,1.4,.1,Setosa +4.4,3,1.3,.2,Setosa +5.1,3.4,1.5,.2,Setosa +5,3.5,1.3,.3,Setosa +4.5,2.3,1.3,.3,Setosa +4.4,3.2,1.3,.2,Setosa +5,3.5,1.6,.6,Setosa +5.1,3.8,1.9,.4,Setosa +4.8,3,1.4,.3,Setosa +5.1,3.8,1.6,.2,Setosa +4.6,3.2,1.4,.2,Setosa +5.3,3.7,1.5,.2,Setosa +5,3.3,1.4,.2,Setosa +7,3.2,4.7,1.4,Versicolor +6.4,3.2,4.5,1.5,Versicolor +6.9,3.1,4.9,1.5,Versicolor +5.5,2.3,4,1.3,Versicolor +6.5,2.8,4.6,1.5,Versicolor +5.7,2.8,4.5,1.3,Versicolor +6.3,3.3,4.7,1.6,Versicolor 
+4.9,2.4,3.3,1,Versicolor +6.6,2.9,4.6,1.3,Versicolor +5.2,2.7,3.9,1.4,Versicolor +5,2,3.5,1,Versicolor +5.9,3,4.2,1.5,Versicolor +6,2.2,4,1,Versicolor +6.1,2.9,4.7,1.4,Versicolor +5.6,2.9,3.6,1.3,Versicolor +6.7,3.1,4.4,1.4,Versicolor +5.6,3,4.5,1.5,Versicolor +5.8,2.7,4.1,1,Versicolor +6.2,2.2,4.5,1.5,Versicolor +5.6,2.5,3.9,1.1,Versicolor +5.9,3.2,4.8,1.8,Versicolor +6.1,2.8,4,1.3,Versicolor +6.3,2.5,4.9,1.5,Versicolor +6.1,2.8,4.7,1.2,Versicolor +6.4,2.9,4.3,1.3,Versicolor +6.6,3,4.4,1.4,Versicolor +6.8,2.8,4.8,1.4,Versicolor +6.7,3,5,1.7,Versicolor +6,2.9,4.5,1.5,Versicolor +5.7,2.6,3.5,1,Versicolor +5.5,2.4,3.8,1.1,Versicolor +5.5,2.4,3.7,1,Versicolor +5.8,2.7,3.9,1.2,Versicolor +6,2.7,5.1,1.6,Versicolor +5.4,3,4.5,1.5,Versicolor +6,3.4,4.5,1.6,Versicolor +6.7,3.1,4.7,1.5,Versicolor +6.3,2.3,4.4,1.3,Versicolor +5.6,3,4.1,1.3,Versicolor +5.5,2.5,4,1.3,Versicolor +5.5,2.6,4.4,1.2,Versicolor +6.1,3,4.6,1.4,Versicolor +5.8,2.6,4,1.2,Versicolor +5,2.3,3.3,1,Versicolor +5.6,2.7,4.2,1.3,Versicolor +5.7,3,4.2,1.2,Versicolor +5.7,2.9,4.2,1.3,Versicolor +6.2,2.9,4.3,1.3,Versicolor +5.1,2.5,3,1.1,Versicolor +5.7,2.8,4.1,1.3,Versicolor +6.3,3.3,6,2.5,Virginica +5.8,2.7,5.1,1.9,Virginica +7.1,3,5.9,2.1,Virginica +6.3,2.9,5.6,1.8,Virginica +6.5,3,5.8,2.2,Virginica +7.6,3,6.6,2.1,Virginica +4.9,2.5,4.5,1.7,Virginica +7.3,2.9,6.3,1.8,Virginica +6.7,2.5,5.8,1.8,Virginica +7.2,3.6,6.1,2.5,Virginica +6.5,3.2,5.1,2,Virginica +6.4,2.7,5.3,1.9,Virginica +6.8,3,5.5,2.1,Virginica +5.7,2.5,5,2,Virginica +5.8,2.8,5.1,2.4,Virginica +6.4,3.2,5.3,2.3,Virginica +6.5,3,5.5,1.8,Virginica +7.7,3.8,6.7,2.2,Virginica +7.7,2.6,6.9,2.3,Virginica +6,2.2,5,1.5,Virginica +6.9,3.2,5.7,2.3,Virginica +5.6,2.8,4.9,2,Virginica +7.7,2.8,6.7,2,Virginica +6.3,2.7,4.9,1.8,Virginica +6.7,3.3,5.7,2.1,Virginica +7.2,3.2,6,1.8,Virginica +6.2,2.8,4.8,1.8,Virginica +6.1,3,4.9,1.8,Virginica +6.4,2.8,5.6,2.1,Virginica +7.2,3,5.8,1.6,Virginica +7.4,2.8,6.1,1.9,Virginica +7.9,3.8,6.4,2,Virginica +6.4,2.8,5.6,2.2,Virginica +6.3,2.8,5.1,1.5,Virginica +6.1,2.6,5.6,1.4,Virginica +7.7,3,6.1,2.3,Virginica +6.3,3.4,5.6,2.4,Virginica +6.4,3.1,5.5,1.8,Virginica +6,3,4.8,1.8,Virginica +6.9,3.1,5.4,2.1,Virginica +6.7,3.1,5.6,2.4,Virginica +6.9,3.1,5.1,2.3,Virginica +5.8,2.7,5.1,1.9,Virginica +6.8,3.2,5.9,2.3,Virginica +6.7,3.3,5.7,2.5,Virginica +6.7,3,5.2,2.3,Virginica +6.3,2.5,5,1.9,Virginica +6.5,3,5.2,2,Virginica +6.2,3.4,5.4,2.3,Virginica +5.9,3,5.1,1.8,Virginica \ No newline at end of file diff --git a/docs/data/reddit.csv b/docs/data/reddit.csv new file mode 100644 index 000000000000..88f91e3df7db --- /dev/null +++ b/docs/data/reddit.csv @@ -0,0 +1,100 @@ +id,name,created_utc,updated_on,comment_karma,link_karma +1,truman48lamb_jasonbroken,1397113470,1536527864,0,0 +2,johnethen06_jasonbroken,1397113483,1536527864,0,0 +3,yaseinrez_jasonbroken,1397113483,1536527864,0,1 +4,Valve92_jasonbroken,1397113503,1536527864,0,0 +5,srbhuyan_jasonbroken,1397113506,1536527864,0,0 +6,taojianlong_jasonbroken,1397113510,1536527864,4,0 +7,YourPalGrant92_jasonbroken,1397113513,1536527864,0,0 +8,Lucki87_jasonbroken,1397113515,1536527864,0,0 +9,punkstock_jasonbroken,1397113517,1536527864,0,0 +10,duder_con_chile_jasonbroken,1397113519,1536527864,0,2 +11,IHaveBigBalls_jasonbroken,1397113520,1536527864,0,0 +12,Foggybanana_jasonbroken,1397113523,1536527864,0,0 +13,Thedrinkdriver_jasonbroken,1397113527,1536527864,-9,0 +14,littlemissd_jasonbroken,1397113530,1536527864,0,-3 +15,phonethaway_jasonbroken,1397113537,1536527864,0,0 
+16,DreamingOfWinterfell_jasonbroken,1397113538,1536527864,0,0 +17,ssaig_jasonbroken,1397113544,1536527864,1,0 +18,divinetribe_jasonbroken,1397113549,1536527864,0,0 +19,fdbvfdssdgfds_jasonbroken,1397113552,1536527864,3,0 +20,hjtrsh54yh43_jasonbroken,1397113559,1536527864,-1,-1 +21,Dalin86_jasonbroken,1397113561,1536527864,0,0 +22,sgalex_jasonbroken,1397113561,1536527864,0,0 +23,beszhthw_jasonbroken,1397113566,1536527864,0,0 +24,WojkeN_jasonbroken,1397113572,1536527864,-8,0 +25,LixksHD_jasonbroken,1397113572,1536527864,0,0 +26,bradhrvf78_jasonbroken,1397113574,1536527864,0,0 +27,ravenfeathers_jasonbroken,1397113576,1536527864,0,0 +28,jayne101_jasonbroken,1397113583,1536527864,0,0 +29,jdennis6701_jasonbroken,1397113585,1536527864,0,0 +30,Puppy243_jasonbroken,1397113592,1536527864,0,0 +31,sissyt_jasonbroken,1397113609,1536527864,0,0 +32,fengye78_jasonbroken,1397113613,1536527864,0,0 +33,bigspender1988_jasonbroken,1397113614,1536527864,0,21 +34,bitdownworld_jasonbroken,1397113618,1536527864,0,0 +35,adhyufsdtha12_jasonbroken,1397113619,1536527864,0,0 +36,Haydenac_jasonbroken,1397113635,1536527864,0,0 +37,ihatewhoweare_jasonbroken,1397113636,1536527864,61,0 +38,HungDaddy69__jasonbroken,1397113641,1536527864,0,0 +39,FSUJohnny24_jasonbroken,1397113646,1536527864,0,0 +40,Toejimon_jasonbroken,1397113650,1536527864,0,0 +41,mine69flesh_jasonbroken,1397113651,1536527864,0,0 +42,brycentkt_jasonbroken,1397113653,1536527864,0,0 +43,hmmmitsbig,1397113655,1536527864,0,0 +77714,hockeyschtick,1137474000,1536497404,11104,451 +77715,kbmunkholm,1137474000,1536528267,0,0 +77716,dickb,1137588452,1536528267,0,0 +77717,stephenjcole,1137474000,1536528267,0,2 +77718,rosetree,1137474000,1536528267,0,0 +77719,benhawK,1138180921,1536528267,0,0 +77720,joenowak,1137474000,1536528268,0,0 +77721,constant,1137474000,1536528268,1,0 +77722,jpscott,1137474000,1536528268,0,1 +77723,meryn,1137474000,1536528268,0,2 +77724,momerath,1128916800,1536528268,2490,101 +77725,inuse,1137474000,1536528269,0,0 +77726,dubert11,1137474000,1536528269,38,59 +77727,CaliMark,1137474000,1536528269,0,0 +77728,Maniac,1137474000,1536528269,0,0 +77729,earlpearl,1137474000,1536528269,0,0 +77730,ghost,1137474000,1536497404,767,0 +77731,paulzg,1137474000,1536528270,0,0 +77732,rshawgo,1137474000,1536497404,707,6883 +77733,spage,1137474000,1536528270,0,0 +77734,HrothgarReborn,1137474000,1536528270,0,0 +77735,darknessvisible,1137474000,1536528270,26133,139 +77736,finleyt,1137714898,1536528270,0,0 +77737,Dalton,1137474000,1536528271,118,2 +77738,graemes,1137474000,1536528271,0,0 +77739,lettuce,1137780958,1536497404,4546,724 +77740,mudkicker,1137474000,1536528271,0,0 +77741,mydignet,1139649149,1536528271,0,0 +77742,markbo,1137474000,1536528271,0,0 +77743,mrfrostee,1137474000,1536528272,227,43 +77744,parappayo,1136350800,1536528272,53,164 +77745,danastasi,1137474000,1536528272,2335,146 +77747,AltherrWeb,1137474000,1536528272,1387,1605 +77748,dtpetty,1137474000,1536528273,0,0 +77749,jamesluke4,1137474000,1536528273,0,0 +77750,sankeld,1137474000,1536528273,9,45 +77751,iampivot,1139479524,1536497404,2640,31 +77752,mcaamano,1137474000,1536528273,0,0 +77753,wonsungi,1137596632,1536528273,0,0 +77754,naotakem,1137474000,1536528274,0,0 +77755,bis,1137474000,1536497404,2191,285 +77756,imeinzen,1137474000,1536528274,0,0 +77757,zrenneh,1137474000,1536528274,79,0 +77758,onclephilippe,1137474000,1536528274,0,0 +77759,Mokzaio415,1139422169,1536528274,0,0 +77761,-brisse,1137474000,1536528275,14,1 +77762,coolin86,1138303196,1536528275,40,7 
+77763,Lunchy,1137599510,1536528275,65,0 +77764,jannemans,1137474000,1536528275,0,0 +77765,compostellas,1137474000,1536528276,6,0 +77766,genericbob,1137474000,1536528276,291,14 +77767,domlexch,1139482978,1536528276,0,0 +77768,TinheadNed,1139665457,1536497404,4434,103 +77769,patopurifik,1137474000,1536528276,0,0 +77770,PoPPo,1139057558,1536528276,0,0 +77771,tandrews,1137474000,1536528277,0,0 diff --git a/docs/getting-started/expressions.md b/docs/getting-started/expressions.md new file mode 100644 index 000000000000..692806d75de9 --- /dev/null +++ b/docs/getting-started/expressions.md @@ -0,0 +1,130 @@ +# Expressions + +`Expressions` are the core strength of `Polars`. The `expressions` offer a versatile structure that both solves easy queries and is easily extended to complex ones. Below we will cover the basic components that serve as building block (or in `Polars` terminology contexts) for all your queries: + +- `select` +- `filter` +- `with_columns` +- `group_by` + +To learn more about expressions and the context in which they operate, see the User Guide sections: [Contexts](../user-guide/concepts/contexts.md) and [Expressions](../user-guide/concepts/expressions.md). + +### Select statement + +To select a column we need to do two things. Define the `DataFrame` we want the data from. And second, select the data that we need. In the example below you see that we select `col('*')`. The asterisk stands for all columns. + +{{code_block('getting-started/expressions','select',['select'])}} + +```python exec="on" result="text" session="getting-started/expressions" +--8<-- "python/getting-started/expressions.py:setup" +print( + --8<-- "python/getting-started/expressions.py:select" +) +``` + +You can also specify the specific columns that you want to return. There are two ways to do this. The first option is to create a `list` of column names, as seen below. + +{{code_block('getting-started/expressions','select2',['select'])}} + +```python exec="on" result="text" session="getting-started/expressions" +print( + --8<-- "python/getting-started/expressions.py:select2" +) +``` + +The second option is to specify each column within a `list` in the `select` statement. This option is shown below. + +{{code_block('getting-started/expressions','select3',['select'])}} + +```python exec="on" result="text" session="getting-started/expressions" +print( + --8<-- "python/getting-started/expressions.py:select3" +) +``` + +If you want to exclude an entire column from your view, you can simply use `exclude` in your `select` statement. + +{{code_block('getting-started/expressions','exclude',['select'])}} + +```python exec="on" result="text" session="getting-started/expressions" +print( + --8<-- "python/getting-started/expressions.py:exclude" +) +``` + +### Filter + +The `filter` option allows us to create a subset of the `DataFrame`. We use the same `DataFrame` as earlier and we filter between two specified dates. + +{{code_block('getting-started/expressions','filter',['filter'])}} + +```python exec="on" result="text" session="getting-started/expressions" +print( + --8<-- "python/getting-started/expressions.py:filter" +) +``` + +With `filter` you can also create more complex filters that include multiple columns. + +{{code_block('getting-started/expressions','filter2',['filter'])}} + +```python exec="on" result="text" session="getting-started/expressions" +print( + --8<-- "python/getting-started/expressions.py:filter2" +) +``` + +### With_columns + +`with_columns` allows you to create new columns for your analyses. 
We create two new columns `e` and `b+42`. First we sum all values from column `b` and store the results in column `e`. After that we add `42` to the values of `b`. Creating a new column `b+42` to store these results. + +{{code_block('getting-started/expressions','with_columns',['with_columns'])}} + +```python exec="on" result="text" session="getting-started/expressions" +print( + --8<-- "python/getting-started/expressions.py:with_columns" +) +``` + +### Group by + +We will create a new `DataFrame` for the Group by functionality. This new `DataFrame` will include several 'groups' that we want to group by. + +{{code_block('getting-started/expressions','dataframe2',['DataFrame'])}} + +```python exec="on" result="text" session="getting-started/expressions" +--8<-- "python/getting-started/expressions.py:dataframe2" +print(df2) +``` + +{{code_block('getting-started/expressions','group_by',['group_by'])}} + +```python exec="on" result="text" session="getting-started/expressions" +print( + --8<-- "python/getting-started/expressions.py:group_by" +) +``` + +{{code_block('getting-started/expressions','group_by2',['group_by'])}} + +```python exec="on" result="text" session="getting-started/expressions" +print( + --8<-- "python/getting-started/expressions.py:group_by2" +) +``` + +### Combining operations + +Below are some examples on how to combine operations to create the `DataFrame` you require. + +{{code_block('getting-started/expressions','combine',['select','with_columns'])}} + +```python exec="on" result="text" session="getting-started/expressions" +--8<-- "python/getting-started/expressions.py:combine" +``` + +{{code_block('getting-started/expressions','combine2',['select','with_columns'])}} + +```python exec="on" result="text" session="getting-started/expressions" +--8<-- "python/getting-started/expressions.py:combine2" +``` diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 000000000000..b8b8d18441e6 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,31 @@ +# Installation + +Polars is a library and installation is as simple as invoking the package manager of the corresponding programming language. + +=== ":fontawesome-brands-python: Python" + + ``` bash + pip install polars + ``` + +=== ":fontawesome-brands-rust: Rust" + + ``` shell + cargo add polars + ``` + +## Importing + +To use the library import it into your project + +=== ":fontawesome-brands-python: Python" + + ``` python + import polars as pl + ``` + +=== ":fontawesome-brands-rust: Rust" + + ``` rust + use polars::prelude::*; + ``` diff --git a/docs/getting-started/intro.md b/docs/getting-started/intro.md new file mode 100644 index 000000000000..81d4ac110efc --- /dev/null +++ b/docs/getting-started/intro.md @@ -0,0 +1,16 @@ +# Introduction + +This getting started guide is written for new users of Polars. The goal is to provide a quick overview of the most common functionality. For a more detailed explanation, please go to the [User Guide](../user-guide/index.md) + +!!! rust "Rust Users Only" + + Due to historical reasons the eager API in Rust is outdated. In the future we would like to redesign it as a small wrapper around the lazy API (as is the design in Python / NodeJS). In the examples we will use the lazy API instead with `.lazy()` and `.collect()`. For now you can ignore these two functions. If you want to know more about the lazy and eager API go [here](../user-guide/concepts/lazy-vs-eager.md). 
+ + To enable the Lazy API ensure you have the feature flag `lazy` configured when installing Polars + ``` + # Cargo.toml + [dependencies] + polars = { version = "x", features = ["lazy", ...]} + ``` + + Because of the ownership ruling in Rust we can not reuse the same `DataFrame` multiple times in the examples. For simplicity reasons we call `clone()` to overcome this issue. Note that this does not duplicate the data but just increments a pointer (`Arc`). diff --git a/docs/getting-started/joins.md b/docs/getting-started/joins.md new file mode 100644 index 000000000000..42d875d79144 --- /dev/null +++ b/docs/getting-started/joins.md @@ -0,0 +1,26 @@ +# Combining DataFrames + +There are two ways `DataFrame`s can be combined depending on the use case: join and concat. + +## Join + +Polars supports all types of join (e.g. left, right, inner, outer). Let's have a closer look on how to `join` two `DataFrames` into a single `DataFrame`. Our two `DataFrames` both have an 'id'-like column: `a` and `x`. We can use those columns to `join` the `DataFrames` in this example. + +{{code_block('getting-started/joins','join',['join'])}} + +```python exec="on" result="text" session="getting-started/joins" +--8<-- "python/getting-started/joins.py:setup" +--8<-- "python/getting-started/joins.py:join" +``` + +To see more examples with other types of joins, go the [User Guide](../user-guide/transformations/joins.md). + +## Concat + +We can also `concatenate` two `DataFrames`. Vertical concatenation will make the `DataFrame` longer. Horizontal concatenation will make the `DataFrame` wider. Below you can see the result of an horizontal concatenation of our two `DataFrames`. + +{{code_block('getting-started/joins','hstack',['hstack'])}} + +```python exec="on" result="text" session="getting-started/joins" +--8<-- "python/getting-started/joins.py:hstack" +``` diff --git a/docs/getting-started/reading-writing.md b/docs/getting-started/reading-writing.md new file mode 100644 index 000000000000..ad91be50f0f6 --- /dev/null +++ b/docs/getting-started/reading-writing.md @@ -0,0 +1,45 @@ +# Reading & writing + +Polars supports reading and writing to all common files (e.g. csv, json, parquet), cloud storage (S3, Azure Blob, BigQuery) and databases (e.g. postgres, mysql). In the following examples we will show how to operate on most common file formats. For the following dataframe + +{{code_block('getting-started/reading-writing','dataframe',['DataFrame'])}} + +```python exec="on" result="text" session="getting-started/reading" +--8<-- "python/getting-started/reading-writing.py:dataframe" +``` + +#### CSV + +Polars has its own fast implementation for csv reading with many flexible configuration options. + +{{code_block('getting-started/reading-writing','csv',['read_csv','write_csv'])}} + +```python exec="on" result="text" session="getting-started/reading" +--8<-- "python/getting-started/reading-writing.py:csv" +``` + +As we can see above, Polars made the datetimes a `string`. We can tell Polars to parse dates, when reading the csv, to ensure the date becomes a datetime. 
The example can be found below: + +{{code_block('getting-started/reading-writing','csv2',['read_csv'])}} + +```python exec="on" result="text" session="getting-started/reading" +--8<-- "python/getting-started/reading-writing.py:csv2" +``` + +#### JSON + +{{code_block('getting-started/reading-writing','json',['read_json','write_json'])}} + +```python exec="on" result="text" session="getting-started/reading" +--8<-- "python/getting-started/reading-writing.py:json" +``` + +#### Parquet + +{{code_block('getting-started/reading-writing','parquet',['read_parquet','write_parquet'])}} + +```python exec="on" result="text" session="getting-started/reading" +--8<-- "python/getting-started/reading-writing.py:parquet" +``` + +To see more examples and other data formats go to the [User Guide](../user-guide/io/csv.md), section IO. diff --git a/docs/getting-started/series-dataframes.md b/docs/getting-started/series-dataframes.md new file mode 100644 index 000000000000..07e05c194b93 --- /dev/null +++ b/docs/getting-started/series-dataframes.md @@ -0,0 +1,102 @@ +# Series & DataFrames + +The core base data structures provided by Polars are `Series` and `DataFrames`. + +## Series + +Series are a 1-dimensional data structure. Within a series all elements have the same data type (e.g. int, string). +The snippet below shows how to create a simple named `Series` object. In a later section of this getting started guide we will learn how to read data from external sources (e.g. files, database), for now lets keep it simple. + +{{code_block('getting-started/series-dataframes','series',['Series'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:series" +``` + +### Methods + +Although it is more common to work directly on a `DataFrame` object, `Series` implement a number of base methods which make it easy to perform transformations. Below are some examples of common operations you might want to perform. Note that these are for illustration purposes and only show a small subset of what is available. + +##### Aggregations + +`Series` out of the box supports all basic aggregations (e.g. min, max, mean, mode, ...). + +{{code_block('getting-started/series-dataframes','minmax',['min','max'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:minmax" +``` + +##### String + +There are a number of methods related to string operations in the `StringNamespace`. These only work on `Series` with the Datatype `Utf8`. + +{{code_block('getting-started/series-dataframes','string',['replace'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:string" +``` + +##### Datetime + +Similar to strings, there is a separate namespace for datetime related operations in the `DateLikeNameSpace`. These only work on `Series`with DataTypes related to dates. + +{{code_block('getting-started/series-dataframes','dt',['day'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:dt" +``` + +## DataFrame + +A `DataFrame` is a 2-dimensional data structure that is backed by a `Series`, and it could be seen as an abstraction of on collection (e.g. list) of `Series`. Operations that can be executed on `DataFrame` are very similar to what is done in a `SQL` like query. You can `GROUP BY`, `JOIN`, `PIVOT`, but also define custom functions. 
In the next pages we will cover how to perform these transformations. + +{{code_block('getting-started/series-dataframes','dataframe',['DataFrame'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:dataframe" +``` + +### Viewing data + +This part focuses on viewing data in a `DataFrame`. We will use the `DataFrame` from the previous example as a starting point. + +#### Head + +The `head` function shows by default the first 5 rows of a `DataFrame`. You can specify the number of rows you want to see (e.g. `df.head(10)`). + +{{code_block('getting-started/series-dataframes','head',['head'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:head" +``` + +#### Tail + +The `tail` function shows the last 5 rows of a `DataFrame`. You can also specify the number of rows you want to see, similar to `head`. + +{{code_block('getting-started/series-dataframes','tail',['tail'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:tail" +``` + +#### Sample + +If you want to get an impression of the data of your `DataFrame`, you can also use `sample`. With `sample` you get an _n_ number of random rows from the `DataFrame`. + +{{code_block('getting-started/series-dataframes','sample',['sample'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:sample" +``` + +#### Describe + +`Describe` returns summary statistics of your `DataFrame`. It will provide several quick statistics if possible. + +{{code_block('getting-started/series-dataframes','describe',['describe'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:describe" +``` diff --git a/docs/images/.gitignore b/docs/images/.gitignore new file mode 100644 index 000000000000..72e8ffc0db8a --- /dev/null +++ b/docs/images/.gitignore @@ -0,0 +1 @@ +* diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000000..2621ba4ee11d --- /dev/null +++ b/docs/index.md @@ -0,0 +1,71 @@ +--- +hide: + - navigation +--- + +# Polars + +![logo](https://raw.githubusercontent.com/pola-rs/polars-static/master/logos/polars_github_logo_rect_dark_name.svg) + +

Blazingly Fast DataFrame Library

+ + +Polars is a highly performant DataFrame library for manipulating structured data. The core is written in Rust, but the library is also available in Python. Its key features are: + +- **Fast**: Polars is written from the ground up, designed close to the machine and without external dependencies. +- **I/O**: First-class support for all common data storage layers: local, cloud storage & databases. +- **Easy to use**: Write your queries the way they were intended. Polars, internally, will determine the most efficient way to execute using its query optimizer. +- **Out of Core**: Polars supports out of core data transformation with its streaming API, allowing you to process your results without requiring all your data to be in memory at the same time. +- **Parallel**: Polars fully utilises the power of your machine by dividing the workload among the available CPU cores without any additional configuration. +- **Vectorized Query Engine**: Polars uses [Apache Arrow](https://arrow.apache.org/), a columnar data format, to process your queries in a vectorized manner. It uses [SIMD](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) to optimize CPU usage. + +## About this guide + +The `Polars` user guide is intended to live alongside the API documentation. Its purpose is to explain to (new) users how to use `Polars` and to provide meaningful examples. The guide is split into two parts: + +- [Getting Started](getting-started/intro.md): A 10-minute helicopter view of the library and its primary function. +- [User Guide](user-guide/index.md): A detailed explanation of how the library is set up and how to use it most effectively. + +If you are looking for details on a specific level / object, it is probably best to go to the API documentation: [Python](https://pola-rs.github.io/polars/py-polars/html/reference/index.html) | [Rust](https://docs.rs/polars/latest/polars/). + +## Performance :rocket: :rocket: + +`Polars` is very fast, and in fact is one of the best-performing solutions available. +See the results in h2oai's [db-benchmark](https://duckdblabs.github.io/db-benchmark/), revived by the DuckDB project. + +`Polars` [TPCH Benchmark results](https://www.pola.rs/benchmarks.html) are now available on the official website. + +## Example + +{{code_block('home/example','example',['scan_csv','filter','group_by','collect'])}} + +## Sponsors + +[](https://www.xomnia.com/)   [](https://www.jetbrains.com) + +## Community + +`Polars` has a very active community with frequent releases (approximately weekly). Below are some of the top contributors to the project: + +--8<-- "docs/people.md" + +## Contribute + +Thanks for taking the time to contribute! We appreciate all contributions, from reporting bugs to implementing new features. If you're unclear on how to proceed, read our [contribution guide](https://github.com/pola-rs/polars/blob/main/CONTRIBUTING.md) or contact us on [discord](https://discord.com/invite/4UfP5cfBE7). + +## License + +This project is licensed under the terms of the MIT license.
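For reviewers who want to see the query behind the `{{code_block('home/example', ...)}}` macro above without opening the source tree, this is the snippet it pulls in, reproduced from `docs/src/python/home/example.py` in this patch; the trailing `print` is added here purely for illustration and is not part of the original file.

```python
import polars as pl

# Build a lazy query: nothing runs until `collect` is called, so Polars can
# optimize the whole plan (e.g. push the filter down into the CSV scan).
q = (
    pl.scan_csv("docs/data/iris.csv")
    .filter(pl.col("sepal_length") > 5)
    .group_by("species")
    .agg(pl.all().sum())
)

# Execute the optimized plan and materialize the result as a DataFrame.
df = q.collect()
print(df)  # added for illustration; the original snippet stops at `collect`
```

On the rendered site the same snippet appears as a tabbed Python/Rust code block generated by `docs/_build/scripts/macro.py`.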
diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000000..2c317b06415b --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,9 @@ +pandas +pyarrow +graphviz +matplotlib + +mkdocs-material==9.2.5 +mkdocs-macros-plugin==1.0.4 +markdown-exec[ansi]==1.6.0 +PyGithub==1.59.1 diff --git a/docs/src/python/getting-started/expressions.py b/docs/src/python/getting-started/expressions.py new file mode 100644 index 000000000000..ea73e0819a90 --- /dev/null +++ b/docs/src/python/getting-started/expressions.py @@ -0,0 +1,91 @@ +# --8<-- [start:setup] +import polars as pl +import numpy as np +from datetime import datetime + +df = pl.DataFrame( + { + "a": np.arange(0, 8), + "b": np.random.rand(8), + "c": [ + datetime(2022, 12, 1), + datetime(2022, 12, 2), + datetime(2022, 12, 3), + datetime(2022, 12, 4), + datetime(2022, 12, 5), + datetime(2022, 12, 6), + datetime(2022, 12, 7), + datetime(2022, 12, 8), + ], + "d": [1, 2.0, np.NaN, np.NaN, 0, -5, -42, None], + } +) +# --8<-- [end:setup] + +# --8<-- [start:select] +df.select(pl.col("*")) +# --8<-- [end:select] + +# --8<-- [start:select2] +df.select(pl.col(["a", "b"])) +# --8<-- [end:select2] + +# --8<-- [start:select3] +df.select([pl.col("a"), pl.col("b")]).limit(3) +# --8<-- [end:select3] + +# --8<-- [start:exclude] +df.select([pl.exclude("a")]) +# --8<-- [end:exclude] + +# --8<-- [start:filter] +df.filter( + pl.col("c").is_between(datetime(2022, 12, 2), datetime(2022, 12, 8)), +) +# --8<-- [end:filter] + +# --8<-- [start:filter2] +df.filter((pl.col("a") <= 3) & (pl.col("d").is_not_nan())) +# --8<-- [end:filter2] + +# --8<-- [start:with_columns] +df.with_columns([pl.col("b").sum().alias("e"), (pl.col("b") + 42).alias("b+42")]) +# --8<-- [end:with_columns] + +# --8<-- [start:dataframe2] +df2 = pl.DataFrame( + { + "x": np.arange(0, 8), + "y": ["A", "A", "A", "B", "B", "C", "X", "X"], + } +) +# --8<-- [end:dataframe2] + +# --8<-- [start:group_by] +df2.group_by("y", maintain_order=True).count() +# --8<-- [end:group_by] + +# --8<-- [start:group_by2] +df2.group_by("y", maintain_order=True).agg( + [ + pl.col("*").count().alias("count"), + pl.col("*").sum().alias("sum"), + ] +) +# --8<-- [end:group_by2] + +# --8<-- [start:combine] +df_x = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select( + [pl.all().exclude(["c", "d"])] +) + +print(df_x) +# --8<-- [end:combine] + +# --8<-- [start:combine2] +df_y = df.with_columns([(pl.col("a") * pl.col("b")).alias("a * b")]).select( + [pl.all().exclude("d")] +) + +print(df_y) +# --8<-- [end:combine2] diff --git a/docs/src/python/getting-started/joins.py b/docs/src/python/getting-started/joins.py new file mode 100644 index 000000000000..e5a52416eef1 --- /dev/null +++ b/docs/src/python/getting-started/joins.py @@ -0,0 +1,29 @@ +# --8<-- [start:setup] +import polars as pl +import numpy as np + +# --8<-- [end:setup] + +# --8<-- [start:join] +df = pl.DataFrame( + { + "a": np.arange(0, 8), + "b": np.random.rand(8), + "d": [1, 2.0, np.NaN, np.NaN, 0, -5, -42, None], + } +) + +df2 = pl.DataFrame( + { + "x": np.arange(0, 8), + "y": ["A", "A", "A", "B", "B", "C", "X", "X"], + } +) +joined = df.join(df2, left_on="a", right_on="x") +print(joined) +# --8<-- [end:join] + +# --8<-- [start:hstack] +stacked = df.hstack(df2) +print(stacked) +# --8<-- [end:hstack] diff --git a/docs/src/python/getting-started/reading-writing.py b/docs/src/python/getting-started/reading-writing.py new file mode 100644 index 000000000000..dc8a54ebd18f --- /dev/null +++ 
b/docs/src/python/getting-started/reading-writing.py @@ -0,0 +1,41 @@ +# --8<-- [start:dataframe] +import polars as pl +from datetime import datetime + +df = pl.DataFrame( + { + "integer": [1, 2, 3], + "date": [ + datetime(2022, 1, 1), + datetime(2022, 1, 2), + datetime(2022, 1, 3), + ], + "float": [4.0, 5.0, 6.0], + } +) + +print(df) +# --8<-- [end:dataframe] + +# --8<-- [start:csv] +df.write_csv("docs/data/output.csv") +df_csv = pl.read_csv("docs/data/output.csv") +print(df_csv) +# --8<-- [end:csv] + +# --8<-- [start:csv2] +df_csv = pl.read_csv("docs/data/output.csv", try_parse_dates=True) +print(df_csv) +# --8<-- [end:csv2] + +# --8<-- [start:json] +df.write_json("docs/data/output.json") +df_json = pl.read_json("docs/data/output.json") +print(df_json) +# --8<-- [end:json] + +# --8<-- [start:parquet] +df.write_parquet("docs/data/output.parquet") +df_parquet = pl.read_parquet("docs/data/output.parquet") +print(df_parquet) +# --8<-- [end:parquet] diff --git a/docs/src/python/getting-started/series-dataframes.py b/docs/src/python/getting-started/series-dataframes.py new file mode 100644 index 000000000000..6f2fdf265c22 --- /dev/null +++ b/docs/src/python/getting-started/series-dataframes.py @@ -0,0 +1,64 @@ +# --8<-- [start:series] +import polars as pl + +s = pl.Series("a", [1, 2, 3, 4, 5]) +print(s) +# --8<-- [end:series] + +# --8<-- [start:minmax] +s = pl.Series("a", [1, 2, 3, 4, 5]) +print(s.min()) +print(s.max()) +# --8<-- [end:minmax] + +# --8<-- [start:string] +s = pl.Series("a", ["polar", "bear", "arctic", "polar fox", "polar bear"]) +s2 = s.str.replace("polar", "pola") +print(s2) +# --8<-- [end:string] + +# --8<-- [start:dt] +from datetime import date + +start = date(2001, 1, 1) +stop = date(2001, 1, 9) +s = pl.date_range(start, stop, interval="2d", eager=True) +s.dt.day() +print(s) +# --8<-- [end:dt] + +# --8<-- [start:dataframe] +from datetime import datetime + +df = pl.DataFrame( + { + "integer": [1, 2, 3, 4, 5], + "date": [ + datetime(2022, 1, 1), + datetime(2022, 1, 2), + datetime(2022, 1, 3), + datetime(2022, 1, 4), + datetime(2022, 1, 5), + ], + "float": [4.0, 5.0, 6.0, 7.0, 8.0], + } +) + +print(df) +# --8<-- [end:dataframe] + +# --8<-- [start:head] +print(df.head(3)) +# --8<-- [end:head] + +# --8<-- [start:tail] +print(df.tail(3)) +# --8<-- [end:tail] + +# --8<-- [start:sample] +print(df.sample(2)) +# --8<-- [end:sample] + +# --8<-- [start:describe] +print(df.describe()) +# --8<-- [end:describe] diff --git a/docs/src/python/home/example.py b/docs/src/python/home/example.py new file mode 100644 index 000000000000..5f675f4e82e4 --- /dev/null +++ b/docs/src/python/home/example.py @@ -0,0 +1,12 @@ +# --8<-- [start:example] +import polars as pl + +q = ( + pl.scan_csv("docs/data/iris.csv") + .filter(pl.col("sepal_length") > 5) + .group_by("species") + .agg(pl.all().sum()) +) + +df = q.collect() +# --8<-- [end:example] diff --git a/docs/src/python/user-guide/concepts/contexts.py b/docs/src/python/user-guide/concepts/contexts.py new file mode 100644 index 000000000000..ea3baf965b52 --- /dev/null +++ b/docs/src/python/user-guide/concepts/contexts.py @@ -0,0 +1,55 @@ +# --8<-- [start:setup] +import polars as pl +import numpy as np + +np.random.seed(12) +# --8<-- [end:setup] + +# --8<-- [start:dataframe] +df = pl.DataFrame( + { + "nrs": [1, 2, 3, None, 5], + "names": ["foo", "ham", "spam", "egg", None], + "random": np.random.rand(5), + "groups": ["A", "A", "B", "C", "B"], + } +) +print(df) +# --8<-- [end:dataframe] + +# --8<-- [start:select] + +out = df.select( + pl.sum("nrs"), + 
pl.col("names").sort(), + pl.col("names").first().alias("first name"), + (pl.mean("nrs") * 10).alias("10xnrs"), +) +print(out) +# --8<-- [end:select] + +# --8<-- [start:filter] +out = df.filter(pl.col("nrs") > 2) +print(out) +# --8<-- [end:filter] + +# --8<-- [start:with_columns] + +df = df.with_columns( + pl.sum("nrs").alias("nrs_sum"), + pl.col("random").count().alias("count"), +) +print(df) +# --8<-- [end:with_columns] + + +# --8<-- [start:group_by] +out = df.group_by("groups").agg( + pl.sum("nrs"), # sum nrs by groups + pl.col("random").count().alias("count"), # count group members + # sum random where name != null + pl.col("random").filter(pl.col("names").is_not_null()).sum().suffix("_sum"), + pl.col("names").reverse().alias("reversed names"), +) +print(out) +# --8<-- [end:group_by] diff --git a/docs/src/python/user-guide/concepts/expressions.py b/docs/src/python/user-guide/concepts/expressions.py new file mode 100644 index 000000000000..83e6c4514c23 --- /dev/null +++ b/docs/src/python/user-guide/concepts/expressions.py @@ -0,0 +1,16 @@ +import polars as pl + +df = pl.DataFrame( + { + "foo": [1, 2, 3, None, 5], + "bar": ["foo", "ham", "spam", "egg", None], + } +) + +# --8<-- [start:example1] +pl.col("foo").sort().head(2) +# --8<-- [end:example1] + +# --8<-- [start:example2] +df.select(pl.col("foo").sort().head(2), pl.col("bar").filter(pl.col("foo") == 1).sum()) +# --8<-- [end:example2] diff --git a/docs/src/python/user-guide/concepts/lazy-vs-eager.py b/docs/src/python/user-guide/concepts/lazy-vs-eager.py new file mode 100644 index 000000000000..1327bac6357a --- /dev/null +++ b/docs/src/python/user-guide/concepts/lazy-vs-eager.py @@ -0,0 +1,20 @@ +import polars as pl + +# --8<-- [start:eager] + +df = pl.read_csv("docs/data/iris.csv") +df_small = df.filter(pl.col("sepal_length") > 5) +df_agg = df_small.group_by("species").agg(pl.col("sepal_width").mean()) +print(df_agg) +# --8<-- [end:eager] + +# --8<-- [start:lazy] +q = ( + pl.scan_csv("docs/data/iris.csv") + .filter(pl.col("sepal_length") > 5) + .group_by("species") + .agg(pl.col("sepal_width").mean()) +) + +df = q.collect() +# --8<-- [end:lazy] diff --git a/docs/src/python/user-guide/concepts/streaming.py b/docs/src/python/user-guide/concepts/streaming.py new file mode 100644 index 000000000000..955750bf6c30 --- /dev/null +++ b/docs/src/python/user-guide/concepts/streaming.py @@ -0,0 +1,12 @@ +import polars as pl + +# --8<-- [start:streaming] +q = ( + pl.scan_csv("docs/data/iris.csv") + .filter(pl.col("sepal_length") > 5) + .group_by("species") + .agg(pl.col("sepal_width").mean()) +) + +df = q.collect(streaming=True) +# --8<-- [end:streaming] diff --git a/docs/src/python/user-guide/expressions/aggregation.py b/docs/src/python/user-guide/expressions/aggregation.py new file mode 100644 index 000000000000..55a986164fbd --- /dev/null +++ b/docs/src/python/user-guide/expressions/aggregation.py @@ -0,0 +1,169 @@ +# --8<-- [start:setup] +import polars as pl +from datetime import date + +# --8<-- [end:setup] + +# --8<-- [start:dataframe] +url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv" + +dtypes = { + "first_name": pl.Categorical, + "gender": pl.Categorical, + "type": pl.Categorical, + "state": pl.Categorical, + "party": pl.Categorical, +} + +dataset = pl.read_csv(url, dtypes=dtypes).with_columns( + pl.col("birthday").str.strptime(pl.Date, strict=False) +) +# --8<-- [end:dataframe] + +# --8<-- [start:basic] +q = ( + dataset.lazy() + .group_by("first_name") + .agg( + pl.count(), + pl.col("gender"), + 
pl.first("last_name"), + ) + .sort("count", descending=True) + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:basic] + +# --8<-- [start:conditional] +q = ( + dataset.lazy() + .group_by("state") + .agg( + (pl.col("party") == "Anti-Administration").sum().alias("anti"), + (pl.col("party") == "Pro-Administration").sum().alias("pro"), + ) + .sort("pro", descending=True) + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:conditional] + +# --8<-- [start:nested] +q = ( + dataset.lazy() + .group_by("state", "party") + .agg(pl.count("party").alias("count")) + .filter( + (pl.col("party") == "Anti-Administration") + | (pl.col("party") == "Pro-Administration") + ) + .sort("count", descending=True) + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:nested] + + +# --8<-- [start:filter] +def compute_age() -> pl.Expr: + return date(2021, 1, 1).year - pl.col("birthday").dt.year() + + +def avg_birthday(gender: str) -> pl.Expr: + return ( + compute_age() + .filter(pl.col("gender") == gender) + .mean() + .alias(f"avg {gender} birthday") + ) + + +q = ( + dataset.lazy() + .group_by("state") + .agg( + avg_birthday("M"), + avg_birthday("F"), + (pl.col("gender") == "M").sum().alias("# male"), + (pl.col("gender") == "F").sum().alias("# female"), + ) + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:filter] + + +# --8<-- [start:sort] +def get_person() -> pl.Expr: + return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") + + +q = ( + dataset.lazy() + .sort("birthday", descending=True) + .group_by("state") + .agg( + get_person().first().alias("youngest"), + get_person().last().alias("oldest"), + ) + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:sort] + + +# --8<-- [start:sort2] +def get_person() -> pl.Expr: + return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") + + +q = ( + dataset.lazy() + .sort("birthday", descending=True) + .group_by("state") + .agg( + get_person().first().alias("youngest"), + get_person().last().alias("oldest"), + get_person().sort().first().alias("alphabetical_first"), + ) + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:sort2] + + +# --8<-- [start:sort3] +def get_person() -> pl.Expr: + return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") + + +q = ( + dataset.lazy() + .sort("birthday", descending=True) + .group_by("state") + .agg( + get_person().first().alias("youngest"), + get_person().last().alias("oldest"), + get_person().sort().first().alias("alphabetical_first"), + pl.col("gender").sort_by("first_name").first().alias("gender"), + ) + .sort("state") + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:sort3] diff --git a/docs/src/python/user-guide/expressions/casting.py b/docs/src/python/user-guide/expressions/casting.py new file mode 100644 index 000000000000..7a57ac13656f --- /dev/null +++ b/docs/src/python/user-guide/expressions/casting.py @@ -0,0 +1,129 @@ +# --8<-- [start:setup] + +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:dfnum] +df = pl.DataFrame( + { + "integers": [1, 2, 3, 4, 5], + "big_integers": [1, 10000002, 3, 10000004, 10000005], + "floats": [4.0, 5.0, 6.0, 7.0, 8.0], + "floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5], + } +) + +print(df) +# --8<-- [end:dfnum] + +# --8<-- [start:castnum] +out = df.select( + pl.col("integers").cast(pl.Float32).alias("integers_as_floats"), + pl.col("floats").cast(pl.Int32).alias("floats_as_integers"), + pl.col("floats_with_decimal") + .cast(pl.Int32) + .alias("floats_with_decimal_as_integers"), +) +print(out) +# --8<-- 
[end:castnum] + + +# --8<-- [start:downcast] +out = df.select( + pl.col("integers").cast(pl.Int16).alias("integers_smallfootprint"), + pl.col("floats").cast(pl.Float32).alias("floats_smallfootprint"), +) +print(out) +# --8<-- [end:downcast] + +# --8<-- [start:overflow] +try: + out = df.select(pl.col("big_integers").cast(pl.Int8)) + print(out) +except Exception as e: + print(e) +# --8<-- [end:overflow] + +# --8<-- [start:overflow2] +out = df.select(pl.col("big_integers").cast(pl.Int8, strict=False)) +print(out) +# --8<-- [end:overflow2] + + +# --8<-- [start:strings] +df = pl.DataFrame( + { + "integers": [1, 2, 3, 4, 5], + "float": [4.0, 5.03, 6.0, 7.0, 8.0], + "floats_as_string": ["4.0", "5.0", "6.0", "7.0", "8.0"], + } +) + +out = df.select( + pl.col("integers").cast(pl.Utf8), + pl.col("float").cast(pl.Utf8), + pl.col("floats_as_string").cast(pl.Float64), +) +print(out) +# --8<-- [end:strings] + + +# --8<-- [start:strings2] +df = pl.DataFrame({"strings_not_float": ["4.0", "not_a_number", "6.0", "7.0", "8.0"]}) +try: + out = df.select(pl.col("strings_not_float").cast(pl.Float64)) + print(out) +except Exception as e: + print(e) +# --8<-- [end:strings2] + +# --8<-- [start:bool] +df = pl.DataFrame( + { + "integers": [-1, 0, 2, 3, 4], + "floats": [0.0, 1.0, 2.0, 3.0, 4.0], + "bools": [True, False, True, False, True], + } +) + +out = df.select(pl.col("integers").cast(pl.Boolean), pl.col("floats").cast(pl.Boolean)) +print(out) +# --8<-- [end:bool] + +# --8<-- [start:dates] +from datetime import date, datetime + +df = pl.DataFrame( + { + "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True), + "datetime": pl.datetime_range( + datetime(2022, 1, 1), datetime(2022, 1, 5), eager=True + ), + } +) + +out = df.select(pl.col("date").cast(pl.Int64), pl.col("datetime").cast(pl.Int64)) +print(out) +# --8<-- [end:dates] + +# --8<-- [start:dates2] +df = pl.DataFrame( + { + "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True), + "string": [ + "2022-01-01", + "2022-01-02", + "2022-01-03", + "2022-01-04", + "2022-01-05", + ], + } +) + +out = df.select( + pl.col("date").dt.strftime("%Y-%m-%d"), + pl.col("string").str.strptime(pl.Datetime, "%Y-%m-%d"), +) +print(out) +# --8<-- [end:dates2] diff --git a/docs/src/python/user-guide/expressions/column-selections.py b/docs/src/python/user-guide/expressions/column-selections.py new file mode 100644 index 000000000000..88951eaee831 --- /dev/null +++ b/docs/src/python/user-guide/expressions/column-selections.py @@ -0,0 +1,91 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:selectors_df] +from datetime import date, datetime + +df = pl.DataFrame( + { + "id": [9, 4, 2], + "place": ["Mars", "Earth", "Saturn"], + "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True), + "sales": [33.4, 2142134.1, 44.7], + "has_people": [False, True, False], + "logged_at": pl.datetime_range( + datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True + ), + } +).with_row_count("rn") +print(df) +# --8<-- [end:selectors_df] + +# --8<-- [start:all] +out = df.select(pl.col("*")) + +# Is equivalent to +out = df.select(pl.all()) +print(out) +# --8<-- [end:all] + +# --8<-- [start:exclude] +out = df.select(pl.col("*").exclude("logged_at", "rn")) +print(out) +# --8<-- [end:exclude] + +# --8<-- [start:expansion_by_names] +out = df.select(pl.col("date", "logged_at").dt.to_string("%Y-%h-%d")) +print(out) +# --8<-- [end:expansion_by_names] + +# --8<-- [start:expansion_by_regex] +out = 
df.select(pl.col("^.*(as|sa).*$")) +print(out) +# --8<-- [end:expansion_by_regex] + +# --8<-- [start:expansion_by_dtype] +out = df.select(pl.col(pl.Int64, pl.UInt32, pl.Boolean).n_unique()) +print(out) +# --8<-- [end:expansion_by_dtype] + +# --8<-- [start:selectors_intro] +import polars.selectors as cs + +out = df.select(cs.integer(), cs.string()) +print(out) +# --8<-- [end:selectors_intro] + +# --8<-- [start:selectors_diff] +out = df.select(cs.numeric() - cs.first()) +print(out) +# --8<-- [end:selectors_diff] + +# --8<-- [start:selectors_union] +out = df.select(cs.by_name("rn") | ~cs.numeric()) +print(out) +# --8<-- [end:selectors_union] + +# --8<-- [start:selectors_by_name] +out = df.select(cs.contains("rn"), cs.matches(".*_.*")) +print(out) +# --8<-- [end:selectors_by_name] + +# --8<-- [start:selectors_to_expr] +out = df.select(cs.temporal().as_expr().dt.to_string("%Y-%h-%d")) +print(out) +# --8<-- [end:selectors_to_expr] + +# --8<-- [start:selectors_is_selector_utility] +from polars.selectors import is_selector + +out = cs.temporal() +print(is_selector(out)) +# --8<-- [end:selectors_is_selector_utility] + +# --8<-- [start:selectors_colnames_utility] +from polars.selectors import expand_selector + +out = cs.temporal().as_expr().dt.to_string("%Y-%h-%d") +print(expand_selector(df, out)) +# --8<-- [end:selectors_colnames_utility] diff --git a/docs/src/python/user-guide/expressions/folds.py b/docs/src/python/user-guide/expressions/folds.py new file mode 100644 index 000000000000..803591b5b581 --- /dev/null +++ b/docs/src/python/user-guide/expressions/folds.py @@ -0,0 +1,50 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:mansum] +df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": [10, 20, 30], + } +) + +out = df.select( + pl.fold(acc=pl.lit(0), function=lambda acc, x: acc + x, exprs=pl.all()).alias( + "sum" + ), +) +print(out) +# --8<-- [end:mansum] + +# --8<-- [start:conditional] +df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": [0, 1, 2], + } +) + +out = df.filter( + pl.fold( + acc=pl.lit(True), + function=lambda acc, x: acc & x, + exprs=pl.col("*") > 1, + ) +) +print(out) +# --8<-- [end:conditional] + +# --8<-- [start:string] +df = pl.DataFrame( + { + "a": ["a", "b", "c"], + "b": [1, 2, 3], + } +) + +out = df.select(pl.concat_str(["a", "b"])) +print(out) +# --8<-- [end:string] diff --git a/docs/src/python/user-guide/expressions/functions.py b/docs/src/python/user-guide/expressions/functions.py new file mode 100644 index 000000000000..5f9bbd5bb1da --- /dev/null +++ b/docs/src/python/user-guide/expressions/functions.py @@ -0,0 +1,60 @@ +# --8<-- [start:setup] + +import polars as pl +import numpy as np + +np.random.seed(12) +# --8<-- [end:setup] + +# --8<-- [start:dataframe] +df = pl.DataFrame( + { + "nrs": [1, 2, 3, None, 5], + "names": ["foo", "ham", "spam", "egg", "spam"], + "random": np.random.rand(5), + "groups": ["A", "A", "B", "C", "B"], + } +) +print(df) +# --8<-- [end:dataframe] + +# --8<-- [start:samename] +df_samename = df.select(pl.col("nrs") + 5) +print(df_samename) +# --8<-- [end:samename] + + +# --8<-- [start:samenametwice] +try: + df_samename2 = df.select(pl.col("nrs") + 5, pl.col("nrs") - 5) + print(df_samename2) +except Exception as e: + print(e) +# --8<-- [end:samenametwice] + +# --8<-- [start:samenamealias] +df_alias = df.select( + (pl.col("nrs") + 5).alias("nrs + 5"), + (pl.col("nrs") - 5).alias("nrs - 5"), +) +print(df_alias) +# --8<-- [end:samenamealias] + +# --8<-- [start:countunique] +df_alias = df.select( + 
pl.col("names").n_unique().alias("unique"), + pl.approx_n_unique("names").alias("unique_approx"), +) +print(df_alias) +# --8<-- [end:countunique] + +# --8<-- [start:conditional] +df_conditional = df.select( + pl.col("nrs"), + pl.when(pl.col("nrs") > 2) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias("conditional"), +) +print(df_conditional) +# --8<-- [end:conditional] diff --git a/docs/src/python/user-guide/expressions/lists.py b/docs/src/python/user-guide/expressions/lists.py new file mode 100644 index 000000000000..d81dac154461 --- /dev/null +++ b/docs/src/python/user-guide/expressions/lists.py @@ -0,0 +1,111 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:weather_df] +weather = pl.DataFrame( + { + "station": ["Station " + str(x) for x in range(1, 6)], + "temperatures": [ + "20 5 5 E1 7 13 19 9 6 20", + "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40", + "19 24 E9 16 6 12 10 22", + "E2 E0 15 7 8 10 E1 24 17 13 6", + "14 8 E0 16 22 24 E1", + ], + } +) +print(weather) +# --8<-- [end:weather_df] + +# --8<-- [start:string_to_list] +out = weather.with_columns(pl.col("temperatures").str.split(" ")) +print(out) +# --8<-- [end:string_to_list] + +# --8<-- [start:explode_to_atomic] +out = weather.with_columns(pl.col("temperatures").str.split(" ")).explode( + "temperatures" +) +print(out) +# --8<-- [end:explode_to_atomic] + +# --8<-- [start:list_ops] +out = weather.with_columns(pl.col("temperatures").str.split(" ")).with_columns( + pl.col("temperatures").list.head(3).alias("top3"), + pl.col("temperatures").list.slice(-3, 3).alias("bottom_3"), + pl.col("temperatures").list.lengths().alias("obs"), +) +print(out) +# --8<-- [end:list_ops] + + +# --8<-- [start:count_errors] +out = weather.with_columns( + pl.col("temperatures") + .str.split(" ") + .list.eval(pl.element().cast(pl.Int64, strict=False).is_null()) + .list.sum() + .alias("errors") +) +print(out) +# --8<-- [end:count_errors] + +# --8<-- [start:count_errors_regex] +out = weather.with_columns( + pl.col("temperatures") + .str.split(" ") + .list.eval(pl.element().str.contains("(?i)[a-z]")) + .list.sum() + .alias("errors") +) +print(out) +# --8<-- [end:count_errors_regex] + +# --8<-- [start:weather_by_day] +weather_by_day = pl.DataFrame( + { + "station": ["Station " + str(x) for x in range(1, 11)], + "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17], + "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13], + "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10], + } +) +print(weather_by_day) +# --8<-- [end:weather_by_day] + +# --8<-- [start:weather_by_day_rank] +rank_pct = (pl.element().rank(descending=True) / pl.col("*").count()).round(2) + +out = weather_by_day.with_columns( + # create the list of homogeneous data + pl.concat_list(pl.all().exclude("station")).alias("all_temps") +).select( + # select all columns except the intermediate list + pl.all().exclude("all_temps"), + # compute the rank by calling `list.eval` + pl.col("all_temps").list.eval(rank_pct, parallel=True).alias("temps_rank"), +) + +print(out) +# --8<-- [end:weather_by_day_rank] + +# --8<-- [start:array_df] +array_df = pl.DataFrame( + [ + pl.Series("Array_1", [[1, 3], [2, 5]]), + pl.Series("Array_2", [[1, 7, 3], [8, 1, 0]]), + ], + schema={"Array_1": pl.Array(2, pl.Int64), "Array_2": pl.Array(3, pl.Int64)}, +) +print(array_df) +# --8<-- [end:array_df] + +# --8<-- [start:array_ops] +out = array_df.select( + pl.col("Array_1").arr.min().suffix("_min"), + pl.col("Array_2").arr.sum().suffix("_sum"), +) +print(out) +# --8<-- [end:array_ops] diff --git 
a/docs/src/python/user-guide/expressions/null.py b/docs/src/python/user-guide/expressions/null.py new file mode 100644 index 000000000000..4641773bbb85 --- /dev/null +++ b/docs/src/python/user-guide/expressions/null.py @@ -0,0 +1,88 @@ +# --8<-- [start:setup] +import polars as pl +import numpy as np + +# --8<-- [end:setup] + +# --8<-- [start:dataframe] +df = pl.DataFrame( + { + "value": [1, None], + }, +) +print(df) +# --8<-- [end:dataframe] + + +# --8<-- [start:count] +null_count_df = df.null_count() +print(null_count_df) +# --8<-- [end:count] + + +# --8<-- [start:isnull] +is_null_series = df.select( + pl.col("value").is_null(), +) +print(is_null_series) +# --8<-- [end:isnull] + + +# --8<-- [start:dataframe2] +df = pl.DataFrame( + { + "col1": [1, 2, 3], + "col2": [1, None, 3], + }, +) +print(df) +# --8<-- [end:dataframe2] + + +# --8<-- [start:fill] +fill_literal_df = ( + df.with_columns( + pl.col("col2").fill_null( + pl.lit(2), + ), + ), +) +print(fill_literal_df) +# --8<-- [end:fill] + +# --8<-- [start:fillstrategy] +fill_forward_df = df.with_columns( + pl.col("col2").fill_null(strategy="forward"), +) +print(fill_forward_df) +# --8<-- [end:fillstrategy] + +# --8<-- [start:fillexpr] +fill_median_df = df.with_columns( + pl.col("col2").fill_null(pl.median("col2")), +) +print(fill_median_df) +# --8<-- [end:fillexpr] + +# --8<-- [start:fillinterpolate] +fill_interpolation_df = df.with_columns( + pl.col("col2").interpolate(), +) +print(fill_interpolation_df) +# --8<-- [end:fillinterpolate] + +# --8<-- [start:nan] +nan_df = pl.DataFrame( + { + "value": [1.0, np.NaN, float("nan"), 3.0], + }, +) +print(nan_df) +# --8<-- [end:nan] + +# --8<-- [start:nanfill] +mean_nan_df = nan_df.with_columns( + pl.col("value").fill_nan(None).alias("value"), +).mean() +print(mean_nan_df) +# --8<-- [end:nanfill] diff --git a/docs/src/python/user-guide/expressions/numpy-example.py b/docs/src/python/user-guide/expressions/numpy-example.py new file mode 100644 index 000000000000..d3300591c4d6 --- /dev/null +++ b/docs/src/python/user-guide/expressions/numpy-example.py @@ -0,0 +1,7 @@ +import polars as pl +import numpy as np + +df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + +out = df.select(np.log(pl.all()).suffix("_log")) +print(out) diff --git a/docs/src/python/user-guide/expressions/operators.py b/docs/src/python/user-guide/expressions/operators.py new file mode 100644 index 000000000000..6f617487c81e --- /dev/null +++ b/docs/src/python/user-guide/expressions/operators.py @@ -0,0 +1,44 @@ +# --8<-- [start:setup] + +import polars as pl +import numpy as np + +np.random.seed(12) +# --8<-- [end:setup] + + +# --8<-- [start:dataframe] +df = pl.DataFrame( + { + "nrs": [1, 2, 3, None, 5], + "names": ["foo", "ham", "spam", "egg", None], + "random": np.random.rand(5), + "groups": ["A", "A", "B", "C", "B"], + } +) +print(df) +# --8<-- [end:dataframe] + +# --8<-- [start:numerical] + +df_numerical = df.select( + (pl.col("nrs") + 5).alias("nrs + 5"), + (pl.col("nrs") - 5).alias("nrs - 5"), + (pl.col("nrs") * pl.col("random")).alias("nrs * random"), + (pl.col("nrs") / pl.col("random")).alias("nrs / random"), +) +print(df_numerical) + +# --8<-- [end:numerical] + +# --8<-- [start:logical] +df_logical = df.select( + (pl.col("nrs") > 1).alias("nrs > 1"), + (pl.col("random") <= 0.5).alias("random < .5"), + (pl.col("nrs") != 1).alias("nrs != 1"), + (pl.col("nrs") == 1).alias("nrs == 1"), + ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"), # and + ((pl.col("random") <= 0.5) | (pl.col("nrs") > 
1)).alias("or_expr"), # or +) +print(df_logical) +# --8<-- [end:logical] diff --git a/docs/src/python/user-guide/expressions/strings.py b/docs/src/python/user-guide/expressions/strings.py new file mode 100644 index 000000000000..9bec188f8930 --- /dev/null +++ b/docs/src/python/user-guide/expressions/strings.py @@ -0,0 +1,61 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + + +# --8<-- [start:df] +df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]}) + +out = df.select( + pl.col("animal").str.lengths().alias("byte_count"), + pl.col("animal").str.n_chars().alias("letter_count"), +) +print(out) +# --8<-- [end:df] + +# --8<-- [start:existence] +out = df.select( + pl.col("animal"), + pl.col("animal").str.contains("cat|bit").alias("regex"), + pl.col("animal").str.contains("rab$", literal=True).alias("literal"), + pl.col("animal").str.starts_with("rab").alias("starts_with"), + pl.col("animal").str.ends_with("dog").alias("ends_with"), +) +print(out) +# --8<-- [end:existence] + +# --8<-- [start:extract] +df = pl.DataFrame( + { + "a": [ + "http://vote.com/ballon_dor?candidate=messi&ref=polars", + "http://vote.com/ballon_dor?candidat=jorginho&ref=polars", + "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars", + ] + } +) +out = df.select( + pl.col("a").str.extract(r"candidate=(\w+)", group_index=1), +) +print(out) +# --8<-- [end:extract] + + +# --8<-- [start:extract_all] +df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]}) +out = df.select( + pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_nrs"), +) +print(out) +# --8<-- [end:extract_all] + + +# --8<-- [start:replace] +df = pl.DataFrame({"id": [1, 2], "text": ["123abc", "abc456"]}) +out = df.with_columns( + pl.col("text").str.replace(r"abc\b", "ABC"), + pl.col("text").str.replace_all("a", "-", literal=True).alias("text_replace_all"), +) +print(out) +# --8<-- [end:replace] diff --git a/docs/src/python/user-guide/expressions/structs.py b/docs/src/python/user-guide/expressions/structs.py new file mode 100644 index 000000000000..f209420a37ab --- /dev/null +++ b/docs/src/python/user-guide/expressions/structs.py @@ -0,0 +1,66 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:ratings_df] +ratings = pl.DataFrame( + { + "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"], + "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"], + "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6], + "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 26], + } +) +print(ratings) +# --8<-- [end:ratings_df] + +# --8<-- [start:state_value_counts] +out = ratings.select(pl.col("Theatre").value_counts(sort=True)) +print(out) +# --8<-- [end:state_value_counts] + +# --8<-- [start:struct_unnest] +out = ratings.select(pl.col("Theatre").value_counts(sort=True)).unnest("Theatre") +print(out) +# --8<-- [end:struct_unnest] + +# --8<-- [start:series_struct] +rating_Series = pl.Series( + "ratings", + [ + {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5}, + {"Movie": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9}, + ], +) +print(rating_Series) +# --8<-- [end:series_struct] + +# --8<-- [start:series_struct_extract] +out = rating_Series.struct.field("Movie") +print(out) +# --8<-- [end:series_struct_extract] + +# --8<-- [start:series_struct_rename] +out = ( + rating_Series.to_frame() + .select(pl.col("ratings").struct.rename_fields(["Film", "State", "Value"])) + .unnest("ratings") +) +print(out) +# --8<-- [end:series_struct_rename] + 
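+# Illustrative sketch (not one of the referenced docs snippets): structs can
+# also be built from existing columns with `pl.struct` and split back into
+# separate columns with `unnest`; the "combined" column name is just an
+# example chosen here.
+out = ratings.select(pl.struct(["Movie", "Theatre"]).alias("combined")).unnest(
+    "combined"
+)
+print(out)
+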
+# --8<-- [start:struct_duplicates] +out = ratings.filter(pl.struct("Movie", "Theatre").is_duplicated()) +print(out) +# --8<-- [end:struct_duplicates] + +# --8<-- [start:struct_ranking] +out = ratings.with_columns( + pl.struct("Count", "Avg_Rating") + .rank("dense", descending=True) + .over("Movie", "Theatre") + .alias("Rank") +).filter(pl.struct("Movie", "Theatre").is_duplicated()) +print(out) +# --8<-- [end:struct_ranking] diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py new file mode 100644 index 000000000000..89fa51420554 --- /dev/null +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -0,0 +1,56 @@ +# --8<-- [start:setup] + +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:dataframe] +df = pl.DataFrame( + { + "keys": ["a", "a", "b"], + "values": [10, 7, 1], + } +) + +out = df.group_by("keys", maintain_order=True).agg( + pl.col("values").map_batches(lambda s: s.shift()).alias("shift_map"), + pl.col("values").shift().alias("shift_expression"), +) +print(df) +# --8<-- [end:dataframe] + + +# --8<-- [start:apply] +out = df.group_by("keys", maintain_order=True).agg( + pl.col("values").map_elements(lambda s: s.shift()).alias("shift_map"), + pl.col("values").shift().alias("shift_expression"), +) +print(out) +# --8<-- [end:apply] + +# --8<-- [start:counter] +counter = 0 + + +def add_counter(val: int) -> int: + global counter + counter += 1 + return counter + val + + +out = df.select( + pl.col("values").map_elements(add_counter).alias("solution_apply"), + (pl.col("values") + pl.int_range(1, pl.count() + 1)).alias("solution_expr"), +) +print(out) +# --8<-- [end:counter] + +# --8<-- [start:combine] +out = df.select( + pl.struct(["keys", "values"]) + .map_elements(lambda x: len(x["keys"]) + x["values"]) + .alias("solution_apply"), + (pl.col("keys").str.lengths() + pl.col("values")).alias("solution_expr"), +) +print(out) +# --8<-- [end:combine] diff --git a/docs/src/python/user-guide/expressions/window.py b/docs/src/python/user-guide/expressions/window.py new file mode 100644 index 000000000000..bd2adda867f5 --- /dev/null +++ b/docs/src/python/user-guide/expressions/window.py @@ -0,0 +1,84 @@ +# --8<-- [start:pokemon] +import polars as pl + +# then let's load some csv data with information about pokemon +df = pl.read_csv( + "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv" +) +print(df.head()) +# --8<-- [end:pokemon] + + +# --8<-- [start:group_by] +out = df.select( + "Type 1", + "Type 2", + pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"), + pl.col("Defense") + .mean() + .over(["Type 1", "Type 2"]) + .alias("avg_defense_by_type_combination"), + pl.col("Attack").mean().alias("avg_attack"), +) +print(out) +# --8<-- [end:group_by] + +# --8<-- [start:operations] +filtered = df.filter(pl.col("Type 2") == "Psychic").select( + "Name", + "Type 1", + "Speed", +) +print(filtered) +# --8<-- [end:operations] + +# --8<-- [start:sort] +out = filtered.with_columns( + pl.col(["Name", "Speed"]).sort_by("Speed", descending=True).over("Type 1"), +) +print(out) +# --8<-- [end:sort] + +# --8<-- [start:rules] +# aggregate and broadcast within a group +# output type: -> Int32 +pl.sum("foo").over("groups") + +# sum within a group and multiply with group elements +# output type: -> Int32 +(pl.col("x").sum() * pl.col("y")).over("groups") + +# sum within a group and multiply with group 
elements +# and aggregate the group to a list +# output type: -> List(Int32) +(pl.col("x").sum() * pl.col("y")).over("groups", mapping_strategy="join") + +# sum within a group and multiply with group elements +# and aggregate the group to a list +# then explode the list to multiple rows + +# This is the fastest method to do things over groups when the groups are sorted +(pl.col("x").sum() * pl.col("y")).over("groups", mapping_strategy="explode") +# --8<-- [end:rules] + +# --8<-- [start:examples] +out = df.sort("Type 1").select( + pl.col("Type 1").head(3).over("Type 1", mapping_strategy="explode"), + pl.col("Name") + .sort_by(pl.col("Speed"), descending=True) + .head(3) + .over("Type 1", mapping_strategy="explode") + .alias("fastest/group"), + pl.col("Name") + .sort_by(pl.col("Attack"), descending=True) + .head(3) + .over("Type 1", mapping_strategy="explode") + .alias("strongest/group"), + pl.col("Name") + .sort() + .head(3) + .over("Type 1", mapping_strategy="explode") + .alias("sorted_by_alphabet"), +) +print(out) +# --8<-- [end:examples] diff --git a/docs/src/python/user-guide/io/aws.py b/docs/src/python/user-guide/io/aws.py new file mode 100644 index 000000000000..c8bfa94941d2 --- /dev/null +++ b/docs/src/python/user-guide/io/aws.py @@ -0,0 +1,14 @@ +""" +# --8<-- [start:bucket] +import polars as pl +import pyarrow.parquet as pq +import s3fs + +fs = s3fs.S3FileSystem() +bucket = "" +path = "" + +dataset = pq.ParquetDataset(f"s3://{bucket}/{path}", filesystem=fs) +df = pl.from_arrow(dataset.read()) +# --8<-- [end:bucket] +""" diff --git a/docs/src/python/user-guide/io/bigquery.py b/docs/src/python/user-guide/io/bigquery.py new file mode 100644 index 000000000000..678ed70200b4 --- /dev/null +++ b/docs/src/python/user-guide/io/bigquery.py @@ -0,0 +1,38 @@ +""" +# --8<-- [start:read] +import polars as pl +from google.cloud import bigquery + +client = bigquery.Client() + +# Perform a query. 
+QUERY = ( + 'SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` ' + 'WHERE state = "TX" ' + 'LIMIT 100') +query_job = client.query(QUERY) # API request +rows = query_job.result() # Waits for query to finish + +df = pl.from_arrow(rows.to_arrow()) +# --8<-- [end:read] + +# --8<-- [start:write] +from google.cloud import bigquery + +client = bigquery.Client() + +# Write dataframe to stream as parquet file; does not hit disk +with io.BytesIO() as stream: + df.write_parquet(stream) + stream.seek(0) + job = client.load_table_from_file( + stream, + destination='tablename', + project='projectname', + job_config=bigquery.LoadJobConfig( + source_format=bigquery.SourceFormat.PARQUET, + ), + ) +job.result() # Waits for the job to complete +# --8<-- [end:write] +""" diff --git a/docs/src/python/user-guide/io/csv.py b/docs/src/python/user-guide/io/csv.py new file mode 100644 index 000000000000..d4039a43ce35 --- /dev/null +++ b/docs/src/python/user-guide/io/csv.py @@ -0,0 +1,19 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +""" +# --8<-- [start:read] +df = pl.read_csv("docs/data/path.csv") +# --8<-- [end:read] +""" + +# --8<-- [start:write] +df = pl.DataFrame({"foo": [1, 2, 3], "bar": [None, "bak", "baz"]}) +df.write_csv("docs/data/path.csv") +# --8<-- [end:write] + +# --8<-- [start:scan] +df = pl.scan_csv("docs/data/path.csv") +# --8<-- [end:scan] diff --git a/docs/src/python/user-guide/io/database.py b/docs/src/python/user-guide/io/database.py new file mode 100644 index 000000000000..97e8f659de73 --- /dev/null +++ b/docs/src/python/user-guide/io/database.py @@ -0,0 +1,32 @@ +""" +# --8<-- [start:read] +import polars as pl + +connection_uri = "postgres://username:password@server:port/database" +query = "SELECT * FROM foo" + +pl.read_database(query=query, connection_uri=connection_uri) +# --8<-- [end:read] + +# --8<-- [start:adbc] +connection_uri = "postgres://username:password@server:port/database" +query = "SELECT * FROM foo" + +pl.read_database(query=query, connection_uri=connection_uri, engine="adbc") +# --8<-- [end:adbc] + +# --8<-- [start:write] +connection_uri = "postgres://username:password@server:port/database" +df = pl.DataFrame({"foo": [1, 2, 3]}) + +df.write_database(table_name="records", connection_uri=connection_uri) +# --8<-- [end:write] + +# --8<-- [start:write_adbc] +connection_uri = "postgres://username:password@server:port/database" +df = pl.DataFrame({"foo": [1, 2, 3]}) + +df.write_database(table_name="records", connection_uri=connection_uri, engine="adbc") +# --8<-- [end:write_adbc] + +""" diff --git a/docs/src/python/user-guide/io/multiple.py b/docs/src/python/user-guide/io/multiple.py new file mode 100644 index 000000000000..f7500b6b6684 --- /dev/null +++ b/docs/src/python/user-guide/io/multiple.py @@ -0,0 +1,41 @@ +# --8<-- [start:create] +import polars as pl + +df = pl.DataFrame({"foo": [1, 2, 3], "bar": [None, "ham", "spam"]}) + +for i in range(5): + df.write_csv(f"docs/data/my_many_files_{i}.csv") +# --8<-- [end:create] + +# --8<-- [start:read] +df = pl.read_csv("docs/data/my_many_files_*.csv") +print(df) +# --8<-- [end:read] + +# --8<-- [start:creategraph] +import base64 + +pl.scan_csv("docs/data/my_many_files_*.csv").show_graph( + output_path="docs/images/multiple.png", show=False +) +with open("docs/images/multiple.png", "rb") as f: + png = base64.b64encode(f.read()).decode() + print(f'') +# --8<-- [end:creategraph] + +# --8<-- [start:graph] +pl.scan_csv("docs/data/my_many_files_*.csv").show_graph() +# --8<-- [end:graph] + +# --8<-- 
[start:glob] +import polars as pl +import glob + +queries = [] +for file in glob.glob("docs/data/my_many_files_*.csv"): + q = pl.scan_csv(file).group_by("bar").agg([pl.count(), pl.sum("foo")]) + queries.append(q) + +dataframes = pl.collect_all(queries) +print(dataframes) +# --8<-- [end:glob] diff --git a/docs/src/python/user-guide/io/parquet.py b/docs/src/python/user-guide/io/parquet.py new file mode 100644 index 000000000000..feba73df9a19 --- /dev/null +++ b/docs/src/python/user-guide/io/parquet.py @@ -0,0 +1,19 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +""" +# --8<-- [start:read] +df = pl.read_parquet("docs/data/path.parquet") +# --8<-- [end:read] +""" + +# --8<-- [start:write] +df = pl.DataFrame({"foo": [1, 2, 3], "bar": [None, "bak", "baz"]}) +df.write_parquet("docs/data/path.parquet") +# --8<-- [end:write] + +# --8<-- [start:scan] +df = pl.scan_parquet("docs/data/path.parquet") +# --8<-- [end:scan] diff --git a/docs/src/python/user-guide/lazy/execution.py b/docs/src/python/user-guide/lazy/execution.py new file mode 100644 index 000000000000..110fb0105500 --- /dev/null +++ b/docs/src/python/user-guide/lazy/execution.py @@ -0,0 +1,36 @@ +import polars as pl + +""" +# --8<-- [start:df] +q1 = ( + pl.scan_csv("docs/data/reddit.csv") + .with_columns(pl.col("name").str.to_uppercase()) + .filter(pl.col("comment_karma") > 0) +) +# --8<-- [end:df] + +# --8<-- [start:collect] +q4 = ( + pl.scan_csv(f"docs/data/reddit.csv") + .with_columns(pl.col("name").str.to_uppercase()) + .filter(pl.col("comment_karma") > 0) + .collect() +) +# --8<-- [end:collect] +# --8<-- [start:stream] +q5 = ( + pl.scan_csv(f"docs/data/reddit.csv") + .with_columns(pl.col("name").str.to_uppercase()) + .filter(pl.col("comment_karma") > 0) + .collect(streaming=True) +) +# --8<-- [end:stream] +# --8<-- [start:partial] +q9 = ( + pl.scan_csv(f"docs/data/reddit.csv") + .with_columns(pl.col("name").str.to_uppercase()) + .filter(pl.col("comment_karma") > 0) + .fetch(n_rows=int(100)) +) +# --8<-- [end:partial] +""" diff --git a/docs/src/python/user-guide/lazy/query_plan.py b/docs/src/python/user-guide/lazy/query_plan.py new file mode 100644 index 000000000000..ed2c3f4bac45 --- /dev/null +++ b/docs/src/python/user-guide/lazy/query_plan.py @@ -0,0 +1,48 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:plan] +q1 = ( + pl.scan_csv("docs/data/reddit.csv") + .with_columns(pl.col("name").str.to_uppercase()) + .filter(pl.col("comment_karma") > 0) +) +# --8<-- [end:plan] + +# --8<-- [start:createplan] +import base64 + +q1.show_graph(optimized=False, show=False, output_path="docs/images/query_plan.png") +with open("docs/images/query_plan.png", "rb") as f: + png = base64.b64encode(f.read()).decode() + print(f'') +# --8<-- [end:createplan] + +""" +# --8<-- [start:showplan] +q1.show_graph(optimized=False) +# --8<-- [end:showplan] +""" + +# --8<-- [start:describe] +q1.explain(optimized=False) +# --8<-- [end:describe] + +# --8<-- [start:createplan2] +q1.show_graph(show=False, output_path="docs/images/query_plan_optimized.png") +with open("docs/images/query_plan_optimized.png", "rb") as f: + png = base64.b64encode(f.read()).decode() + print(f'') +# --8<-- [end:createplan2] + +""" +# --8<-- [start:show] +q1.show_graph() +# --8<-- [end:show] +""" + +# --8<-- [start:optimized] +q1.explain() +# --8<-- [end:optimized] diff --git a/docs/src/python/user-guide/lazy/schema.py b/docs/src/python/user-guide/lazy/schema.py new file mode 100644 index 000000000000..e621718307ee --- /dev/null 
+++ b/docs/src/python/user-guide/lazy/schema.py @@ -0,0 +1,38 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:schema] +q3 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy() + +print(q3.schema) +# --8<-- [end:schema] + +# --8<-- [start:typecheck] +pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy().with_columns( + pl.col("bar").round(0) +) +# --8<-- [end:typecheck] + +# --8<-- [start:lazyeager] +lazy_eager_query = ( + pl.DataFrame( + { + "id": ["a", "b", "c"], + "month": ["jan", "feb", "mar"], + "values": [0, 1, 2], + } + ) + .lazy() + .with_columns((2 * pl.col("values")).alias("double_values")) + .collect() + .pivot( + index="id", columns="month", values="double_values", aggregate_function="first" + ) + .lazy() + .filter(pl.col("mar").is_null()) + .collect() +) +print(lazy_eager_query) +# --8<-- [end:lazyeager] diff --git a/docs/src/python/user-guide/lazy/using.py b/docs/src/python/user-guide/lazy/using.py new file mode 100644 index 000000000000..1a10abb189d2 --- /dev/null +++ b/docs/src/python/user-guide/lazy/using.py @@ -0,0 +1,15 @@ +import polars as pl + +""" +# --8<-- [start:dataframe] +q1 = ( + pl.scan_csv(f"docs/data/reddit.csv") + .with_columns(pl.col("name").str.to_uppercase()) + .filter(pl.col("comment_karma") > 0) +) +# --8<-- [end:dataframe] + +# --8<-- [start:fromdf] +q3 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy() +# --8<-- [end:fromdf] +""" diff --git a/docs/src/python/user-guide/misc/multiprocess.py b/docs/src/python/user-guide/misc/multiprocess.py new file mode 100644 index 000000000000..55aec52d6b9f --- /dev/null +++ b/docs/src/python/user-guide/misc/multiprocess.py @@ -0,0 +1,84 @@ +""" +# --8<-- [start:recommendation] +from multiprocessing import get_context + + +def my_fun(s): + print(s) + + +with get_context("spawn").Pool() as pool: + pool.map(my_fun, ["input1", "input2", ...]) + +# --8<-- [end:recommendation] + +# --8<-- [start:example1] +import multiprocessing +import polars as pl + + +def test_sub_process(df: pl.DataFrame, job_id): + df_filtered = df.filter(pl.col("a") > 0) + print(f"Filtered (job_id: {job_id})", df_filtered, sep="\n") + + +def create_dataset(): + return pl.DataFrame({"a": [0, 2, 3, 4, 5], "b": [0, 4, 5, 56, 4]}) + + +def setup(): + # some setup work + df = create_dataset() + df.write_parquet("/tmp/test.parquet") + + +def main(): + test_df = pl.read_parquet("/tmp/test.parquet") + + for i in range(0, 5): + proc = multiprocessing.get_context("spawn").Process( + target=test_sub_process, args=(test_df, i) + ) + proc.start() + proc.join() + + print(f"Executed sub process {i}") + + +if __name__ == "__main__": + setup() + main() + +# --8<-- [end:example1] +""" +# --8<-- [start:example2] +import multiprocessing +import polars as pl + + +def test_sub_process(df: pl.DataFrame, job_id): + df_filtered = df.filter(pl.col("a") > 0) + print(f"Filtered (job_id: {job_id})", df_filtered, sep="\n") + + +def create_dataset(): + return pl.DataFrame({"a": [0, 2, 3, 4, 5], "b": [0, 4, 5, 56, 4]}) + + +def main(): + test_df = create_dataset() + + for i in range(0, 5): + proc = multiprocessing.get_context("fork").Process( + target=test_sub_process, args=(test_df, i) + ) + proc.start() + proc.join() + + print(f"Executed sub process {i}") + + +if __name__ == "__main__": + main() + +# --8<-- [end:example2] diff --git a/docs/src/python/user-guide/sql/create.py b/docs/src/python/user-guide/sql/create.py new file mode 100644 index 000000000000..e26ffd0a31f1 --- /dev/null +++ 
b/docs/src/python/user-guide/sql/create.py @@ -0,0 +1,21 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:create] +data = {"name": ["Alice", "Bob", "Charlie", "David"], "age": [25, 30, 35, 40]} +df = pl.LazyFrame(data) + +ctx = pl.SQLContext(my_table=df, eager_execution=True) + +result = ctx.execute( + """ + CREATE TABLE older_people + AS + SELECT * FROM my_table WHERE age > 30 +""" +) + +print(ctx.execute("SELECT * FROM older_people")) +# --8<-- [end:create] diff --git a/docs/src/python/user-guide/sql/cte.py b/docs/src/python/user-guide/sql/cte.py new file mode 100644 index 000000000000..c44b906cf3ad --- /dev/null +++ b/docs/src/python/user-guide/sql/cte.py @@ -0,0 +1,24 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:cte] +ctx = pl.SQLContext() +df = pl.LazyFrame( + {"name": ["Alice", "Bob", "Charlie", "David"], "age": [25, 30, 35, 40]} +) +ctx.register("my_table", df) + +result = ctx.execute( + """ + WITH older_people AS ( + SELECT * FROM my_table WHERE age > 30 + ) + SELECT * FROM older_people WHERE STARTS_WITH(name,'C') +""", + eager=True, +) + +print(result) +# --8<-- [end:cte] diff --git a/docs/src/python/user-guide/sql/intro.py b/docs/src/python/user-guide/sql/intro.py new file mode 100644 index 000000000000..3b59ac9e70d1 --- /dev/null +++ b/docs/src/python/user-guide/sql/intro.py @@ -0,0 +1,100 @@ +# --8<-- [start:setup] +import os + +import pandas as pd +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:context] +ctx = pl.SQLContext() +# --8<-- [end:context] + +# --8<-- [start:register_context] +df = pl.DataFrame({"a": [1, 2, 3]}) +lf = pl.LazyFrame({"b": [4, 5, 6]}) + +# Register all dataframes in the global namespace: registers both df and lf +ctx = pl.SQLContext(register_globals=True) + +# Other option: register dataframe df as "df" and lazyframe lf as "lf" +ctx = pl.SQLContext(df=df, lf=lf) +# --8<-- [end:register_context] + +# --8<-- [start:register_pandas] +import pandas as pd + +df_pandas = pd.DataFrame({"c": [7, 8, 9]}) +ctx = pl.SQLContext(df_pandas=pl.from_pandas(df_pandas)) +# --8<-- [end:register_pandas] + +# --8<-- [start:execute] +# For local files use scan_csv instead +pokemon = pl.read_csv( + "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv" +) +ctx = pl.SQLContext(register_globals=True, eager_execution=True) +df_small = ctx.execute("SELECT * from pokemon LIMIT 5") +print(df_small) +# --8<-- [end:execute] + +# --8<-- [start:prepare_multiple_sources] +with open("products_categories.json", "w") as temp_file: + json_data = """{"product_id": 1, "category": "Category 1"} +{"product_id": 2, "category": "Category 1"} +{"product_id": 3, "category": "Category 2"} +{"product_id": 4, "category": "Category 2"} +{"product_id": 5, "category": "Category 3"}""" + + temp_file.write(json_data) + +with open("products_masterdata.csv", "w") as temp_file: + csv_data = """product_id,product_name +1,Product A +2,Product B +3,Product C +4,Product D +5,Product E""" + + temp_file.write(csv_data) + +sales_data = pd.DataFrame( + { + "product_id": [1, 2, 3, 4, 5], + "sales": [100, 200, 150, 250, 300], + } +) +# --8<-- [end:prepare_multiple_sources] + +# --8<-- [start:execute_multiple_sources] +# Input data: +# products_masterdata.csv with schema {'product_id': Int64, 'product_name': Utf8} +# products_categories.json with schema {'product_id': Int64, 'category': Utf8} +# sales_data is a Pandas DataFrame with 
schema {'product_id': Int64, 'sales': Int64} + +ctx = pl.SQLContext( + products_masterdata=pl.scan_csv("products_masterdata.csv"), + products_categories=pl.scan_ndjson("products_categories.json"), + sales_data=pl.from_pandas(sales_data), + eager_execution=True, +) + +query = """ +SELECT + product_id, + product_name, + category, + sales +FROM + products_masterdata +LEFT JOIN products_categories USING (product_id) +LEFT JOIN sales_data USING (product_id) +""" + +print(ctx.execute(query)) +# --8<-- [end:execute_multiple_sources] + +# --8<-- [start:clean_multiple_sources] +os.remove("products_categories.json") +os.remove("products_masterdata.csv") +# --8<-- [end:clean_multiple_sources] diff --git a/docs/src/python/user-guide/sql/show.py b/docs/src/python/user-guide/sql/show.py new file mode 100644 index 000000000000..cedf425dc54b --- /dev/null +++ b/docs/src/python/user-guide/sql/show.py @@ -0,0 +1,26 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + + +# --8<-- [start:show] +# Create some DataFrames and register them with the SQLContext +df1 = pl.LazyFrame( + { + "name": ["Alice", "Bob", "Charlie", "David"], + "age": [25, 30, 35, 40], + } +) +df2 = pl.LazyFrame( + { + "name": ["Ellen", "Frank", "Gina", "Henry"], + "age": [45, 50, 55, 60], + } +) +ctx = pl.SQLContext(mytable1=df1, mytable2=df2) + +tables = ctx.execute("SHOW TABLES", eager=True) + +print(tables) +# --8<-- [end:show] diff --git a/docs/src/python/user-guide/sql/sql_select.py b/docs/src/python/user-guide/sql/sql_select.py new file mode 100644 index 000000000000..1e040c739b99 --- /dev/null +++ b/docs/src/python/user-guide/sql/sql_select.py @@ -0,0 +1,106 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + + +# --8<-- [start:df] +df = pl.DataFrame( + { + "city": [ + "New York", + "Los Angeles", + "Chicago", + "Houston", + "Phoenix", + "Amsterdam", + ], + "country": ["USA", "USA", "USA", "USA", "USA", "Netherlands"], + "population": [8399000, 3997000, 2705000, 2320000, 1680000, 900000], + } +) + +ctx = pl.SQLContext(population=df, eager_execution=True) + +print(ctx.execute("SELECT * FROM population")) +# --8<-- [end:df] + +# --8<-- [start:group_by] +result = ctx.execute( + """ + SELECT country, AVG(population) as avg_population + FROM population + GROUP BY country + """ +) +print(result) +# --8<-- [end:group_by] + + +# --8<-- [start:orderby] +result = ctx.execute( + """ + SELECT city, population + FROM population + ORDER BY population + """ +) +print(result) +# --8<-- [end:orderby] + +# --8<-- [start:join] +income = pl.DataFrame( + { + "city": [ + "New York", + "Los Angeles", + "Chicago", + "Houston", + "Amsterdam", + "Rotterdam", + "Utrecht", + ], + "country": [ + "USA", + "USA", + "USA", + "USA", + "Netherlands", + "Netherlands", + "Netherlands", + ], + "income": [55000, 62000, 48000, 52000, 42000, 38000, 41000], + } +) +ctx.register_many(income=income) +result = ctx.execute( + """ + SELECT country, city, income, population + FROM population + LEFT JOIN income on population.city = income.city + """ +) +print(result) +# --8<-- [end:join] + + +# --8<-- [start:functions] +result = ctx.execute( + """ + SELECT city, population + FROM population + WHERE STARTS_WITH(country,'U') + """ +) +print(result) +# --8<-- [end:functions] + +# --8<-- [start:tablefunctions] +result = ctx.execute( + """ + SELECT * + FROM read_csv('docs/data/iris.csv') + """ +) +print(result) +# --8<-- [end:tablefunctions] diff --git a/docs/src/python/user-guide/transformations/concatenation.py 
b/docs/src/python/user-guide/transformations/concatenation.py new file mode 100644 index 000000000000..65b5c8239e83 --- /dev/null +++ b/docs/src/python/user-guide/transformations/concatenation.py @@ -0,0 +1,76 @@ +# --8<-- [start:setup] +import polars as pl +from datetime import datetime + +# --8<-- [end:setup] + +# --8<-- [start:vertical] +df_v1 = pl.DataFrame( + { + "a": [1], + "b": [3], + } +) +df_v2 = pl.DataFrame( + { + "a": [2], + "b": [4], + } +) +df_vertical_concat = pl.concat( + [ + df_v1, + df_v2, + ], + how="vertical", +) +print(df_vertical_concat) +# --8<-- [end:vertical] + +# --8<-- [start:horizontal] +df_h1 = pl.DataFrame( + { + "l1": [1, 2], + "l2": [3, 4], + } +) +df_h2 = pl.DataFrame( + { + "r1": [5, 6], + "r2": [7, 8], + "r3": [9, 10], + } +) +df_horizontal_concat = pl.concat( + [ + df_h1, + df_h2, + ], + how="horizontal", +) +print(df_horizontal_concat) +# --8<-- [end:horizontal] + +# --8<-- [start:cross] +df_d1 = pl.DataFrame( + { + "a": [1], + "b": [3], + } +) +df_d2 = pl.DataFrame( + { + "a": [2], + "d": [4], + } +) + +df_diagonal_concat = pl.concat( + [ + df_d1, + df_d2, + ], + how="diagonal", +) +print(df_diagonal_concat) +# --8<-- [end:cross] diff --git a/docs/src/python/user-guide/transformations/joins.py b/docs/src/python/user-guide/transformations/joins.py new file mode 100644 index 000000000000..98828020820d --- /dev/null +++ b/docs/src/python/user-guide/transformations/joins.py @@ -0,0 +1,150 @@ +# --8<-- [start:setup] +import polars as pl +from datetime import datetime + +# --8<-- [end:setup] + +# --8<-- [start:innerdf] +df_customers = pl.DataFrame( + { + "customer_id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + } +) +print(df_customers) +# --8<-- [end:innerdf] + +# --8<-- [start:innerdf2] +df_orders = pl.DataFrame( + { + "order_id": ["a", "b", "c"], + "customer_id": [1, 2, 2], + "amount": [100, 200, 300], + } +) +print(df_orders) +# --8<-- [end:innerdf2] + + +# --8<-- [start:inner] +df_inner_customer_join = df_customers.join(df_orders, on="customer_id", how="inner") +print(df_inner_customer_join) +# --8<-- [end:inner] + +# --8<-- [start:left] +df_left_join = df_customers.join(df_orders, on="customer_id", how="left") +print(df_left_join) +# --8<-- [end:left] + +# --8<-- [start:outer] +df_outer_join = df_customers.join(df_orders, on="customer_id", how="outer") +print(df_outer_join) +# --8<-- [end:outer] + +# --8<-- [start:df3] +df_colors = pl.DataFrame( + { + "color": ["red", "blue", "green"], + } +) +print(df_colors) +# --8<-- [end:df3] + +# --8<-- [start:df4] +df_sizes = pl.DataFrame( + { + "size": ["S", "M", "L"], + } +) +print(df_sizes) +# --8<-- [end:df4] + +# --8<-- [start:cross] +df_cross_join = df_colors.join(df_sizes, how="cross") +print(df_cross_join) +# --8<-- [end:cross] + +# --8<-- [start:df5] +df_cars = pl.DataFrame( + { + "id": ["a", "b", "c"], + "make": ["ford", "toyota", "bmw"], + } +) +print(df_cars) +# --8<-- [end:df5] + +# --8<-- [start:df6] +df_repairs = pl.DataFrame( + { + "id": ["c", "c"], + "cost": [100, 200], + } +) +print(df_repairs) +# --8<-- [end:df6] + +# --8<-- [start:inner2] +df_inner_join = df_cars.join(df_repairs, on="id", how="inner") +print(df_inner_join) +# --8<-- [end:inner2] + +# --8<-- [start:semi] +df_semi_join = df_cars.join(df_repairs, on="id", how="semi") +print(df_semi_join) +# --8<-- [end:semi] + +# --8<-- [start:anti] +df_anti_join = df_cars.join(df_repairs, on="id", how="anti") +print(df_anti_join) +# --8<-- [end:anti] + +# --8<-- [start:df7] +df_trades = pl.DataFrame( + { + "time": [ + datetime(2020, 1, 
1, 9, 1, 0), + datetime(2020, 1, 1, 9, 1, 0), + datetime(2020, 1, 1, 9, 3, 0), + datetime(2020, 1, 1, 9, 6, 0), + ], + "stock": ["A", "B", "B", "C"], + "trade": [101, 299, 301, 500], + } +) +print(df_trades) +# --8<-- [end:df7] + +# --8<-- [start:df8] +df_quotes = pl.DataFrame( + { + "time": [ + datetime(2020, 1, 1, 9, 0, 0), + datetime(2020, 1, 1, 9, 2, 0), + datetime(2020, 1, 1, 9, 4, 0), + datetime(2020, 1, 1, 9, 6, 0), + ], + "stock": ["A", "B", "C", "A"], + "quote": [100, 300, 501, 102], + } +) + +print(df_quotes) +# --8<-- [end:df8] + +# --8<-- [start:asofpre] +df_trades = df_trades.sort("time") +df_quotes = df_quotes.sort("time") # Set column as sorted +# --8<-- [end:asofpre] + +# --8<-- [start:asof] +df_asof_join = df_trades.join_asof(df_quotes, on="time", by="stock") +print(df_asof_join) +# --8<-- [end:asof] + +# --8<-- [start:asof2] +df_asof_tolerance_join = df_trades.join_asof( + df_quotes, on="time", by="stock", tolerance="1m" +) +print(df_asof_tolerance_join) +# --8<-- [end:asof2] diff --git a/docs/src/python/user-guide/transformations/melt.py b/docs/src/python/user-guide/transformations/melt.py new file mode 100644 index 000000000000..e9bf53a96ec7 --- /dev/null +++ b/docs/src/python/user-guide/transformations/melt.py @@ -0,0 +1,18 @@ +# --8<-- [start:df] +import polars as pl + +df = pl.DataFrame( + { + "A": ["a", "b", "a"], + "B": [1, 3, 5], + "C": [10, 11, 12], + "D": [2, 4, 6], + } +) +print(df) +# --8<-- [end:df] + +# --8<-- [start:melt] +out = df.melt(id_vars=["A", "B"], value_vars=["C", "D"]) +print(out) +# --8<-- [end:melt] diff --git a/docs/src/python/user-guide/transformations/pivot.py b/docs/src/python/user-guide/transformations/pivot.py new file mode 100644 index 000000000000..d80b26ee0c34 --- /dev/null +++ b/docs/src/python/user-guide/transformations/pivot.py @@ -0,0 +1,31 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:df] +df = pl.DataFrame( + { + "foo": ["A", "A", "B", "B", "C"], + "N": [1, 2, 2, 4, 2], + "bar": ["k", "l", "m", "n", "o"], + } +) +print(df) +# --8<-- [end:df] + +# --8<-- [start:eager] +out = df.pivot(index="foo", columns="bar", values="N", aggregate_function="first") +print(out) +# --8<-- [end:eager] + +# --8<-- [start:lazy] +q = ( + df.lazy() + .collect() + .pivot(index="foo", columns="bar", values="N", aggregate_function="first") + .lazy() +) +out = q.collect() +print(out) +# --8<-- [end:lazy] diff --git a/docs/src/python/user-guide/transformations/time-series/filter.py b/docs/src/python/user-guide/transformations/time-series/filter.py new file mode 100644 index 000000000000..6a2a28e44f8c --- /dev/null +++ b/docs/src/python/user-guide/transformations/time-series/filter.py @@ -0,0 +1,30 @@ +# --8<-- [start:df] +import polars as pl +from datetime import datetime + +df = pl.read_csv("docs/data/apple_stock.csv", try_parse_dates=True) +print(df) +# --8<-- [end:df] + +# --8<-- [start:filter] +filtered_df = df.filter( + pl.col("Date") == datetime(1995, 10, 16), +) +print(filtered_df) +# --8<-- [end:filter] + +# --8<-- [start:range] +filtered_range_df = df.filter( + pl.col("Date").is_between(datetime(1995, 7, 1), datetime(1995, 11, 1)), +) +print(filtered_range_df) +# --8<-- [end:range] + +# --8<-- [start:negative] +ts = pl.Series(["-1300-05-23", "-1400-03-02"]).str.strptime(pl.Date) + +negative_dates_df = pl.DataFrame({"ts": ts, "values": [3, 4]}) + +negative_dates_filtered_df = negative_dates_df.filter(pl.col("ts").dt.year() < -1300) +print(negative_dates_filtered_df) +# --8<-- [end:negative] diff --git 
a/docs/src/python/user-guide/transformations/time-series/parsing.py b/docs/src/python/user-guide/transformations/time-series/parsing.py new file mode 100644 index 000000000000..0e49df5495a0 --- /dev/null +++ b/docs/src/python/user-guide/transformations/time-series/parsing.py @@ -0,0 +1,43 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:df] +df = pl.read_csv("docs/data/apple_stock.csv", try_parse_dates=True) +print(df) +# --8<-- [end:df] + + +# --8<-- [start:cast] +df = pl.read_csv("docs/data/apple_stock.csv", try_parse_dates=False) + +df = df.with_columns(pl.col("Date").str.strptime(pl.Date, format="%Y-%m-%d")) +print(df) +# --8<-- [end:cast] + + +# --8<-- [start:df3] +df_with_year = df.with_columns(pl.col("Date").dt.year().alias("year")) +print(df_with_year) +# --8<-- [end:df3] + +# --8<-- [start:extract] +df_with_year = df.with_columns(pl.col("Date").dt.year().alias("year")) +print(df_with_year) +# --8<-- [end:extract] + +# --8<-- [start:mixed] +data = [ + "2021-03-27T00:00:00+0100", + "2021-03-28T00:00:00+0100", + "2021-03-29T00:00:00+0200", + "2021-03-30T00:00:00+0200", +] +mixed_parsed = ( + pl.Series(data) + .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%z") + .dt.convert_time_zone("Europe/Brussels") +) +print(mixed_parsed) +# --8<-- [end:mixed] diff --git a/docs/src/python/user-guide/transformations/time-series/resampling.py b/docs/src/python/user-guide/transformations/time-series/resampling.py new file mode 100644 index 000000000000..80a7b2597a67 --- /dev/null +++ b/docs/src/python/user-guide/transformations/time-series/resampling.py @@ -0,0 +1,36 @@ +# --8<-- [start:setup] +from datetime import datetime + +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:df] +df = pl.DataFrame( + { + "time": pl.datetime_range( + start=datetime(2021, 12, 16), + end=datetime(2021, 12, 16, 3), + interval="30m", + eager=True, + ), + "groups": ["a", "a", "a", "b", "b", "a", "a"], + "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], + } +) +print(df) +# --8<-- [end:df] + +# --8<-- [start:upsample] +out1 = df.upsample(time_column="time", every="15m").fill_null(strategy="forward") +print(out1) +# --8<-- [end:upsample] + +# --8<-- [start:upsample2] +out2 = ( + df.upsample(time_column="time", every="15m") + .interpolate() + .fill_null(strategy="forward") +) +print(out2) +# --8<-- [end:upsample2] diff --git a/docs/src/python/user-guide/transformations/time-series/rolling.py b/docs/src/python/user-guide/transformations/time-series/rolling.py new file mode 100644 index 000000000000..16f751523ade --- /dev/null +++ b/docs/src/python/user-guide/transformations/time-series/rolling.py @@ -0,0 +1,75 @@ +# --8<-- [start:setup] +import polars as pl +from datetime import date, datetime + +# --8<-- [end:setup] + +# --8<-- [start:df] +df = pl.read_csv("docs/data/apple_stock.csv", try_parse_dates=True) +df = df.sort("Date") +print(df) +# --8<-- [end:df] + +# --8<-- [start:group_by] +annual_average_df = df.group_by_dynamic("Date", every="1y").agg(pl.col("Close").mean()) + +df_with_year = annual_average_df.with_columns(pl.col("Date").dt.year().alias("year")) +print(df_with_year) +# --8<-- [end:group_by] + +# --8<-- [start:group_by_dyn] +df = ( + pl.date_range( + start=date(2021, 1, 1), + end=date(2021, 12, 31), + interval="1d", + eager=True, + ) + .alias("time") + .to_frame() +) + +out = ( + df.group_by_dynamic("time", every="1mo", period="1mo", closed="left") + .agg( + [ + pl.col("time").cumcount().reverse().head(3).alias("day/eom"), + ((pl.col("time") - 
pl.col("time").first()).last().dt.days() + 1).alias( + "days_in_month" + ), + ] + ) + .explode("day/eom") +) +print(out) +# --8<-- [end:group_by_dyn] + +# --8<-- [start:group_by_roll] +df = pl.DataFrame( + { + "time": pl.datetime_range( + start=datetime(2021, 12, 16), + end=datetime(2021, 12, 16, 3), + interval="30m", + eager=True, + ), + "groups": ["a", "a", "a", "b", "b", "a", "a"], + } +) +print(df) +# --8<-- [end:group_by_roll] + +# --8<-- [start:group_by_dyn2] +out = df.group_by_dynamic( + "time", + every="1h", + closed="both", + by="groups", + include_boundaries=True, +).agg( + [ + pl.count(), + ] +) +print(out) +# --8<-- [end:group_by_dyn2] diff --git a/docs/src/python/user-guide/transformations/time-series/timezones.py b/docs/src/python/user-guide/transformations/time-series/timezones.py new file mode 100644 index 000000000000..13234a9d8e30 --- /dev/null +++ b/docs/src/python/user-guide/transformations/time-series/timezones.py @@ -0,0 +1,27 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:example] +ts = ["2021-03-27 03:00", "2021-03-28 03:00"] +tz_naive = pl.Series("tz_naive", ts).str.strptime(pl.Datetime) +tz_aware = tz_naive.dt.replace_time_zone("UTC").rename("tz_aware") +time_zones_df = pl.DataFrame([tz_naive, tz_aware]) +print(time_zones_df) +# --8<-- [end:example] + +# --8<-- [start:example2] +time_zones_operations = time_zones_df.select( + [ + pl.col("tz_aware") + .dt.replace_time_zone("Europe/Brussels") + .alias("replace time zone"), + pl.col("tz_aware") + .dt.convert_time_zone("Asia/Kathmandu") + .alias("convert time zone"), + pl.col("tz_aware").dt.replace_time_zone(None).alias("unset time zone"), + ] +) +print(time_zones_operations) +# --8<-- [end:example2] diff --git a/docs/src/rust/getting-started/expressions.rs b/docs/src/rust/getting-started/expressions.rs new file mode 100644 index 000000000000..e8d031ebd1f7 --- /dev/null +++ b/docs/src/rust/getting-started/expressions.rs @@ -0,0 +1,144 @@ +use chrono::prelude::*; +use polars::prelude::*; +use rand::Rng; + +fn main() -> Result<(), Box> { + let mut rng = rand::thread_rng(); + + let df: DataFrame = df!("a" => 0..8, + "b"=> (0..8).map(|_| rng.gen::()).collect::>(), + "c"=> [ + NaiveDate::from_ymd_opt(2022, 12, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 12, 2).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 12, 3).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 12, 4).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 12, 5).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 12, 6).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 12, 7).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 12, 8).unwrap().and_hms_opt(0, 0, 0).unwrap(), + ], + "d"=> [Some(1.0), Some(2.0), None, None, Some(0.0), Some(-5.0), Some(-42.), None] + ) + .expect("should not fail"); + + // --8<-- [start:select] + let out = df.clone().lazy().select([col("*")]).collect()?; + println!("{}", out); + // --8<-- [end:select] + + // --8<-- [start:select2] + let out = df.clone().lazy().select([col("a"), col("b")]).collect()?; + println!("{}", out); + // --8<-- [end:select2] + + // --8<-- [start:select3] + let out = df + .clone() + .lazy() + .select([col("a"), col("b")]) + .limit(3) + .collect()?; + println!("{}", out); + // --8<-- [end:select3] + + // --8<-- [start:exclude] + let out = df + .clone() + .lazy() + .select([col("*").exclude(["a"])]) + 
.collect()?; + println!("{}", out); + // --8<-- [end:exclude] + + // --8<-- [start:filter] + let start_date = NaiveDate::from_ymd_opt(2022, 12, 2) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap(); + let end_date = NaiveDate::from_ymd_opt(2022, 12, 8) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap(); + let out = df + .clone() + .lazy() + .filter( + col("c") + .gt_eq(lit(start_date)) + .and(col("c").lt_eq(lit(end_date))), + ) + .collect()?; + println!("{}", out); + // --8<-- [end:filter] + + // --8<-- [start:filter2] + let out = df + .clone() + .lazy() + .filter(col("a").lt_eq(3).and(col("d").is_not_null())) + .collect()?; + println!("{}", out); + // --8<-- [end:filter2] + + // --8<-- [start:with_columns] + let out = df + .clone() + .lazy() + .with_columns([ + col("b").sum().alias("e"), + (col("b") + lit(42)).alias("b+42"), + ]) + .collect()?; + println!("{}", out); + // --8<-- [end:with_columns] + + // --8<-- [start:dataframe2] + let df2: DataFrame = df!("x" => 0..8, + "y"=> &["A", "A", "A", "B", "B", "C", "X", "X"], + ) + .expect("should not fail"); + println!("{}", df2); + // --8<-- [end:dataframe2] + + // --8<-- [start:group_by] + let out = df2 + .clone() + .lazy() + .group_by(["y"]) + .agg([count()]) + .collect()?; + println!("{}", out); + // --8<-- [end:group_by] + + // --8<-- [start:group_by2] + let out = df2 + .clone() + .lazy() + .group_by(["y"]) + .agg([col("*").count().alias("count"), col("*").sum().alias("sum")]) + .collect()?; + println!("{}", out); + // --8<-- [end:group_by2] + + // --8<-- [start:combine] + let out = df + .clone() + .lazy() + .with_columns([(col("a") * col("b")).alias("a * b")]) + .select([col("*").exclude(["c", "d"])]) + .collect()?; + println!("{}", out); + // --8<-- [end:combine] + + // --8<-- [start:combine2] + let out = df + .clone() + .lazy() + .with_columns([(col("a") * col("b")).alias("a * b")]) + .select([col("*").exclude(["d"])]) + .collect()?; + println!("{}", out); + // --8<-- [end:combine2] + + Ok(()) +} diff --git a/docs/src/rust/getting-started/joins.rs b/docs/src/rust/getting-started/joins.rs new file mode 100644 index 000000000000..1f583dc0e4f9 --- /dev/null +++ b/docs/src/rust/getting-started/joins.rs @@ -0,0 +1,29 @@ +use polars::prelude::*; + + +fn main() -> Result<(), Box>{ + + + // --8<-- [start:join] + use rand::Rng; + let mut rng = rand::thread_rng(); + + let df: DataFrame = df!("a" => 0..8, + "b"=> (0..8).map(|_| rng.gen::()).collect::>(), + "d"=> [Some(1.0), Some(2.0), None, None, Some(0.0), Some(-5.0), Some(-42.), None] + ).expect("should not fail"); + let df2: DataFrame = df!("x" => 0..8, + "y"=> &["A", "A", "A", "B", "B", "C", "X", "X"], + ).expect("should not fail"); + let joined = df.join(&df2,["a"],["x"],JoinType::Left,None)?; + println!("{}",joined); + // --8<-- [end:join] + + // --8<-- [start:hstack] + let stacked = df.hstack(df2.get_columns())?; + println!("{}",stacked); + // --8<-- [end:hstack] + + Ok(()) + +} diff --git a/docs/src/rust/getting-started/reading-writing.rs b/docs/src/rust/getting-started/reading-writing.rs new file mode 100644 index 000000000000..4fe035d34f82 --- /dev/null +++ b/docs/src/rust/getting-started/reading-writing.rs @@ -0,0 +1,67 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:dataframe] + use chrono::prelude::*; + use std::fs::File; + + let mut df: DataFrame = df!( + "integer" => &[1, 2, 3], + "date" => &[ + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 1, 2).unwrap().and_hms_opt(0, 0, 0).unwrap(), + 
NaiveDate::from_ymd_opt(2022, 1, 3).unwrap().and_hms_opt(0, 0, 0).unwrap(), + ], + "float" => &[4.0, 5.0, 6.0] + ) + .expect("should not fail"); + println!("{}", df); + // --8<-- [end:dataframe] + + // --8<-- [start:csv] + let mut file = File::create("docs/data/output.csv").expect("could not create file"); + CsvWriter::new(&mut file) + .has_header(true) + .with_delimiter(b',') + .finish(&mut df); + let df_csv = CsvReader::from_path("docs/data/output.csv")? + .infer_schema(None) + .has_header(true) + .finish()?; + println!("{}", df_csv); + // --8<-- [end:csv] + + // --8<-- [start:csv2] + let mut file = File::create("docs/data/output.csv").expect("could not create file"); + CsvWriter::new(&mut file) + .has_header(true) + .with_delimiter(b',') + .finish(&mut df); + let df_csv = CsvReader::from_path("docs/data/output.csv")? + .infer_schema(None) + .has_header(true) + .with_parse_dates(true) + .finish()?; + println!("{}", df_csv); + // --8<-- [end:csv2] + + // --8<-- [start:json] + let mut file = File::create("docs/data/output.json").expect("could not create file"); + JsonWriter::new(&mut file).finish(&mut df); + let mut f = File::open("docs/data/output.json")?; + let df_json = JsonReader::new(f) + .with_json_format(JsonFormat::JsonLines) + .finish()?; + println!("{}", df_json); + // --8<-- [end:json] + + // --8<-- [start:parquet] + let mut file = File::create("docs/data/output.parquet").expect("could not create file"); + ParquetWriter::new(&mut file).finish(&mut df); + let mut f = File::open("docs/data/output.parquet")?; + let df_parquet = ParquetReader::new(f).finish()?; + println!("{}", df_parquet); + // --8<-- [end:parquet] + + Ok(()) +} diff --git a/docs/src/rust/getting-started/series-dataframes.rs b/docs/src/rust/getting-started/series-dataframes.rs new file mode 100644 index 000000000000..09b45d705bac --- /dev/null +++ b/docs/src/rust/getting-started/series-dataframes.rs @@ -0,0 +1,59 @@ +fn main() -> Result<(), Box> { + // --8<-- [start:series] + use polars::prelude::*; + + let s = Series::new("a", [1, 2, 3, 4, 5]); + println!("{}", s); + // --8<-- [end:series] + + // --8<-- [start:minmax] + let s = Series::new("a", [1, 2, 3, 4, 5]); + // The use of generics is necessary for the type system + println!("{}", s.min::().unwrap()); + println!("{}", s.max::().unwrap()); + // --8<-- [end:minmax] + + // --8<-- [start:string] + // This operation is not directly available on the Series object yet, only on the DataFrame + // --8<-- [end:string] + + // --8<-- [start:dt] + // This operation is not directly available on the Series object yet, only on the DataFrame + // --8<-- [end:dt] + + // --8<-- [start:dataframe] + use chrono::prelude::*; + + let df: DataFrame = df!( + "integer" => &[1, 2, 3, 4, 5], + "date" => &[ + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 1, 2).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 1, 3).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 1, 4).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 1, 5).unwrap().and_hms_opt(0, 0, 0).unwrap() + ], + "float" => &[4.0, 5.0, 6.0, 7.0, 8.0], + ) + .unwrap(); + + println!("{}", df); + // --8<-- [end:dataframe] + + // --8<-- [start:head] + println!("{}", df.head(Some(3))); + // --8<-- [end:head] + + // --8<-- [start:tail] + println!("{}", df.tail(Some(3))); + // --8<-- [end:tail] + + // --8<-- [start:sample] + println!("{}", df.sample_n(2, false, true, None)?); + // --8<-- [end:sample] + + // 
--8<-- [start:describe] + println!("{:?}", df.describe(None)); + // --8<-- [end:describe] + Ok(()) +} diff --git a/docs/src/rust/home/example.rs b/docs/src/rust/home/example.rs new file mode 100644 index 000000000000..00cf7de67bfb --- /dev/null +++ b/docs/src/rust/home/example.rs @@ -0,0 +1,16 @@ +fn main() -> Result<(), Box> { + // --8<-- [start:example] + use polars::prelude::*; + + let q = LazyCsvReader::new("docs/data/iris.csv") + .has_header(true) + .finish()? + .filter(col("sepal_length").gt(lit(5))) + .group_by(vec![col("species")]) + .agg([col("*").sum()]); + + let df = q.collect(); + // --8<-- [end:example] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/concepts/contexts.rs b/docs/src/rust/user-guide/concepts/contexts.rs new file mode 100644 index 000000000000..b911faa8fd6d --- /dev/null +++ b/docs/src/rust/user-guide/concepts/contexts.rs @@ -0,0 +1,69 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:dataframe] + use rand::{thread_rng, Rng}; + + let mut arr = [0f64; 5]; + thread_rng().fill(&mut arr); + + let df = df! ( + "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)], + "names" => &[Some("foo"), Some("ham"), Some("spam"), Some("eggs"), None], + "random" => &arr, + "groups" => &["A", "A", "B", "C", "B"], + )?; + + println!("{}", &df); + // --8<-- [end:dataframe] + + // --8<-- [start:select] + let out = df + .clone() + .lazy() + .select([ + sum("nrs"), + col("names").sort(false), + col("names").first().alias("first name"), + (mean("nrs") * lit(10)).alias("10xnrs"), + ]) + .collect()?; + println!("{}", out); + // --8<-- [end:select] + + // --8<-- [start:filter] + let out = df.clone().lazy().filter(col("nrs").gt(lit(2))).collect()?; + println!("{}", out); + // --8<-- [end:filter] + + // --8<-- [start:with_columns] + let out = df + .clone() + .lazy() + .with_columns([ + sum("nrs").alias("nrs_sum"), + col("random").count().alias("count"), + ]) + .collect()?; + println!("{}", out); + // --8<-- [end:with_columns] + + // --8<-- [start:group_by] + let out = df + .lazy() + .group_by([col("groups")]) + .agg([ + sum("nrs"), // sum nrs by groups + col("random").count().alias("count"), // count group members + // sum random where name != null + col("random") + .filter(col("names").is_not_null()) + .sum() + .suffix("_sum"), + col("names").reverse().alias("reversed names"), + ]) + .collect()?; + println!("{}", out); + // --8<-- [end:group_by] + Ok(()) +} diff --git a/docs/src/rust/user-guide/concepts/expressions.rs b/docs/src/rust/user-guide/concepts/expressions.rs new file mode 100644 index 000000000000..9c76fc6642e8 --- /dev/null +++ b/docs/src/rust/user-guide/concepts/expressions.rs @@ -0,0 +1,24 @@ +use polars::prelude::*; +use rand::Rng; +use chrono::prelude::*; + +fn main() -> Result<(), Box>{ + + let df = df! 
( + "foo" => &[Some(1), Some(2), Some(3), None, Some(5)], + "bar" => &[Some("foo"), Some("ham"), Some("spam"), Some("egg"), None], + )?; + + // --8<-- [start:example1] + df.column("foo")?.sort(false).head(Some(2)); + // --8<-- [end:example1] + + // --8<-- [start:example2] + df.clone().lazy().select([ + col("foo").sort(Default::default()).head(Some(2)), + col("bar").filter(col("foo").eq(lit(1))).sum(), + ]).collect()?; + // --8<-- [end:example2] + + Ok(()) +} \ No newline at end of file diff --git a/docs/src/rust/user-guide/concepts/lazy-vs-eager.rs b/docs/src/rust/user-guide/concepts/lazy-vs-eager.rs new file mode 100644 index 000000000000..910235fbbf65 --- /dev/null +++ b/docs/src/rust/user-guide/concepts/lazy-vs-eager.rs @@ -0,0 +1,30 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:eager] + let df = CsvReader::from_path("docs/data/iris.csv") + .unwrap() + .finish() + .unwrap(); + let mask = df.column("sepal_length")?.f64()?.gt(5.0); + let df_small = df.filter(&mask)?; + let df_agg = df_small + .group_by(["species"])? + .select(["sepal_width"]) + .mean()?; + println!("{}", df_agg); + // --8<-- [end:eager] + + // --8<-- [start:lazy] + let q = LazyCsvReader::new("docs/data/iris.csv") + .has_header(true) + .finish()? + .filter(col("sepal_length").gt(lit(5))) + .group_by(vec![col("species")]) + .agg([col("sepal_width").mean()]); + let df = q.collect()?; + println!("{}", df); + // --8<-- [end:lazy] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/concepts/streaming.rs b/docs/src/rust/user-guide/concepts/streaming.rs new file mode 100644 index 000000000000..f00b5e92ca99 --- /dev/null +++ b/docs/src/rust/user-guide/concepts/streaming.rs @@ -0,0 +1,19 @@ +use chrono::prelude::*; +use polars::prelude::*; +use rand::Rng; + +fn main() -> Result<(), Box> { + // --8<-- [start:streaming] + let q = LazyCsvReader::new("docs/data/iris.csv") + .has_header(true) + .finish()? 
+ .filter(col("sepal_length").gt(lit(5))) + .group_by(vec![col("species")]) + .agg([col("sepal_width").mean()]); + + let df = q.with_streaming(true).collect()?; + println!("{}", df); + // --8<-- [end:streaming] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/aggregation.rs b/docs/src/rust/user-guide/expressions/aggregation.rs new file mode 100644 index 000000000000..205ec2f01bf7 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/aggregation.rs @@ -0,0 +1,204 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:dataframe] + use reqwest::blocking::Client; + use std::io::Cursor; + + let url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"; + + let mut schema = Schema::new(); + schema.with_column("first_name".to_string(), DataType::Categorical(None)); + schema.with_column("gender".to_string(), DataType::Categorical(None)); + schema.with_column("type".to_string(), DataType::Categorical(None)); + schema.with_column("state".to_string(), DataType::Categorical(None)); + schema.with_column("party".to_string(), DataType::Categorical(None)); + schema.with_column("birthday".to_string(), DataType::Date); + + let data: Vec = Client::new().get(url).send()?.text()?.bytes().collect(); + + let dataset = CsvReader::new(Cursor::new(data)) + .has_header(true) + .with_dtypes(Some(&schema)) + .with_parse_dates(true) + .finish()?; + + println!("{}", &dataset); + // --8<-- [end:dataframe] + + // --8<-- [start:basic] + let df = dataset + .clone() + .lazy() + .group_by(["first_name"]) + .agg([count(), col("gender").list(), col("last_name").first()]) + .sort( + "count", + SortOptions { + descending: true, + nulls_last: true, + }, + ) + .limit(5) + .collect()?; + + println!("{}", df); + // --8<-- [end:basic] + + // --8<-- [start:conditional] + let df = dataset + .clone() + .lazy() + .group_by(["state"]) + .agg([ + (col("party").eq(lit("Anti-Administration"))) + .sum() + .alias("anti"), + (col("party").eq(lit("Pro-Administration"))) + .sum() + .alias("pro"), + ]) + .sort( + "pro", + SortOptions { + descending: true, + nulls_last: false, + }, + ) + .limit(5) + .collect()?; + + println!("{}", df); + // --8<-- [end:conditional] + + // --8<-- [start:nested] + let df = dataset + .clone() + .lazy() + .group_by(["state", "party"]) + .agg([col("party").count().alias("count")]) + .filter( + col("party") + .eq(lit("Anti-Administration")) + .or(col("party").eq(lit("Pro-Administration"))), + ) + .sort( + "count", + SortOptions { + descending: true, + nulls_last: true, + }, + ) + .limit(5) + .collect()?; + + println!("{}", df); + // --8<-- [end:nested] + + // --8<-- [start:filter] + fn compute_age() -> Expr { + lit(2022) - col("birthday").dt().year() + } + + fn avg_birthday(gender: &str) -> Expr { + compute_age() + .filter(col("gender").eq(lit(gender))) + .mean() + .alias(&format!("avg {} birthday", gender)) + } + + let df = dataset + .clone() + .lazy() + .group_by(["state"]) + .agg([ + avg_birthday("M"), + avg_birthday("F"), + (col("gender").eq(lit("M"))).sum().alias("# male"), + (col("gender").eq(lit("F"))).sum().alias("# female"), + ]) + .limit(5) + .collect()?; + + println!("{}", df); + // --8<-- [end:filter] + + // --8<-- [start:sort] + fn get_person() -> Expr { + col("first_name") + lit(" ") + col("last_name") + } + + let df = dataset + .clone() + .lazy() + .sort( + "birthday", + SortOptions { + descending: true, + nulls_last: true, + }, + ) + .group_by(["state"]) + .agg([ + get_person().first().alias("youngest"), + 
get_person().last().alias("oldest"), + ]) + .limit(5) + .collect()?; + + println!("{}", df); + // --8<-- [end:sort] + + // --8<-- [start:sort2] + let df = dataset + .clone() + .lazy() + .sort( + "birthday", + SortOptions { + descending: true, + nulls_last: true, + }, + ) + .group_by(["state"]) + .agg([ + get_person().first().alias("youngest"), + get_person().last().alias("oldest"), + get_person().sort(false).first().alias("alphabetical_first"), + ]) + .limit(5) + .collect()?; + + println!("{}", df); + // --8<-- [end:sort2] + + // --8<-- [start:sort3] + let df = dataset + .clone() + .lazy() + .sort( + "birthday", + SortOptions { + descending: true, + nulls_last: true, + }, + ) + .group_by(["state"]) + .agg([ + get_person().first().alias("youngest"), + get_person().last().alias("oldest"), + get_person().sort(false).first().alias("alphabetical_first"), + col("gender") + .sort_by(["first_name"], [false]) + .first() + .alias("gender"), + ]) + .sort("state", SortOptions::default()) + .limit(5) + .collect()?; + + println!("{}", df); + // --8<-- [end:sort3] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/casting.rs b/docs/src/rust/user-guide/expressions/casting.rs new file mode 100644 index 000000000000..2dda1e185215 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/casting.rs @@ -0,0 +1,201 @@ +// --8<-- [start:setup] +use polars::lazy::dsl::StrptimeOptions; +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:dfnum] + let df = df! ( + "integers"=> &[1, 2, 3, 4, 5], + "big_integers"=> &[1, 10000002, 3, 10000004, 10000005], + "floats"=> &[4.0, 5.0, 6.0, 7.0, 8.0], + "floats_with_decimal"=> &[4.532, 5.5, 6.5, 7.5, 8.5], + )?; + + println!("{}", &df); + // --8<-- [end:dfnum] + + // --8<-- [start:castnum] + let out = df + .clone() + .lazy() + .select([ + col("integers") + .cast(DataType::Float32) + .alias("integers_as_floats"), + col("floats") + .cast(DataType::Int32) + .alias("floats_as_integers"), + col("floats_with_decimal") + .cast(DataType::Int32) + .alias("floats_with_decimal_as_integers"), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:castnum] + + // --8<-- [start:downcast] + let out = df + .clone() + .lazy() + .select([ + col("integers") + .cast(DataType::Int16) + .alias("integers_smallfootprint"), + col("floats") + .cast(DataType::Float32) + .alias("floats_smallfootprint"), + ]) + .collect(); + match out { + Ok(out) => println!("{}", &out), + Err(e) => println!("{:?}", e), + }; + // --8<-- [end:downcast] + + // --8<-- [start:overflow] + + let out = df + .clone() + .lazy() + .select([col("big_integers").strict_cast(DataType::Int8)]) + .collect(); + match out { + Ok(out) => println!("{}", &out), + Err(e) => println!("{:?}", e), + }; + // --8<-- [end:overflow] + + // --8<-- [start:overflow2] + let out = df + .clone() + .lazy() + .select([col("big_integers").cast(DataType::Int8)]) + .collect(); + match out { + Ok(out) => println!("{}", &out), + Err(e) => println!("{:?}", e), + }; + // --8<-- [end:overflow2] + + // --8<-- [start:strings] + + let df = df! ( + "integers" => &[1, 2, 3, 4, 5], + "float" => &[4.0, 5.03, 6.0, 7.0, 8.0], + "floats_as_string" => &["4.0", "5.0", "6.0", "7.0", "8.0"], + )?; + + let out = df + .clone() + .lazy() + .select([ + col("integers").cast(DataType::Utf8), + col("float").cast(DataType::Utf8), + col("floats_as_string").cast(DataType::Float64), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:strings] + + // --8<-- [start:strings2] + + let df = df! 
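+ // "not_a_number" cannot be parsed as a float, so the cast below is matched for a possible error.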
("strings_not_float"=> ["4.0", "not_a_number", "6.0", "7.0", "8.0"])?; + + let out = df + .clone() + .lazy() + .select([col("strings_not_float").cast(DataType::Float64)]) + .collect(); + match out { + Ok(out) => println!("{}", &out), + Err(e) => println!("{:?}", e), + }; + // --8<-- [end:strings2] + + // --8<-- [start:bool] + + let df = df! ( + "integers"=> &[-1, 0, 2, 3, 4], + "floats"=> &[0.0, 1.0, 2.0, 3.0, 4.0], + "bools"=> &[true, false, true, false, true], + )?; + + let out = df + .clone() + .lazy() + .select([ + col("integers").cast(DataType::Boolean), + col("floats").cast(DataType::Boolean), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:bool] + + // --8<-- [start:dates] + + use chrono::prelude::*; + use polars::time::*; + + let df = df! ( + "date" => date_range( + "date", + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 1, 5).unwrap().and_hms_opt(0, 0, 0).unwrap(), + Duration::parse("1d"), + ClosedWindow::Both, + TimeUnit::Milliseconds, + None + )?.cast(&DataType::Date)?, + "datetime" => datetime_range( + "datetime", + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2022, 1, 5).unwrap().and_hms_opt(0, 0, 0).unwrap(), + Duration::parse("1d"), + ClosedWindow::Both, + TimeUnit::Milliseconds, + None + )?, + )?; + + let out = df + .clone() + .lazy() + .select([ + col("date").cast(DataType::Int64), + col("datetime").cast(DataType::Int64), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:dates] + + // --8<-- [start:dates2] + + let df = df! ( + "date" => date_range("date", + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), NaiveDate::from_ymd_opt(2022, 1, 5).unwrap().and_hms_opt(0, 0, 0).unwrap(), Duration::parse("1d"),ClosedWindow::Both, TimeUnit::Milliseconds, None)?, + "string" => &[ + "2022-01-01", + "2022-01-02", + "2022-01-03", + "2022-01-04", + "2022-01-05", + ], + )?; + + let out = df + .clone() + .lazy() + .select([ + col("date").dt().strftime("%Y-%m-%d"), + col("string").str().strptime( + DataType::Datetime(TimeUnit::Microseconds, None), + StrptimeOptions::default(), + ), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:dates2] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/column-selections.rs b/docs/src/rust/user-guide/expressions/column-selections.rs new file mode 100644 index 000000000000..105cc6f102df --- /dev/null +++ b/docs/src/rust/user-guide/expressions/column-selections.rs @@ -0,0 +1,99 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:selectors_df] + + use chrono::prelude::*; + use polars::time::*; + + let df = df!( + "id" => &[9, 4, 2], + "place" => &["Mars", "Earth", "Saturn"], + "date" => date_range("date", + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), NaiveDate::from_ymd_opt(2022, 1, 3).unwrap().and_hms_opt(0, 0, 0).unwrap(), Duration::parse("1d"),ClosedWindow::Both, TimeUnit::Milliseconds, None)?, + "sales" => &[33.4, 2142134.1, 44.7], + "has_people" => &[false, true, false], + "logged_at" => date_range("logged_at", + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 2).unwrap(), Duration::parse("1s"),ClosedWindow::Both, TimeUnit::Milliseconds, None)?, + )? 
+ .with_row_count("rn", None)?; + println!("{}", &df); + // --8<-- [end:selectors_df] + + // --8<-- [start:all] + let out = df.clone().lazy().select([col("*")]).collect()?; + + // Is equivalent to + let out = df.clone().lazy().select([all()]).collect()?; + println!("{}", &out); + // --8<-- [end:all] + + // --8<-- [start:exclude] + let out = df + .clone() + .lazy() + .select([col("*").exclude(["logged_at", "rn"])]) + .collect()?; + println!("{}", &out); + // --8<-- [end:exclude] + + // --8<-- [start:expansion_by_names] + let out = df + .clone() + .lazy() + .select([cols(["date", "logged_at"]).dt().to_string("%Y-%h-%d")]) + .collect()?; + println!("{}", &out); + // --8<-- [end:expansion_by_names] + + // --8<-- [start:expansion_by_regex] + let out = df.clone().lazy().select([col("^.*(as|sa).*$")]).collect()?; + println!("{}", &out); + // --8<-- [end:expansion_by_regex] + + // --8<-- [start:expansion_by_dtype] + let out = df + .clone() + .lazy() + .select([dtype_cols([DataType::Int64, DataType::UInt32, DataType::Boolean]).n_unique()]) + .collect()?; + // gives different result than python as the id col is i32 in rust + println!("{}", &out); + // --8<-- [end:expansion_by_dtype] + + // --8<-- [start:selectors_intro] + // Not available in Rust, refer the following link + // https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors_intro] + + // --8<-- [start:selectors_diff] + // Not available in Rust, refer the following link + // https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors_diff] + + // --8<-- [start:selectors_union] + // Not available in Rust, refer the following link + // https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors_union] + + // --8<-- [start:selectors_by_name] + // Not available in Rust, refer the following link + // https://github.com/pola-rs/polars/issues/1059 + // --8<-- [end:selectors_by_name] + + // --8<-- [start:selectors_to_expr] + // Not available in Rust, refer the following link + // https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors_to_expr] + + // --8<-- [start:selectors_is_selector_utility] + // Not available in Rust, refer the following link + // https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors_is_selector_utility] + + // --8<-- [start:selectors_colnames_utility] + // Not available in Rust, refer the following link + // https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors_colnames_utility] + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/folds.rs b/docs/src/rust/user-guide/expressions/folds.rs new file mode 100644 index 000000000000..b851557f8e37 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/folds.rs @@ -0,0 +1,49 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + + // --8<-- [start:mansum] + let df = df!( + "a" => &[1, 2, 3], + "b" => &[10, 20, 30], + )?; + + let out = df + .lazy() + .select([fold_exprs(lit(0), |acc, x| Ok(Some(acc + x)), [col("*")]).alias("sum")]) + .collect()?; + println!("{}", out); + // --8<-- [end:mansum] + + // --8<-- [start:conditional] + let df = df!( + "a" => &[1, 2, 3], + "b" => &[0, 1, 2], + )?; + + let out = df + .lazy() + .filter(fold_exprs( + lit(true), + |acc, x| Some(acc.bitand(&x)), + [col("*").gt(1)], + )) + .collect()?; + println!("{}", out); + // --8<-- [end:conditional] + + // --8<-- [start:string] + let df = df!( + "a" => &["a", "b", "c"], + "b" => &[1, 2, 3], + )?; + + let out = df + .lazy() + .select([concat_str([col("a"), col("b")], "")]) + .collect()?; + 
println!("{:?}", out); + // --8<-- [end:string] + + Ok(()) +} \ No newline at end of file diff --git a/docs/src/rust/user-guide/expressions/functions.rs b/docs/src/rust/user-guide/expressions/functions.rs new file mode 100644 index 000000000000..490809b75557 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/functions.rs @@ -0,0 +1,79 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:dataframe] + use rand::{thread_rng, Rng}; + + let mut arr = [0f64; 5]; + thread_rng().fill(&mut arr); + + let df = df! ( + "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)], + "names" => &["foo", "ham", "spam", "egg", "spam"], + "random" => &arr, + "groups" => &["A", "A", "B", "C", "B"], + )?; + + println!("{}", &df); + // --8<-- [end:dataframe] + + // --8<-- [start:samename] + let df_samename = df.clone().lazy().select([col("nrs") + lit(5)]).collect()?; + println!("{}", &df_samename); + // --8<-- [end:samename] + + // --8<-- [start:samenametwice] + let df_samename2 = df + .clone() + .lazy() + .select([col("nrs") + lit(5), col("nrs") - lit(5)]) + .collect(); + match df_samename2 { + Ok(df) => println!("{}", &df), + Err(e) => println!("{:?}", &e), + }; + // --8<-- [end:samenametwice] + + // --8<-- [start:samenamealias] + let df_alias = df + .clone() + .lazy() + .select([ + (col("nrs") + lit(5)).alias("nrs + 5"), + (col("nrs") - lit(5)).alias("nrs - 5"), + ]) + .collect()?; + println!("{}", &df_alias); + // --8<-- [end:samenamealias] + + // --8<-- [start:countunique] + let df_alias = df + .clone() + .lazy() + .select([ + col("names").n_unique().alias("unique"), + // Following query shows there isn't anything in Rust API + // https://docs.rs/polars/latest/polars/?search=approx_n_unique + // col("names").approx_n_unique().alias("unique_approx"), + ]) + .collect()?; + println!("{}", &df_alias); + // --8<-- [end:countunique] + + // --8<-- [start:conditional] + let df_conditional = df + .clone() + .lazy() + .select([ + col("nrs"), + when(col("nrs").gt(2)) + .then(lit(true)) + .otherwise(lit(false)) + .alias("conditional"), + ]) + .collect()?; + println!("{}", &df_conditional); + // --8<-- [end:conditional] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/lists.rs b/docs/src/rust/user-guide/expressions/lists.rs new file mode 100644 index 000000000000..05b1b4f6f1f9 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/lists.rs @@ -0,0 +1,162 @@ +// --8<-- [start:setup] +use polars::prelude::*; +// --8<-- [end:setup] +fn main() -> Result<(), Box> { + // --8<-- [start:weather_df] + let stns: Vec = (1..6).map(|i| format!("Station {i}")).collect(); + let weather = df!( + "station"=> &stns, + "temperatures"=> &[ + "20 5 5 E1 7 13 19 9 6 20", + "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40", + "19 24 E9 16 6 12 10 22", + "E2 E0 15 7 8 10 E1 24 17 13 6", + "14 8 E0 16 22 24 E1", + ], + )?; + println!("{}", &weather); + // --8<-- [end:weather_df] + + // --8<-- [start:string_to_list] + let out = weather + .clone() + .lazy() + .with_columns([col("temperatures").str().split(" ")]) + .collect()?; + println!("{}", &out); + // --8<-- [end:string_to_list] + + // --8<-- [start:explode_to_atomic] + let out = weather + .clone() + .lazy() + .with_columns([col("temperatures").str().split(" ")]) + .explode(["temperatures"]) + .collect()?; + println!("{}", &out); + // --8<-- [end:explode_to_atomic] + + // --8<-- [start:list_ops] + let out = weather + .clone() + .lazy() + .with_columns([col("temperatures").str().split(" ")]) + .with_columns([ + 
col("temperatures").list().head(lit(3)).alias("top3"), + col("temperatures") + .list() + .slice(lit(-3), lit(3)) + .alias("bottom_3"), + col("temperatures").list().lengths().alias("obs"), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:list_ops] + + // --8<-- [start:count_errors] + let out = weather + .clone() + .lazy() + .with_columns([col("temperatures") + .str() + .split(" ") + .list() + .eval(col("").cast(DataType::Int64).is_null(), false) + .list() + .sum() + .alias("errors")]) + .collect()?; + println!("{}", &out); + // --8<-- [end:count_errors] + + // --8<-- [start:count_errors_regex] + let out = weather + .clone() + .lazy() + .with_columns([col("temperatures") + .str() + .split(" ") + .list() + .eval(col("").str().contains(lit("(?i)[a-z]"), false), false) + .list() + .sum() + .alias("errors")]) + .collect()?; + println!("{}", &out); + // --8<-- [end:count_errors_regex] + + // --8<-- [start:weather_by_day] + let stns: Vec = (1..11).map(|i| format!("Station {i}")).collect(); + let weather_by_day = df!( + "station" => &stns, + "day_1" => &[17, 11, 8, 22, 9, 21, 20, 8, 8, 17], + "day_2" => &[15, 11, 10, 8, 7, 14, 18, 21, 15, 13], + "day_3" => &[16, 15, 24, 24, 8, 23, 19, 23, 16, 10], + )?; + println!("{}", &weather_by_day); + // --8<-- [end:weather_by_day] + + // --8<-- [start:weather_by_day_rank] + let rank_pct = (col("") + .rank( + RankOptions { + method: RankMethod::Average, + descending: true, + }, + None, + ) + .cast(DataType::Float32) + / col("*").count().cast(DataType::Float32)) + .round(2); + + let out = weather_by_day + .clone() + .lazy() + .with_columns( + // create the list of homogeneous data + [concat_list([all().exclude(["station"])])?.alias("all_temps")], + ) + .select( + // select all columns except the intermediate list + [ + all().exclude(["all_temps"]), + // compute the rank by calling `list.eval` + col("all_temps") + .list() + .eval(rank_pct, true) + .alias("temps_rank"), + ], + ) + .collect()?; + + println!("{}", &out); + // --8<-- [end:weather_by_day_rank] + + // --8<-- [start:array_df] + let mut col1: ListPrimitiveChunkedBuilder = + ListPrimitiveChunkedBuilder::new("Array_1", 8, 8, DataType::Int32); + col1.append_slice(&[1, 3]); + col1.append_slice(&[2, 5]); + let mut col2: ListPrimitiveChunkedBuilder = + ListPrimitiveChunkedBuilder::new("Array_2", 8, 8, DataType::Int32); + col2.append_slice(&[1, 7, 3]); + col2.append_slice(&[8, 1, 0]); + let array_df = DataFrame::new([col1.finish(), col2.finish()].into())?; + + println!("{}", &array_df); + // --8<-- [end:array_df] + + // --8<-- [start:array_ops] + let out = array_df + .clone() + .lazy() + .select([ + col("Array_1").list().min().suffix("_min"), + col("Array_2").list().sum().suffix("_sum"), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:array_ops] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/null.rs b/docs/src/rust/user-guide/expressions/null.rs new file mode 100644 index 000000000000..8d78310cb0a9 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/null.rs @@ -0,0 +1,89 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:dataframe] + + let df = df! 
( + "value" => &[Some(1), None], + )?; + + println!("{}", &df); + // --8<-- [end:dataframe] + + // --8<-- [start:count] + let null_count_df = df.null_count(); + println!("{}", &null_count_df); + // --8<-- [end:count] + + // --8<-- [start:isnull] + let is_null_series = df + .clone() + .lazy() + .select([col("value").is_null()]) + .collect()?; + println!("{}", &is_null_series); + // --8<-- [end:isnull] + + // --8<-- [start:dataframe2] + let df = df!( + "col1" => &[Some(1), Some(2), Some(3)], + "col2" => &[Some(1), None, Some(3)], + + )?; + println!("{}", &df); + // --8<-- [end:dataframe2] + + // --8<-- [start:fill] + let fill_literal_df = df + .clone() + .lazy() + .with_columns([col("col2").fill_null(lit(2))]) + .collect()?; + println!("{}", &fill_literal_df); + // --8<-- [end:fill] + + // --8<-- [start:fillstrategy] + let fill_forward_df = df + .clone() + .lazy() + .with_columns([col("col2").forward_fill(None)]) + .collect()?; + println!("{}", &fill_forward_df); + // --8<-- [end:fillstrategy] + + // --8<-- [start:fillexpr] + let fill_median_df = df + .clone() + .lazy() + .with_columns([col("col2").fill_null(median("col2"))]) + .collect()?; + println!("{}", &fill_median_df); + // --8<-- [end:fillexpr] + + // --8<-- [start:fillinterpolate] + let fill_interpolation_df = df + .clone() + .lazy() + .with_columns([col("col2").interpolate(InterpolationMethod::Linear)]) + .collect()?; + println!("{}", &fill_interpolation_df); + // --8<-- [end:fillinterpolate] + + // --8<-- [start:nan] + let nan_df = df!( + "value" => [1.0, f64::NAN, f64::NAN, 3.0], + )?; + println!("{}", &nan_df); + // --8<-- [end:nan] + + // --8<-- [start:nanfill] + let mean_nan_df = nan_df + .clone() + .lazy() + .with_columns([col("value").fill_nan(lit(NULL)).alias("value")]) + .mean() + .collect()?; + println!("{}", &mean_nan_df); + // --8<-- [end:nanfill] + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/operators.rs b/docs/src/rust/user-guide/expressions/operators.rs new file mode 100644 index 000000000000..868d301c2182 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/operators.rs @@ -0,0 +1,54 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:dataframe] + use rand::{thread_rng, Rng}; + + let mut arr = [0f64; 5]; + thread_rng().fill(&mut arr); + + let df = df! 
( + "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)], + "names" => &[Some("foo"), Some("ham"), Some("spam"), Some("eggs"), None], + "random" => &arr, + "groups" => &["A", "A", "B", "C", "B"], + )?; + + println!("{}", &df); + // --8<-- [end:dataframe] + + // --8<-- [start:numerical] + let df_numerical = df + .clone() + .lazy() + .select([ + (col("nrs") + lit(5)).alias("nrs + 5"), + (col("nrs") - lit(5)).alias("nrs - 5"), + (col("nrs") * col("random")).alias("nrs * random"), + (col("nrs") / col("random")).alias("nrs / random"), + ]) + .collect()?; + println!("{}", &df_numerical); + // --8<-- [end:numerical] + + // --8<-- [start:logical] + let df_logical = df + .clone() + .lazy() + .select([ + col("nrs").gt(1).alias("nrs > 1"), + col("random").lt_eq(0.5).alias("random < .5"), + col("nrs").neq(1).alias("nrs != 1"), + col("nrs").eq(1).alias("nrs == 1"), + (col("random").lt_eq(0.5)) + .and(col("nrs").gt(1)) + .alias("and_expr"), // and + (col("random").lt_eq(0.5)) + .or(col("nrs").gt(1)) + .alias("or_expr"), // or + ]) + .collect()?; + println!("{}", &df_logical); + // --8<-- [end:logical] + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/strings.rs b/docs/src/rust/user-guide/expressions/strings.rs new file mode 100644 index 000000000000..f3020e4fa2ce --- /dev/null +++ b/docs/src/rust/user-guide/expressions/strings.rs @@ -0,0 +1,93 @@ +// --8<-- [start:setup] +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:df] + let df = df! ( + "animal" => &[Some("Crab"), Some("cat and dog"), Some("rab$bit"), None], + )?; + + let out = df + .clone() + .lazy() + .select([ + col("animal").str().lengths().alias("byte_count"), + col("animal").str().n_chars().alias("letter_count"), + ]) + .collect()?; + + println!("{}", &out); + // --8<-- [end:df] + + // --8<-- [start:existence] + let out = df + .clone() + .lazy() + .select([ + col("animal"), + col("animal") + .str() + .contains(lit("cat|bit"), false) + .alias("regex"), + col("animal") + .str() + .contains_literal(lit("rab$")) + .alias("literal"), + col("animal") + .str() + .starts_with(lit("rab")) + .alias("starts_with"), + col("animal").str().ends_with(lit("dog")).alias("ends_with"), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:existence] + + // --8<-- [start:extract] + let df = df!( + "a" => &[ + "http://vote.com/ballon_dor?candidate=messi&ref=polars", + "http://vote.com/ballon_dor?candidat=jorginho&ref=polars", + "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars", + ] + )?; + let out = df + .clone() + .lazy() + .select([col("a").str().extract(r"candidate=(\w+)", 1)]) + .collect()?; + println!("{}", &out); + // --8<-- [end:extract] + + // --8<-- [start:extract_all] + let df = df!("foo"=> &["123 bla 45 asd", "xyz 678 910t"])?; + let out = df + .clone() + .lazy() + .select([col("foo") + .str() + .extract_all(lit(r"(\d+)")) + .alias("extracted_nrs")]) + .collect()?; + println!("{}", &out); + // --8<-- [end:extract_all] + + // --8<-- [start:replace] + let df = df!("id"=> &[1, 2], "text"=> &["123abc", "abc456"])?; + let out = df + .clone() + .lazy() + .with_columns([ + col("text").str().replace(lit(r"abc\b"), lit("ABC"), false), + col("text") + .str() + .replace_all(lit("a"), lit("-"), false) + .alias("text_replace_all"), + ]) + .collect()?; + println!("{}", &out); + // --8<-- [end:replace] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs new file mode 100644 index 000000000000..662e264222a6 --- 
/dev/null +++ b/docs/src/rust/user-guide/expressions/structs.rs @@ -0,0 +1,99 @@ +// --8<-- [start:setup] +use polars::{lazy::dsl::count, prelude::*}; +// --8<-- [end:setup] +fn main() -> Result<(), Box> { + // --8<-- [start:ratings_df] + let ratings = df!( + "Movie"=> &["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"], + "Theatre"=> &["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"], + "Avg_Rating"=> &[4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6], + "Count"=> &[30, 27, 26, 29, 31, 28, 28, 26, 33, 26], + + )?; + println!("{}", &ratings); + // --8<-- [end:ratings_df] + + // --8<-- [start:state_value_counts] + let out = ratings + .clone() + .lazy() + .select([col("Theatre").value_counts(true, true)]) + .collect()?; + println!("{}", &out); + // --8<-- [end:state_value_counts] + + // --8<-- [start:struct_unnest] + let out = ratings + .clone() + .lazy() + .select([col("Theatre").value_counts(true, true)]) + .unnest(["Theatre"]) + .collect()?; + println!("{}", &out); + // --8<-- [end:struct_unnest] + + // --8<-- [start:series_struct] + // Don't think we can make it the same way in rust, but this works + let rating_series = df!( + "Movie" => &["Cars", "Toy Story"], + "Theatre" => &["NE", "ME"], + "Avg_Rating" => &[4.5, 4.9], + )? + .into_struct("ratings") + .into_series(); + println!("{}", &rating_series); + // // --8<-- [end:series_struct] + + // --8<-- [start:series_struct_extract] + let out = rating_series.struct_()?.field_by_name("Movie")?; + println!("{}", &out); + // --8<-- [end:series_struct_extract] + + // --8<-- [start:series_struct_rename] + let out = DataFrame::new([rating_series].into())? + .lazy() + .select([col("ratings") + .struct_() + .rename_fields(["Film".into(), "State".into(), "Value".into()].to_vec())]) + .unnest(["ratings"]) + .collect()?; + + println!("{}", &out); + // --8<-- [end:series_struct_rename] + + // --8<-- [start:struct_duplicates] + let out = ratings + .clone() + .lazy() + // .filter(as_struct(&[col("Movie"), col("Theatre")]).is_duplicated()) + // Error: .is_duplicated() not available if you try that + // https://github.com/pola-rs/polars/issues/3803 + .filter(count().over([col("Movie"), col("Theatre")]).gt(lit(1))) + .collect()?; + println!("{}", &out); + // --8<-- [end:struct_duplicates] + + // --8<-- [start:struct_ranking] + let out = ratings + .clone() + .lazy() + .with_columns([as_struct(&[col("Count"), col("Avg_Rating")]) + .rank( + RankOptions { + method: RankMethod::Dense, + descending: false, + }, + None, + ) + .over([col("Movie"), col("Theatre")]) + .alias("Rank")]) + // .filter(as_struct(&[col("Movie"), col("Theatre")]).is_duplicated()) + // Error: .is_duplicated() not available if you try that + // https://github.com/pola-rs/polars/issues/3803 + .filter(count().over([col("Movie"), col("Theatre")]).gt(lit(1))) + .collect()?; + println!("{}", &out); + // --8<-- [end:struct_ranking] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs new file mode 100644 index 000000000000..7cbe1605f3e3 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -0,0 +1,84 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:dataframe] + let df = df!( + "keys" => &["a", "a", "b"], + "values" => &[10, 7, 1], + )?; + + let out = df + .lazy() + .group_by(["keys"]) + .agg([ + col("values") + .map(|s| Ok(s.shift(1)), GetOutput::default()) + .alias("shift_map"), + 
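+ // The same shift written as a native expression, for comparison with the custom map above.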
col("values").shift(1).alias("shift_expression"), + ]) + .collect()?; + + println!("{}", out); + // --8<-- [end:dataframe] + + // --8<-- [start:apply] + let out = df + .clone() + .lazy() + .group_by([col("keys")]) + .agg([ + col("values") + .apply(|s| Ok(s.shift(1)), GetOutput::default()) + .alias("shift_map"), + col("values").shift(1).alias("shift_expression"), + ]) + .collect()?; + println!("{}", out); + // --8<-- [end:apply] + + // --8<-- [start:counter] + + // --8<-- [end:counter] + + // --8<-- [start:combine] + let out = df + .lazy() + .select([ + // pack to struct to get access to multiple fields in a custom `apply/map` + as_struct(&[col("keys"), col("values")]) + // we will compute the len(a) + b + .apply( + |s| { + // downcast to struct + let ca = s.struct_()?; + + // get the fields as Series + let s_a = &ca.fields()[0]; + let s_b = &ca.fields()[1]; + + // downcast the `Series` to their known type + let ca_a = s_a.utf8()?; + let ca_b = s_b.i32()?; + + // iterate both `ChunkedArrays` + let out: Int32Chunked = ca_a + .into_iter() + .zip(ca_b) + .map(|(opt_a, opt_b)| match (opt_a, opt_b) { + (Some(a), Some(b)) => Some(a.len() as i32 + b), + _ => None, + }) + .collect(); + + Ok(out.into_series()) + }, + GetOutput::from_type(DataType::Int32), + ) + .alias("solution_apply"), + (col("keys").str().count_match(".") + col("values")).alias("solution_expr"), + ]) + .collect()?; + println!("{}", out); + // --8<-- [end:combine] + Ok(()) +} diff --git a/docs/src/rust/user-guide/expressions/window.rs b/docs/src/rust/user-guide/expressions/window.rs new file mode 100644 index 000000000000..2fcc32cdc309 --- /dev/null +++ b/docs/src/rust/user-guide/expressions/window.rs @@ -0,0 +1,131 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box> { + // --8<-- [start:pokemon] + use polars::prelude::*; + use reqwest::blocking::Client; + + let data: Vec = Client::new() + .get("https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv") + .send()? + .text()? 
+ .bytes() + .collect(); + + let df = CsvReader::new(std::io::Cursor::new(data)) + .has_header(true) + .finish()?; + + println!("{}", df); + // --8<-- [end:pokemon] + + // --8<-- [start:group_by] + let out = df + .clone() + .lazy() + .select([ + col("Type 1"), + col("Type 2"), + col("Attack") + .mean() + .over(["Type 1"]) + .alias("avg_attack_by_type"), + col("Defense") + .mean() + .over(["Type 1", "Type 2"]) + .alias("avg_defense_by_type_combination"), + col("Attack").mean().alias("avg_attack"), + ]) + .collect()?; + + println!("{}", out); + // --8<-- [end:group_by] + + // --8<-- [start:operations] + let filtered = df + .clone() + .lazy() + .filter(col("Type 2").eq(lit("Psychic"))) + .select([col("Name"), col("Type 1"), col("Speed")]) + .collect()?; + + println!("{}", filtered); + // --8<-- [end:operations] + + // --8<-- [start:sort] + let out = filtered + .lazy() + .with_columns([cols(["Name", "Speed"]) + .sort_by(["Speed"], [true]) + .over(["Type 1"])]) + .collect()?; + println!("{}", out); + // --8<-- [end:sort] + + // --8<-- [start:rules] + // aggregate and broadcast within a group + // output type: -> i32 + sum("foo").over([col("groups")]) + // sum within a group and multiply with group elements + // output type: -> i32 + (col("x").sum() * col("y")) + .over([col("groups")]) + .alias("x1") + // sum within a group and multiply with group elements + // and aggregate the group to a list + // output type: -> ChunkedArray + (col("x").sum() * col("y")) + .list() + .over([col("groups")]) + .alias("x2") + // note that it will require an explicit `list()` call + // sum within a group and multiply with group elements + // and aggregate the group to a list + // the flatten call explodes that list + + // This is the fastest method to do things over groups when the groups are sorted + (col("x").sum() * col("y")) + .list() + .over([col("groups")]) + .flatten() + .alias("x3"); + // --8<-- [end:rules] + + // --8<-- [start:examples] + let out = df + .clone() + .lazy() + .select([ + col("Type 1") + .head(Some(3)) + .list() + .over(["Type 1"]) + .flatten(), + col("Name") + .sort_by(["Speed"], [true]) + .head(Some(3)) + .list() + .over(["Type 1"]) + .flatten() + .alias("fastest/group"), + col("Name") + .sort_by(["Attack"], [true]) + .head(Some(3)) + .list() + .over(["Type 1"]) + .flatten() + .alias("strongest/group"), + col("Name") + .sort(false) + .head(Some(3)) + .list() + .over(["Type 1"]) + .flatten() + .alias("sorted_by_alphabet"), + ]) + .collect()?; + println!("{:?}", out); + // --8<-- [end:examples] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/io/aws.rs b/docs/src/rust/user-guide/io/aws.rs new file mode 100644 index 000000000000..0a1924d9d294 --- /dev/null +++ b/docs/src/rust/user-guide/io/aws.rs @@ -0,0 +1,32 @@ +""" +# --8<-- [start:bucket] +use aws_sdk_s3::Region; + +use aws_config::meta::region::RegionProviderChain; +use aws_sdk_s3::Client; +use std::borrow::Cow; + +use polars::prelude::*; + +#[tokio::main] +async fn main() { + let bucket = ""; + let path = ""; + + let config = aws_config::from_env().load().await; + let client = Client::new(&config); + + let req = client.get_object().bucket(bucket).key(path); + + let res = req.clone().send().await.unwrap(); + let bytes = res.body.collect().await.unwrap(); + let bytes = bytes.into_bytes(); + + let cursor = std::io::Cursor::new(bytes); + + let df = CsvReader::new(cursor).finish().unwrap(); + + println!("{:?}", df); +} +# --8<-- [end:bucket] +""" diff --git a/docs/src/rust/user-guide/io/csv.rs b/docs/src/rust/user-guide/io/csv.rs new file 
mode 100644 index 000000000000..7c56d813e626 --- /dev/null +++ b/docs/src/rust/user-guide/io/csv.rs @@ -0,0 +1,29 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box>{ + + """ + // --8<-- [start:read] + use polars::prelude::*; + + let df = CsvReader::from_path("docs/data/path.csv").unwrap().finish().unwrap(); + // --8<-- [end:read] + """ + + // --8<-- [start:write] + let mut df = df!( + "foo" => &[1, 2, 3], + "bar" => &[None, Some("bak"), Some("baz")], + ) + .unwrap(); + + let mut file = std::fs::File::create("docs/data/path.csv").unwrap(); + CsvWriter::new(&mut file).finish(&mut df).unwrap(); + // --8<-- [end:write] + + // --8<-- [start:scan] + let df = LazyCsvReader::new("./test.csv").finish().unwrap(); + // --8<-- [end:scan] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/io/json-file.rs b/docs/src/rust/user-guide/io/json-file.rs new file mode 100644 index 000000000000..ab4df729c955 --- /dev/null +++ b/docs/src/rust/user-guide/io/json-file.rs @@ -0,0 +1,47 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box>{ + + """ + // --8<-- [start:read] + use polars::prelude::*; + + let mut file = std::fs::File::open("docs/data/path.json").unwrap(); + let df = JsonReader::new(&mut file).finish().unwrap(); + // --8<-- [end:read] + + + // --8<-- [start:readnd] + let mut file = std::fs::File::open("docs/data/path.json").unwrap(); + let df = JsonLineReader::new(&mut file).finish().unwrap(); + // --8<-- [end:readnd] + """ + + // --8<-- [start:write] + let mut df = df!( + "foo" => &[1, 2, 3], + "bar" => &[None, Some("bak"), Some("baz")], + ) + .unwrap(); + + let mut file = std::fs::File::create("docs/data/path.json").unwrap(); + + // json + JsonWriter::new(&mut file) + .with_json_format(JsonFormat::Json) + .finish(&mut df) + .unwrap(); + + // ndjson + JsonWriter::new(&mut file) + .with_json_format(JsonFormat::JsonLines) + .finish(&mut df) + .unwrap(); + // --8<-- [end:write] + + // --8<-- [start:scan] + let df = LazyJsonLineReader::new("docs/data/path.json".to_string()).finish().unwrap(); + // --8<-- [end:scan] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/io/parquet.rs b/docs/src/rust/user-guide/io/parquet.rs new file mode 100644 index 000000000000..f3469ffd4e2c --- /dev/null +++ b/docs/src/rust/user-guide/io/parquet.rs @@ -0,0 +1,30 @@ +use polars::prelude::*; + +fn main() -> Result<(), Box>{ + + """ + // --8<-- [start:read] + let mut file = std::fs::File::open("docs/data/path.parquet").unwrap(); + + let df = ParquetReader::new(&mut file).finish().unwrap(); + // --8<-- [end:read] + """ + + // --8<-- [start:write] + let mut df = df!( + "foo" => &[1, 2, 3], + "bar" => &[None, Some("bak"), Some("baz")], + ) + .unwrap(); + + let mut file = std::fs::File::create("docs/data/path.parquet").unwrap(); + ParquetWriter::new(&mut file).finish(&mut df).unwrap(); + // --8<-- [end:write] + + // --8<-- [start:scan] + let args = ScanArgsParquet::default(); + let df = LazyFrame::scan_parquet("./file.parquet",args).unwrap(); + // --8<-- [end:scan] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/concatenation.rs b/docs/src/rust/user-guide/transformations/concatenation.rs new file mode 100644 index 000000000000..ecb9dba877a6 --- /dev/null +++ b/docs/src/rust/user-guide/transformations/concatenation.rs @@ -0,0 +1,49 @@ +// --8<-- [start:setup] +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:vertical] + let df_v1 = df!( + "a"=> &[1], + "b"=> &[3], + )?; + let df_v2 = df!( + "a"=> &[2], + "b"=> &[4], + )?; + let df_vertical_concat 
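+ // Vertically concatenate the two one-row frames; their schemas must match.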
= concat( + [df_v1.clone().lazy(), df_v2.clone().lazy()], + UnionArgs::default(), + )? + .collect()?; + println!("{}", &df_vertical_concat); + // --8<-- [end:vertical] + + // --8<-- [start:horizontal] + let df_h1 = df!( + "l1"=> &[1, 2], + "l2"=> &[3, 4], + )?; + let df_h2 = df!( + "r1"=> &[5, 6], + "r2"=> &[7, 8], + "r3"=> &[9, 10], + )?; + let df_horizontal_concat = polars::functions::hor_concat_df(&[df_h1, df_h2])?; + println!("{}", &df_horizontal_concat); + // --8<-- [end:horizontal] + + // --8<-- [start:cross] + let df_d1 = df!( + "a"=> &[1], + "b"=> &[3], + )?; + let df_d2 = df!( + "a"=> &[2], + "d"=> &[4],)?; + let df_diagonal_concat = polars::functions::diag_concat_df(&[df_d1, df_d2])?; + println!("{}", &df_diagonal_concat); + // --8<-- [end:cross] + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/joins.rs b/docs/src/rust/user-guide/transformations/joins.rs new file mode 100644 index 000000000000..aa444c5d9a1a --- /dev/null +++ b/docs/src/rust/user-guide/transformations/joins.rs @@ -0,0 +1,205 @@ +// --8<-- [start:setup] +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:innerdf] + let df_customers = df! ( + + "customer_id" => &[1, 2, 3], + "name" => &["Alice", "Bob", "Charlie"], + )?; + + println!("{}", &df_customers); + // --8<-- [end:innerdf] + + // --8<-- [start:innerdf2] + let df_orders = df!( + "order_id"=> &["a", "b", "c"], + "customer_id"=> &[1, 2, 2], + "amount"=> &[100, 200, 300], + )?; + println!("{}", &df_orders); + // --8<-- [end:innerdf2] + + // --8<-- [start:inner] + let df_inner_customer_join = df_customers + .clone() + .lazy() + .join( + df_orders.clone().lazy(), + [col("customer_id")], + [col("customer_id")], + JoinArgs::new(JoinType::Inner), + ) + .collect()?; + println!("{}", &df_inner_customer_join); + // --8<-- [end:inner] + + // --8<-- [start:left] + let df_left_join = df_customers + .clone() + .lazy() + .join( + df_orders.clone().lazy(), + [col("customer_id")], + [col("customer_id")], + JoinArgs::new(JoinType::Left), + ) + .collect()?; + println!("{}", &df_left_join); + // --8<-- [end:left] + + // --8<-- [start:outer] + let df_outer_join = df_customers + .clone() + .lazy() + .join( + df_orders.clone().lazy(), + [col("customer_id")], + [col("customer_id")], + JoinArgs::new(JoinType::Outer), + ) + .collect()?; + println!("{}", &df_outer_join); + // --8<-- [end:outer] + + // --8<-- [start:df3] + let df_colors = df!( + "color"=> &["red", "blue", "green"], + )?; + println!("{}", &df_colors); + // --8<-- [end:df3] + + // --8<-- [start:df4] + let df_sizes = df!( + "size"=> &["S", "M", "L"], + )?; + println!("{}", &df_sizes); + // --8<-- [end:df4] + + // --8<-- [start:cross] + let df_cross_join = df_colors + .clone() + .lazy() + .cross_join(df_sizes.clone().lazy()) + .collect()?; + println!("{}", &df_cross_join); + // --8<-- [end:cross] + + // --8<-- [start:df5] + let df_cars = df!( + "id"=> &["a", "b", "c"], + "make"=> &["ford", "toyota", "bmw"], + )?; + println!("{}", &df_cars); + // --8<-- [end:df5] + + // --8<-- [start:df6] + let df_repairs = df!( + "id"=> &["c", "c"], + "cost"=> &[100, 200], + )?; + println!("{}", &df_repairs); + // --8<-- [end:df6] + + // --8<-- [start:inner2] + let df_inner_join = df_cars + .clone() + .lazy() + .inner_join(df_repairs.clone().lazy(), col("id"), col("id")) + .collect()?; + println!("{}", &df_inner_join); + // --8<-- [end:inner2] + + // --8<-- [start:semi] + let df_semi_join = df_cars + .clone() + .lazy() + .join( + df_repairs.clone().lazy(), + [col("id")], + 
[col("id")], + JoinArgs::new(JoinType::Semi), + ) + .collect()?; + println!("{}", &df_semi_join); + // --8<-- [end:semi] + + // --8<-- [start:anti] + let df_anti_join = df_cars + .clone() + .lazy() + .join( + df_repairs.clone().lazy(), + [col("id")], + [col("id")], + JoinArgs::new(JoinType::Anti), + ) + .collect()?; + println!("{}", &df_anti_join); + // --8<-- [end:anti] + + // --8<-- [start:df7] + use chrono::prelude::*; + let df_trades = df!( + "time"=> &[ + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 1, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 3, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), + ], + "stock"=> &["A", "B", "B", "C"], + "trade"=> &[101, 299, 301, 500], + )?; + println!("{}", &df_trades); + // --8<-- [end:df7] + + // --8<-- [start:df8] + let df_quotes = df!( + "time"=> &[ + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 2, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 4, 0).unwrap(), + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_opt(9, 6, 0).unwrap(), + ], + "stock"=> &["A", "B", "C", "A"], + "quote"=> &[100, 300, 501, 102], + )?; + + println!("{}", &df_quotes); + // --8<-- [end:df8] + + // --8<-- [start:asofpre] + let df_trades = df_trades.sort(["time"], false, true).unwrap(); + let df_quotes = df_quotes.sort(["time"], false, true).unwrap(); + // --8<-- [end:asofpre] + + // --8<-- [start:asof] + let df_asof_join = df_trades.join_asof_by( + &df_quotes, + "time", + "time", + ["stock"], + ["stock"], + AsofStrategy::Backward, + None, + )?; + println!("{}", &df_asof_join); + // --8<-- [end:asof] + + // --8<-- [start:asof2] + let df_asof_tolerance_join = df_trades.join_asof_by( + &df_quotes, + "time", + "time", + ["stock"], + ["stock"], + AsofStrategy::Backward, + Some(AnyValue::Duration(60000, TimeUnit::Milliseconds)), + )?; + println!("{}", &df_asof_tolerance_join); + // --8<-- [end:asof2] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/melt.rs b/docs/src/rust/user-guide/transformations/melt.rs new file mode 100644 index 000000000000..ff797423d293 --- /dev/null +++ b/docs/src/rust/user-guide/transformations/melt.rs @@ -0,0 +1,21 @@ +// --8<-- [start:setup] +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:df] + let df = df!( + "A"=> &["a", "b", "a"], + "B"=> &[1, 3, 5], + "C"=> &[10, 11, 12], + "D"=> &[2, 4, 6], + )?; + println!("{}", &df); + // --8<-- [end:df] + + // --8<-- [start:melt] + let out = df.melt(["A", "B"], ["C", "D"])?; + println!("{}", &out); + // --8<-- [end:melt] + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/pivot.rs b/docs/src/rust/user-guide/transformations/pivot.rs new file mode 100644 index 000000000000..e632f095f31b --- /dev/null +++ b/docs/src/rust/user-guide/transformations/pivot.rs @@ -0,0 +1,28 @@ +// --8<-- [start:setup] +use polars::prelude::{pivot::pivot, *}; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:df] + let df = df!( + "foo"=> ["A", "A", "B", "B", "C"], + "N"=> [1, 2, 2, 4, 2], + "bar"=> ["k", "l", "m", "n", "o"], + )?; + println!("{}", &df); + // --8<-- [end:df] + + // --8<-- [start:eager] + let out = pivot(&df, ["N"], ["foo"], ["bar"], false, None, None)?; + println!("{}", &out); + // --8<-- [end:eager] + + // --8<-- 
[start:lazy] + let q = df.lazy(); + let q2 = pivot(&q.collect()?, ["N"], ["foo"], ["bar"], false, None, None)?.lazy(); + let out = q2.collect()?; + println!("{}", &out); + // --8<-- [end:lazy] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/time-series/filter.rs b/docs/src/rust/user-guide/transformations/time-series/filter.rs new file mode 100644 index 000000000000..6e5b2175b81c --- /dev/null +++ b/docs/src/rust/user-guide/transformations/time-series/filter.rs @@ -0,0 +1,61 @@ +// --8<-- [start:setup] +use chrono::prelude::*; +use polars::io::prelude::*; +use polars::lazy::dsl::StrptimeOptions; +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:df] + let df = CsvReader::from_path("docs/data/apple_stock.csv") + .unwrap() + .with_try_parse_dates(true) + .finish() + .unwrap(); + println!("{}", &df); + // --8<-- [end:df] + + // --8<-- [start:filter] + let filtered_df = df + .clone() + .lazy() + .filter(col("Date").eq(lit(NaiveDate::from_ymd_opt(1995, 10, 16).unwrap()))) + .collect()?; + println!("{}", &filtered_df); + // --8<-- [end:filter] + + // --8<-- [start:range] + let filtered_range_df = df + .clone() + .lazy() + .filter( + col("Date") + .gt(lit(NaiveDate::from_ymd_opt(1995, 7, 1).unwrap())) + .and(col("Date").lt(lit(NaiveDate::from_ymd_opt(1995, 11, 1).unwrap()))), + ) + .collect()?; + println!("{}", &filtered_range_df); + // --8<-- [end:range] + + // --8<-- [start:negative] + let negative_dates_df = df!( + "ts"=> &["-1300-05-23", "-1400-03-02"], + "values"=> &[3, 4])? + .lazy() + .with_column( + col("ts") + .str() + .strptime(DataType::Date, StrptimeOptions::default()), + ) + .collect()?; + + let negative_dates_filtered_df = negative_dates_df + .clone() + .lazy() + .filter(col("ts").dt().year().lt(-1300)) + .collect()?; + println!("{}", &negative_dates_filtered_df); + // --8<-- [end:negative] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/time-series/parsing.rs b/docs/src/rust/user-guide/transformations/time-series/parsing.rs new file mode 100644 index 000000000000..275ed0bf0e6a --- /dev/null +++ b/docs/src/rust/user-guide/transformations/time-series/parsing.rs @@ -0,0 +1,75 @@ +// --8<-- [start:setup] +use polars::io::prelude::*; +use polars::lazy::dsl::StrptimeOptions; +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:df] + let df = CsvReader::from_path("docs/data/apple_stock.csv") + .unwrap() + .with_try_parse_dates(true) + .finish() + .unwrap(); + println!("{}", &df); + // --8<-- [end:df] + + // --8<-- [start:cast] + let df = CsvReader::from_path("docs/data/apple_stock.csv") + .unwrap() + .with_try_parse_dates(false) + .finish() + .unwrap(); + let df = df + .clone() + .lazy() + .with_columns([col("Date") + .str() + .strptime(DataType::Date, StrptimeOptions::default())]) + .collect()?; + println!("{}", &df); + // --8<-- [end:cast] + + // --8<-- [start:df3] + let df_with_year = df + .clone() + .lazy() + .with_columns([col("Date").dt().year().alias("year")]) + .collect()?; + println!("{}", &df_with_year); + // --8<-- [end:df3] + + // --8<-- [start:extract] + let df_with_year = df + .clone() + .lazy() + .with_columns([col("Date").dt().year().alias("year")]) + .collect()?; + println!("{}", &df_with_year); + // --8<-- [end:extract] + + // --8<-- [start:mixed] + let data = [ + "2021-03-27T00:00:00+0100", + "2021-03-28T00:00:00+0100", + "2021-03-29T00:00:00+0200", + "2021-03-30T00:00:00+0200", + ]; + let q = col("date") + .str() + .strptime( + 
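+ // %z in the format string keeps each string's UTC offset; the result is then converted to Europe/Brussels.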
DataType::Datetime(TimeUnit::Microseconds, None), + StrptimeOptions { + format: Some("%Y-%m-%dT%H:%M:%S%z".to_string()), + ..Default::default() + }, + ) + .dt() + .convert_time_zone("Europe/Brussels".to_string()); + let mixed_parsed = df!("date" => &data)?.lazy().select([q]).collect()?; + + println!("{}", &mixed_parsed); + // --8<-- [end:mixed] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/time-series/resampling.rs b/docs/src/rust/user-guide/transformations/time-series/resampling.rs new file mode 100644 index 000000000000..60888c264e12 --- /dev/null +++ b/docs/src/rust/user-guide/transformations/time-series/resampling.rs @@ -0,0 +1,43 @@ +// --8<-- [start:setup] +use chrono::prelude::*; +use polars::io::prelude::*; +use polars::prelude::*; +use polars::time::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:df] + let df = df!( + "time" => date_range( + "time", + NaiveDate::from_ymd_opt(2021, 12, 16).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2021, 12, 16).unwrap().and_hms_opt(3, 0, 0).unwrap(), + Duration::parse("30m"), + ClosedWindow::Both, + TimeUnit::Milliseconds, None)?, + "groups" => &["a", "a", "a", "b", "b", "a", "a"], + "values" => &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], + )?; + println!("{}", &df); + // --8<-- [end:df] + + // --8<-- [start:upsample] + let out1 = df + .clone() + .upsample::<[String; 0]>([], "time", Duration::parse("15m"), Duration::parse("0"))? + .fill_null(FillNullStrategy::Forward(None))?; + println!("{}", &out1); + // --8<-- [end:upsample] + + // --8<-- [start:upsample2] + let out2 = df + .clone() + .upsample::<[String; 0]>([], "time", Duration::parse("15m"), Duration::parse("0"))? + .lazy() + .with_columns([col("values").interpolate(InterpolationMethod::Linear)]) + .collect()? 
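+ // After interpolating the numeric column, forward-fill the remaining nulls (e.g. the group labels).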
+ .fill_null(FillNullStrategy::Forward(None))?; + println!("{}", &out2); + // --8<-- [end:upsample2] + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/time-series/rolling.rs b/docs/src/rust/user-guide/transformations/time-series/rolling.rs new file mode 100644 index 000000000000..6458eb69bdfc --- /dev/null +++ b/docs/src/rust/user-guide/transformations/time-series/rolling.rs @@ -0,0 +1,130 @@ +// --8<-- [start:setup] +use chrono::prelude::*; +use polars::io::prelude::*; +use polars::lazy::dsl::GetOutput; +use polars::prelude::*; +use polars::time::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:df] + let df = CsvReader::from_path("docs/data/apple_stock.csv") + .unwrap() + .with_try_parse_dates(true) + .finish() + .unwrap() + .sort(["Date"], false, true)?; + println!("{}", &df); + // --8<-- [end:df] + + // --8<-- [start:group_by] + let annual_average_df = df + .clone() + .lazy() + .groupby_dynamic( + col("Date"), + [], + DynamicGroupOptions { + every: Duration::parse("1y"), + period: Duration::parse("1y"), + offset: Duration::parse("0"), + ..Default::default() + }, + ) + .agg([col("Close").mean()]) + .collect()?; + + let df_with_year = annual_average_df + .lazy() + .with_columns([col("Date").dt().year().alias("year")]) + .collect()?; + println!("{}", &df_with_year); + // --8<-- [end:group_by] + + // --8<-- [start:group_by_dyn] + let df = df!( + "time" => date_range( + "time", + NaiveDate::from_ymd_opt(2021, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2021, 12, 31).unwrap().and_hms_opt(0, 0, 0).unwrap(), + Duration::parse("1d"), + ClosedWindow::Both, + TimeUnit::Milliseconds, None)?.cast(&DataType::Date)?)?; + + let out = df + .clone() + .lazy() + .groupby_dynamic( + col("time"), + [], + DynamicGroupOptions { + every: Duration::parse("1mo"), + period: Duration::parse("1mo"), + offset: Duration::parse("0"), + closed_window: ClosedWindow::Left, + ..Default::default() + }, + ) + .agg([ + col("time") + .cumcount(true) // python example has false + .reverse() + .head(Some(3)) + .alias("day/eom"), + ((col("time").last() - col("time").first()).map( + // had to use map as .duration().days() is not available + |s| { + Ok(Some( + s.duration()? 
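+ // Convert the millisecond durations to whole days (86_400_000 ms per day).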
+ .into_iter() + .map(|d| d.map(|v| v / 1000 / 24 / 60 / 60)) + .collect::() + .into_series(), + )) + }, + GetOutput::from_type(DataType::Int64), + ) + lit(1)) + .alias("days_in_month"), + ]) + .explode([col("day/eom")]) + .collect()?; + println!("{}", &out); + // --8<-- [end:group_by_dyn] + + // --8<-- [start:group_by_roll] + let df = df!( + "time" => date_range( + "time", + NaiveDate::from_ymd_opt(2021, 12, 16).unwrap().and_hms_opt(0, 0, 0).unwrap(), + NaiveDate::from_ymd_opt(2021, 12, 16).unwrap().and_hms_opt(3, 0, 0).unwrap(), + Duration::parse("30m"), + ClosedWindow::Both, + TimeUnit::Milliseconds, None)?, + "groups"=> ["a", "a", "a", "b", "b", "a", "a"], + )?; + println!("{}", &df); + // --8<-- [end:group_by_roll] + + // --8<-- [start:group_by_dyn2] + let out = df + .clone() + .lazy() + .groupby_dynamic( + col("time"), + [col("groups")], + DynamicGroupOptions { + every: Duration::parse("1h"), + period: Duration::parse("1h"), + offset: Duration::parse("0"), + include_boundaries: true, + closed_window: ClosedWindow::Both, + ..Default::default() + }, + ) + .agg([count()]) + .collect()?; + println!("{}", &out); + // --8<-- [end:group_by_dyn2] + + Ok(()) +} diff --git a/docs/src/rust/user-guide/transformations/time-series/timezones.rs b/docs/src/rust/user-guide/transformations/time-series/timezones.rs new file mode 100644 index 000000000000..20f818954667 --- /dev/null +++ b/docs/src/rust/user-guide/transformations/time-series/timezones.rs @@ -0,0 +1,46 @@ +// --8<-- [start:setup] +use polars::prelude::*; +// --8<-- [end:setup] + +fn main() -> Result<(), Box> { + // --8<-- [start:example] + let ts = ["2021-03-27 03:00", "2021-03-28 03:00"]; + let tz_naive = Series::new("tz_naive", &ts); + let time_zones_df = DataFrame::new(vec![tz_naive])? + .lazy() + .select([col("tz_naive").str().strptime( + DataType::Datetime(TimeUnit::Milliseconds, None), + StrptimeOptions::default(), + )]) + .with_columns([col("tz_naive") + .dt() + .replace_time_zone(Some("UTC".to_string()), None) + .alias("tz_aware")]) + .collect()?; + + println!("{}", &time_zones_df); + // --8<-- [end:example] + + // --8<-- [start:example2] + let time_zones_operations = time_zones_df + .lazy() + .select([ + col("tz_aware") + .dt() + .replace_time_zone(Some("Europe/Brussels".to_string()), None) + .alias("replace time zone"), + col("tz_aware") + .dt() + .convert_time_zone("Asia/Kathmandu".to_string()) + .alias("convert time zone"), + col("tz_aware") + .dt() + .replace_time_zone(None, None) + .alias("unset time zone"), + ]) + .collect()?; + println!("{}", &time_zones_operations); + // --8<-- [end:example2] + + Ok(()) +} diff --git a/docs/user-guide/concepts/contexts.md b/docs/user-guide/concepts/contexts.md new file mode 100644 index 000000000000..604ff311ca63 --- /dev/null +++ b/docs/user-guide/concepts/contexts.md @@ -0,0 +1,64 @@ +# Contexts + +Polars has developed its own Domain Specific Language (DSL) for transforming data. The language is very easy to use and allows for complex queries that remain human readable. The two core components of the language are Contexts and Expressions, the latter we will cover in the next section. + +A context, as implied by the name, refers to the context in which an expression needs to be evaluated. There are three main contexts [^1]: + +1. Selection: `df.select([..])`, `df.with_columns([..])` +1. Filtering: `df.filter()` +1. 
Group by / Aggregation: `df.group_by(..).agg([..])` + +The examples below are performed on the following `DataFrame`: + +{{code_block('user-guide/concepts/contexts','dataframe',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/contexts" +--8<-- "python/user-guide/concepts/contexts.py:setup" +--8<-- "python/user-guide/concepts/contexts.py:dataframe" +``` + +## Select + +In the `select` context the selection applies expressions over columns. The expressions in this context must produce `Series` that are all the same length or have a length of 1. + +A `Series` of a length of 1 will be broadcasted to match the height of the `DataFrame`. Note that a select may produce new columns that are aggregations, combinations of expressions, or literals. + +{{code_block('user-guide/concepts/contexts','select',['select'])}} + +```python exec="on" result="text" session="user-guide/contexts" +--8<-- "python/user-guide/concepts/contexts.py:select" +``` + +As you can see from the query the `select` context is very powerful and allows you to perform arbitrary expressions independent (and in parallel) of each other. + +Similarly to the `select` statement there is the `with_columns` statement which also is an entrance to the selection context. The main difference is that `with_columns` retains the original columns and adds new ones while `select` drops the original columns. + +{{code_block('user-guide/concepts/contexts','with_columns',['with_columns'])}} + +```python exec="on" result="text" session="user-guide/contexts" +--8<-- "python/user-guide/concepts/contexts.py:with_columns" +``` + +## Filter + +In the `filter` context you filter the existing dataframe based on arbitrary expression which evaluates to the `Boolean` data type. + +{{code_block('user-guide/concepts/contexts','filter',['filter'])}} + +```python exec="on" result="text" session="user-guide/contexts" +--8<-- "python/user-guide/concepts/contexts.py:filter" +``` + +## Group by / aggregation + +In the `group_by` context, expressions work on groups and thus may yield results of any length (a group may have many members). + +{{code_block('user-guide/concepts/contexts','group_by',['group_by'])}} + +```python exec="on" result="text" session="user-guide/contexts" +--8<-- "python/user-guide/concepts/contexts.py:group_by" +``` + +As you can see from the result all expressions are applied to the group defined by the `group_by` context. Besides the standard `group_by`, `group_by_dynamic`, and `group_by_rolling` are also entrances to the group by context. + +[^1]: There are additional List and SQL contexts which are covered later in this guide. But for simplicity, we leave them out of scope for now. diff --git a/docs/user-guide/concepts/data-structures.md b/docs/user-guide/concepts/data-structures.md new file mode 100644 index 000000000000..1825f8bbc892 --- /dev/null +++ b/docs/user-guide/concepts/data-structures.md @@ -0,0 +1,68 @@ +# Data structures + +The core base data structures provided by Polars are `Series` and `DataFrames`. + +## Series + +Series are a 1-dimensional data structure. Within a series all elements have the same [Data Type](data-types.md) . +The snippet below shows how to create a simple named `Series` object. 
+ +{{code_block('getting-started/series-dataframes','series',['Series'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:series" +``` + +## DataFrame + +A `DataFrame` is a 2-dimensional data structure that is backed by a `Series`, and it can be seen as an abstraction of a collection (e.g. list) of `Series`. Operations that can be executed on a `DataFrame` are very similar to what is done in a `SQL` like query. You can `GROUP BY`, `JOIN`, `PIVOT`, but also define custom functions. + +{{code_block('getting-started/series-dataframes','dataframe',['DataFrame'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:dataframe" +``` + +### Viewing data + +This part focuses on viewing data in a `DataFrame`. We will use the `DataFrame` from the previous example as a starting point. + +#### Head + +The `head` function shows by default the first 5 rows of a `DataFrame`. You can specify the number of rows you want to see (e.g. `df.head(10)`). + +{{code_block('getting-started/series-dataframes','head',['head'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:head" +``` + +#### Tail + +The `tail` function shows the last 5 rows of a `DataFrame`. You can also specify the number of rows you want to see, similar to `head`. + +{{code_block('getting-started/series-dataframes','tail',['tail'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:tail" +``` + +#### Sample + +If you want to get an impression of the data of your `DataFrame`, you can also use `sample`. With `sample` you get an _n_ number of random rows from the `DataFrame`. + +{{code_block('getting-started/series-dataframes','sample',['sample'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:sample" +``` + +#### Describe + +`Describe` returns summary statistics of your `DataFrame`. It will provide several quick statistics if possible. + +{{code_block('getting-started/series-dataframes','describe',['describe'])}} + +```python exec="on" result="text" session="getting-started/series" +--8<-- "python/getting-started/series-dataframes.py:describe" +``` diff --git a/docs/user-guide/concepts/data-types.md b/docs/user-guide/concepts/data-types.md new file mode 100644 index 000000000000..c63c9b4a37f7 --- /dev/null +++ b/docs/user-guide/concepts/data-types.md @@ -0,0 +1,31 @@ +# Data types + +`Polars` is entirely based on `Arrow` data types and backed by `Arrow` memory arrays. This makes data processing +cache-efficient and well-supported for Inter Process Communication. Most data types follow the exact implementation +from `Arrow`, with the exception of `Utf8` (this is actually `LargeUtf8`), `Categorical`, and `Object` (support is limited). The data types are: + +| Group | Type | Details | +| -------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| Numeric | `Int8` | 8-bit signed integer. | +| | `Int16` | 16-bit signed integer. | +| | `Int32` | 32-bit signed integer. | +| | `Int64` | 64-bit signed integer. | +| | `UInt8` | 8-bit unsigned integer. | +| | `UInt16` | 16-bit unsigned integer. | +| | `UInt32` | 32-bit unsigned integer. | +| | `UInt64` | 64-bit unsigned integer. 
| +| | `Float32` | 32-bit floating point. | +| | `Float64` | 64-bit floating point. | +| Nested | `Struct` | A struct array is represented as a `Vec` and is useful to pack multiple/heterogenous values in a single column. | +| | `List` | A list array contains a child array containing the list values and an offset array. (this is actually `Arrow` `LargeList` internally). | +| Temporal | `Date` | Date representation, internally represented as days since UNIX epoch encoded by a 32-bit signed integer. | +| | `Datetime` | Datetime representation, internally represented as microseconds since UNIX epoch encoded by a 64-bit signed integer. | +| | `Duration` | A timedelta type, internally represented as microseconds. Created when subtracting `Date/Datetime`. | +| | `Time` | Time representation, internally represented as nanoseconds since midnight. | +| Other | `Boolean` | Boolean type effectively bit packed. | +| | `Utf8` | String data (this is actually `Arrow` `LargeUtf8` internally). | +| | `Binary` | Store data as bytes. | +| | `Object` | A limited supported data type that can be any value. | +| | `Categorical` | A categorical encoding of a set of strings. | + +To learn more about the internal representation of these data types, check the [`Arrow` columnar format](https://arrow.apache.org/docs/format/Columnar.html). diff --git a/docs/user-guide/concepts/expressions.md b/docs/user-guide/concepts/expressions.md new file mode 100644 index 000000000000..b276c494a4a3 --- /dev/null +++ b/docs/user-guide/concepts/expressions.md @@ -0,0 +1,49 @@ +# Expressions + +`Polars` has a powerful concept called expressions that is central to its very fast performance. + +Expressions are at the core of many data science operations: + +- taking a sample of rows from a column +- multiplying values in a column +- extracting a column of years from dates +- convert a column of strings to lowercase +- and so on! + +However, expressions are also used within other operations: + +- taking the mean of a group in a `group_by` operation +- calculating the size of groups in a `group_by` operation +- taking the sum horizontally across columns + +`Polars` performs these core data transformations very quickly by: + +- automatic query optimization on each expression +- automatic parallelization of expressions on many columns + +Polars expressions are a mapping from a series to a series (or mathematically `Fn(Series) -> Series`). As expressions have a `Series` as an input and a `Series` as an output then it is straightforward to do a sequence of expressions (similar to method chaining in `Pandas`). + +## Examples + +The following is an expression: + +{{code_block('user-guide/concepts/expressions','example1',['col','sort','head'])}} + +The snippet above says: + +1. Select column "foo" +1. Then sort the column (not in reversed order) +1. Then take the first two values of the sorted output + +The power of expressions is that every expression produces a new expression, and that they +can be _piped_ together. You can run an expression by passing them to one of `Polars` execution contexts. + +Here we run two expressions by running `df.select`: + +{{code_block('user-guide/concepts/expressions','example2',['select'])}} + +All expressions are run in parallel, meaning that separate `Polars` expressions are **embarrassingly parallel**. Note that within an expression there may be more parallelization going on. + +## Conclusion + +This is the tip of the iceberg in terms of possible expressions. 
There are a ton more, and they can be combined in a variety of ways. This page is intended to get you familiar with the concept of expressions, in the section on [expressions](../expressions/operators.md) we will dive deeper. diff --git a/docs/user-guide/concepts/lazy-vs-eager.md b/docs/user-guide/concepts/lazy-vs-eager.md new file mode 100644 index 000000000000..1b84a0272aa5 --- /dev/null +++ b/docs/user-guide/concepts/lazy-vs-eager.md @@ -0,0 +1,28 @@ +# Lazy / eager API + +`Polars` supports two modes of operation: lazy and eager. In the eager API the query is executed immediately while in the lazy API the query is only evaluated once it is 'needed'. Deferring the execution to the last minute can have significant performance advantages that is why the Lazy API is preferred in most cases. Let us demonstrate this with an example: + +{{code_block('user-guide/concepts/lazy-vs-eager','eager',['read_csv'])}} + +In this example we use the eager API to: + +1. Read the iris [dataset](https://archive.ics.uci.edu/ml/datasets/iris). +1. Filter the dataset based on sepal length +1. Calculate the mean of the sepal width per species + +Every step is executed immediately returning the intermediate results. This can be very wasteful as we might do work or load extra data that is not being used. If we instead used the lazy API and waited on execution until all the steps are defined then the query planner could perform various optimizations. In this case: + +- Predicate pushdown: Apply filters as early as possible while reading the dataset, thus only reading rows with sepal length greater than 5. +- Projection pushdown: Select only the columns that are needed while reading the dataset, thus removing the need to load additional columns (e.g. petal length & petal width) + +{{code_block('user-guide/concepts/lazy-vs-eager','lazy',['scan_csv'])}} + +These will significantly lower the load on memory & CPU thus allowing you to fit bigger datasets in memory and process faster. Once the query is defined you call `collect` to inform `Polars` that you want to execute it. In the section on Lazy API we will go into more details on its implementation. + +!!! info "Eager API" + + In many cases the eager API is actually calling the lazy API under the hood and immediately collecting the result. This has the benefit that within the query itself optimization(s) made by the query planner can still take place. + +### When to use which + +In general the lazy API should be preferred unless you are either interested in the intermediate results or are doing exploratory work and don't know yet what your query is going to look like. diff --git a/docs/user-guide/concepts/streaming.md b/docs/user-guide/concepts/streaming.md new file mode 100644 index 000000000000..e52e28bf2cfe --- /dev/null +++ b/docs/user-guide/concepts/streaming.md @@ -0,0 +1,21 @@ +# Streaming API + +One additional benefit of the lazy API is that it allows queries to be executed in a streaming manner. Instead of processing the data all-at-once `Polars` can execute the query in batches allowing you to process datasets that are larger-than-memory. + +To tell Polars we want to execute a query in streaming mode we pass the `streaming=True` argument to `collect` + +{{code_block('user-guide/concepts/streaming','streaming',['collect'])}} + +## When is streaming available? + +Streaming is still in development. We can ask Polars to execute any lazy query in streaming mode. However, not all lazy operations support streaming. 
If there is an operation for which streaming is not supported, Polars will run the query in non-streaming mode.
+
+Streaming is supported for many operations including:
+
+- `filter`,`slice`,`head`,`tail`
+- `with_columns`,`select`
+- `group_by`
+- `join`
+- `sort`
+- `explode`,`melt`
+- `scan_csv`,`scan_parquet`,`scan_ipc`
diff --git a/docs/user-guide/expressions/aggregation.md b/docs/user-guide/expressions/aggregation.md
new file mode 100644
index 000000000000..6b5fb8bcaf48
--- /dev/null
+++ b/docs/user-guide/expressions/aggregation.md
@@ -0,0 +1,122 @@
+# Aggregation
+
+`Polars` implements a powerful syntax defined not only in its lazy API, but also in its eager API. Let's take a look at what that means.
+
+We can start with the simple [US congress `dataset`](https://github.com/unitedstates/congress-legislators).
+
+{{code_block('user-guide/expressions/aggregation','dataframe',['DataFrame','Categorical'])}}
+
+#### Basic aggregations
+
+You can easily combine different aggregations by adding multiple expressions in a
+`list`. There is no upper bound on the number of aggregations you can do, and you can
+make any combination you want. In the snippet below we do the following aggregations:
+
+Per GROUP `"first_name"` we
+
+- count the number of rows in the group:
+  - short form: `pl.count("party")`
+  - full form: `pl.col("party").count()`
+- aggregate the gender values into a list per group:
+  - full form: `pl.col("gender")`
+- get the first value of column `"last_name"` in the group:
+  - short form: `pl.first("last_name")` (not available in Rust)
+  - full form: `pl.col("last_name").first()`
+
+Besides the aggregation, we immediately sort the result and limit to the top `5` so that
+we have a nice summary overview.
+
+{{code_block('user-guide/expressions/aggregation','basic',['group_by'])}}
+
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:setup"
+--8<-- "python/user-guide/expressions/aggregation.py:dataframe"
+--8<-- "python/user-guide/expressions/aggregation.py:basic"
+```
+
+#### Conditionals
+
+It's that easy! Let's turn it up a notch. Let's say we want to know how
+many delegates of a "state" are "Pro" or "Anti" administration. We could directly query
+that in the aggregation without the need for a `lambda` or grooming the `DataFrame`.
+
+{{code_block('user-guide/expressions/aggregation','conditional',['group_by'])}}
+
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:conditional"
+```
+
+Similarly, this could also be done with a nested GROUP BY, but that doesn't help show off some of these nice features. 😉
+
+{{code_block('user-guide/expressions/aggregation','nested',['group_by'])}}
+
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:nested"
+```
+
+#### Filtering
+
+We can also filter the groups. Let's say we want to compute a mean per group, but we
+don't want to include all values from that group, and we also don't want to filter the
+rows from the `DataFrame` (because we need those rows for another aggregation).
+
+In the example below we show how this can be done.
+
+!!! note
+
+    Note that we can make `Python` functions for clarity. These functions don't cost us anything. That is because we only create `Polars` expressions; we don't apply a custom function over a `Series` during runtime of the query. Of course, you can make functions that return expressions in Rust, too.
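+
+    As a minimal sketch of such a helper (the function and column names below are hypothetical, not taken from the dataset used on this page), a Python function can simply assemble and return an expression:
+
+    ```python
+    import polars as pl
+
+    def avg_where(column: str, predicate: pl.Expr) -> pl.Expr:
+        # Only builds an expression; nothing is executed here.
+        return pl.col(column).filter(predicate).mean()
+    ```
+
+    Calling such a helper inside `group_by(...).agg(...)` adds no per-row Python overhead, because the Python code runs only once to construct the expression.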
+
+{{code_block('user-guide/expressions/aggregation','filter',['group_by'])}}
+
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:filter"
+```
+
+#### Sorting
+
+It's common to see a `DataFrame` being sorted for the sole purpose of managing the ordering during a GROUP BY operation. Let's say that we want to get the names of the oldest and youngest politicians per state. We could SORT and GROUP BY.
+
+{{code_block('user-guide/expressions/aggregation','sort',['group_by'])}}
+
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:sort"
+```
+
+However, **if** we also want to sort the names alphabetically, this breaks. Luckily we can sort in a `group_by` context separate from the `DataFrame`.
+
+{{code_block('user-guide/expressions/aggregation','sort2',['group_by'])}}
+
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:sort2"
+```
+
+We can even sort by another column in the `group_by` context. If we want to know if the alphabetically sorted name is male or female, we could add: `pl.col("gender").sort_by("first_name").first().alias("gender")`
+
+{{code_block('user-guide/expressions/aggregation','sort3',['group_by'])}}
+
+```python exec="on" result="text" session="user-guide/expressions"
+--8<-- "python/user-guide/expressions/aggregation.py:sort3"
+```
+
+### Do not kill parallelization
+
+!!! warning "Python Users Only"
+
+    The following section is specific to `Python`, and doesn't apply to `Rust`. Within `Rust`, blocks and closures (lambdas) can, and will, be executed concurrently.
+
+We have all heard that `Python` is slow, and does "not scale." Besides the overhead of
+running "slow" bytecode, `Python` has to remain within the constraints of the Global
+Interpreter Lock (GIL). This means that if you were to use a `lambda` or a custom `Python`
+function to apply during a parallelized phase, `Polars`' speed is capped while running the `Python`
+code, because the GIL prevents multiple threads from executing the function simultaneously.
+
+This all feels terribly limiting, especially because we often need those `lambda` functions in a
+`.group_by()` step, for example. This approach is still supported by `Polars`, but
+keep in mind that the bytecode **and** the GIL costs have to be paid. It is recommended to try to solve your queries using the expression syntax before moving to `lambdas`. If you want to learn more about using `lambdas`, go to the [user defined functions section](./user-defined-functions.md).
+
+### Conclusion
+
+In the examples above we've seen that we can do a lot by combining expressions. By doing so we delay the use of custom `Python` functions that slow down the queries (due to the slow nature of Python AND the GIL).
+
+If we are missing a type of expression, let us know by opening a
+[feature request](https://github.com/pola-rs/polars/issues/new/choose)!
diff --git a/docs/user-guide/expressions/casting.md b/docs/user-guide/expressions/casting.md
new file mode 100644
index 000000000000..cb06699fa2ed
--- /dev/null
+++ b/docs/user-guide/expressions/casting.md
@@ -0,0 +1,100 @@
+# Casting
+
+Casting converts the underlying [`DataType`](../concepts/data-types.md) of a column to a new one. Polars uses Arrow to manage the data in memory and relies on the compute kernels in the [rust implementation](https://github.com/jorgecarleitao/arrow2) to do the conversion. Casting is available with the `cast()` method.
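+
+As a minimal sketch (the frame and column below are made up for illustration), a cast is just another expression passed to a context such as `with_columns`:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"value": [1, 2, 3]})
+# Cast the integer column to 64-bit floats.
+df = df.with_columns(pl.col("value").cast(pl.Float64))
+```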
+
+The `cast` method includes a `strict` parameter that determines how Polars behaves when it encounters a value that can't be converted from the source `DataType` to the target `DataType`. By default, `strict=True`, which means that Polars will throw an error to notify the user of the failed conversion and provide details on the values that couldn't be cast. On the other hand, if `strict=False`, any values that can't be converted to the target `DataType` will be quietly converted to `null`.
+
+## Numerics
+
+Let's take a look at the following `DataFrame`, which contains both integers and floating point numbers.
+
+{{code_block('user-guide/expressions/casting','dfnum',['DataFrame'])}}
+
+```python exec="on" result="text" session="user-guide/cast"
+--8<-- "python/user-guide/expressions/casting.py:setup"
+--8<-- "python/user-guide/expressions/casting.py:dfnum"
+```
+
+To perform casting operations between floats and integers, or vice versa, we can invoke the `cast()` function.
+
+{{code_block('user-guide/expressions/casting','castnum',['cast'])}}
+
+```python exec="on" result="text" session="user-guide/cast"
+--8<-- "python/user-guide/expressions/casting.py:castnum"
+```
+
+Note that in the case of decimal values these are truncated when casting to an integer (the fractional part is discarded).
+
+#### Downcast
+
+Reducing the memory footprint is also achievable by modifying the number of bits allocated to an element. As an illustration, the code below demonstrates how casting from `Int64` to `Int16` and from `Float64` to `Float32` can be used to lower memory usage.
+
+{{code_block('user-guide/expressions/casting','downcast',['cast'])}}
+
+```python exec="on" result="text" session="user-guide/cast"
+--8<-- "python/user-guide/expressions/casting.py:downcast"
+```
+
+#### Overflow
+
+When performing downcasting, it is crucial to ensure that the chosen number of bits (such as 64, 32, or 16) is sufficient to accommodate the largest and smallest numbers in the column. For example, using a 32-bit signed integer (`Int32`) allows handling integers within the range of -2147483648 to +2147483647, while using `Int8` covers integers between -128 and 127. Attempting to cast to a `DataType` that is too small will result in a `ComputeError` thrown by Polars, as the operation is not supported.
+
+{{code_block('user-guide/expressions/casting','overflow',['cast'])}}
+
+```python exec="on" result="text" session="user-guide/cast"
+--8<-- "python/user-guide/expressions/casting.py:overflow"
+```
+
+You can set the `strict` parameter to `False`; this converts values that are overflowing to null values.
+
+{{code_block('user-guide/expressions/casting','overflow2',['cast'])}}
+
+```python exec="on" result="text" session="user-guide/cast"
+--8<-- "python/user-guide/expressions/casting.py:overflow2"
+```
+
+## Strings
+
+Strings can be cast to numerical data types and vice versa:
+
+{{code_block('user-guide/expressions/casting','strings',['cast'])}}
+
+```python exec="on" result="text" session="user-guide/cast"
+--8<-- "python/user-guide/expressions/casting.py:strings"
+```
+
+In case the column contains a non-numerical value, Polars will throw a `ComputeError` detailing the conversion error. Setting `strict=False` will convert the non-float value to `null`.
+
+{{code_block('user-guide/expressions/casting','strings2',['cast'])}}
+
+```python exec="on" result="text" session="user-guide/cast"
+--8<-- "python/user-guide/expressions/casting.py:strings2"
+```
+
+## Booleans
+
+Booleans can be expressed as either 1 (`True`) or 0 (`False`).
It's possible to perform casting operations between a numerical `DataType` and a boolean, and vice versa. However, keep in mind that casting from a string (`Utf8`) to a boolean is not permitted. + +{{code_block('user-guide/expressions/casting','bool',['cast'])}} + +```python exec="on" result="text" session="user-guide/cast" +--8<-- "python/user-guide/expressions/casting.py:bool" +``` + +## Dates + +Temporal data types such as `Date` or `Datetime` are represented as the number of days (`Date`) and microseconds (`Datetime`) since epoch. Therefore, casting between the numerical types and the temporal data types is allowed. + +{{code_block('user-guide/expressions/casting','dates',['cast'])}} + +```python exec="on" result="text" session="user-guide/cast" +--8<-- "python/user-guide/expressions/casting.py:dates" +``` + +To perform casting operations between strings and `Dates`/`Datetimes`, `strftime` and `strptime` are utilized. Polars adopts the [chrono format syntax](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) for when formatting. It's worth noting that `strptime` features additional options that support timezone functionality. Refer to the API documentation for further information. + +{{code_block('user-guide/expressions/casting','dates2',['strftime','strptime'])}} + +```python exec="on" result="text" session="user-guide/cast" +--8<-- "python/user-guide/expressions/casting.py:dates2" +``` diff --git a/docs/user-guide/expressions/column-selections.md b/docs/user-guide/expressions/column-selections.md new file mode 100644 index 000000000000..0f6b1a82f018 --- /dev/null +++ b/docs/user-guide/expressions/column-selections.md @@ -0,0 +1,134 @@ +# Column selections + +Let's create a dataset to use in this section: + +{{code_block('user-guide/expressions/column-selections','selectors_df',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:setup" +--8<-- "python/user-guide/expressions/column-selections.py:selectors_df" +``` + +## Expression expansion + +As we've seen in the previous section, we can select specific columns using the `pl.col` method. It can also select multiple columns - both as a means of convenience, and to _expand_ the expression. + +This kind of convenience feature isn't just decorative or syntactic sugar. It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: a single expression that specifies multiple columns expands into a list of expressions (depending on the DataFrame schema), resulting in being able to select multiple columns + run computation on them! + +### Select all, or all but some + +We can select all columns in the `DataFrame` object by providing the argument `*`: + +{{code_block('user-guide/expressions/column-selections', 'all',['all'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:all" +``` + +Often, we don't just want to include all columns, but include all _while_ excluding a few. 
This can be done easily as well: + +{{code_block('user-guide/expressions/column-selections','exclude',['exclude'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:exclude" +``` + +### By multiple strings + +Specifying multiple strings allows expressions to _expand_ to all matching columns: + +{{code_block('user-guide/expressions/column-selections','expansion_by_names',['dt_to_string'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:expansion_by_names" +``` + +### By regular expressions + +Multiple column selection is possible by regular expressions also, by making sure to wrap the regex by `^` and `$` to let `pl.col` know that a regex selection is expected: + +{{code_block('user-guide/expressions/column-selections','expansion_by_regex',[''])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:expansion_by_regex" +``` + +### By data type + +`pl.col` can select multiple columns using Polars data types: + +{{code_block('user-guide/expressions/column-selections','expansion_by_dtype',['n_unique'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:expansion_by_dtype" +``` + +## Using `selectors` + +Polars also allows for the use of intuitive selections for columns based on their name, `dtype` or other properties; and this is built on top of existing functionality outlined in `col` used above. It is recommended to use them by importing and aliasing `polars.selectors` as `cs`. + +### By `dtype` + +To select just the integer and string columns, we can do: + +{{code_block('user-guide/expressions/column-selections','selectors_intro',['selectors'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:selectors_intro" +``` + +### Applying set operations + +These _selectors_ also allow for set based selection operations. For instance, to select the **numeric** columns **except** the **first** column that indicates row numbers: + +{{code_block('user-guide/expressions/column-selections','selectors_diff',['cs_first', 'cs_numeric'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:selectors_diff" +``` + +We can also select the row number by name **and** any **non**-numeric columns: + +{{code_block('user-guide/expressions/column-selections','selectors_union',['cs_by_name', 'cs_numeric'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:selectors_union" +``` + +### By patterns and substrings + +_Selectors_ can also be matched by substring and regex patterns: + +{{code_block('user-guide/expressions/column-selections','selectors_by_name',['cs_contains', 'cs_matches'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:selectors_by_name" +``` + +### Converting to expressions + +What if we want to apply a specific operation on the selected columns (i.e. get back to representing them as **expressions** to operate upon)? 
We can simply convert them using `as_expr` and then proceed as normal: + +{{code_block('user-guide/expressions/column-selections','selectors_to_expr',['cs_temporal'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:selectors_to_expr" +``` + +### Debugging `selectors` + +Polars also provides two helpful utility functions to aid with using selectors: `is_selector` and `selector_column_names`: + +{{code_block('user-guide/expressions/column-selections','selectors_is_selector_utility',['is_selector'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:selectors_is_selector_utility" +``` + +To predetermine the column names that are selected, which is especially useful for a LazyFrame object: + +{{code_block('user-guide/expressions/column-selections','selectors_colnames_utility',['selector_column_names'])}} + +```python exec="on" result="text" session="user-guide/column-selections" +--8<-- "python/user-guide/expressions/column-selections.py:selectors_colnames_utility" +``` diff --git a/docs/user-guide/expressions/folds.md b/docs/user-guide/expressions/folds.md new file mode 100644 index 000000000000..2339f8f114e5 --- /dev/null +++ b/docs/user-guide/expressions/folds.md @@ -0,0 +1,43 @@ +# Folds + +`Polars` provides expressions/methods for horizontal aggregations like `sum`,`min`, `mean`, +etc. However, when you need a more complex aggregation the default methods `Polars` supplies may not be sufficient. That's when `folds` come in handy. + +The `fold` expression operates on columns for maximum speed. It utilizes the data layout very efficiently and often has vectorized execution. + +### Manual sum + +Let's start with an example by implementing the `sum` operation ourselves, with a `fold`. + +{{code_block('user-guide/expressions/folds','mansum',['fold'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:setup" +--8<-- "python/user-guide/expressions/folds.py:mansum" +``` + +The snippet above recursively applies the function `f(acc, x) -> acc` to an accumulator `acc` and a new column `x`. The function operates on columns individually and can take advantage of cache efficiency and vectorization. + +### Conditional + +In the case where you'd want to apply a condition/predicate on all columns in a `DataFrame` a `fold` operation can be a very concise way to express this. + +{{code_block('user-guide/expressions/folds','conditional',['fold'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:conditional" +``` + +In the snippet we filter all rows where **each** column value is `> 1`. + +### Folds and string data + +Folds could be used to concatenate string data. However, due to the materialization of intermediate columns, this operation will have squared complexity. + +Therefore, we recommend using the `concat_str` expression for this. 
+ +{{code_block('user-guide/expressions/folds','string',['concat_str'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:string" +``` diff --git a/docs/user-guide/expressions/functions.md b/docs/user-guide/expressions/functions.md new file mode 100644 index 000000000000..1dfe08e771d2 --- /dev/null +++ b/docs/user-guide/expressions/functions.md @@ -0,0 +1,71 @@ +# Functions + +`Polars` expressions have a large number of built in functions. These allow you to create complex queries without the need for [user defined functions](user-defined-functions.md). There are too many to go through here, but we will cover some of the more popular use cases. If you want to view all the functions go to the API Reference for your programming language. + +In the examples below we will use the following `DataFrame`: + +{{code_block('user-guide/expressions/functions','dataframe',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/functions" +--8<-- "python/user-guide/expressions/functions.py:setup" +--8<-- "python/user-guide/expressions/functions.py:dataframe" +``` + +## Column naming + +By default if you perform an expression it will keep the same name as the original column. In the example below we perform an expression on the `nrs` column. Note that the output `DataFrame` still has the same name. + +{{code_block('user-guide/expressions/functions','samename',[])}} + +=== ":fontawesome-brands-python: Python" + +```python +--8<-- "python/user-guide/expressions/functions.py:samename" +``` + +```python exec="on" result="text" session="user-guide/functions" +--8<-- "python/user-guide/expressions/functions.py:samename" +``` + +This might get problematic in the case you use the same column multiple times in your expression as the output columns will get duplicated. For example, the following query will fail. + +{{code_block('user-guide/expressions/functions','samenametwice',[])}} + +```python exec="on" result="text" session="user-guide/functions" +--8<-- "python/user-guide/expressions/functions.py:samenametwice" +``` + +You can change the output name of an expression by using the `alias` function + +{{code_block('user-guide/expressions/functions','samenamealias',['alias'])}} + +```python exec="on" result="text" session="user-guide/functions" +--8<-- "python/user-guide/expressions/functions.py:samenamealias" +``` + +In case of multiple columns for example when using `all()` or `col(*)` you can apply a mapping function `map_alias` to change the original column name into something else. In case you want to add a suffix (`suffix()`) or prefix (`prefix()`) these are also built in. + +=== ":fontawesome-brands-python: Python" +[:material-api: `prefix`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.prefix.html) +[:material-api: `suffix`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.suffix.html) +[:material-api: `map_alias`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_alias.html) + +## Count unique values + +There are two ways to count unique values in `Polars`: an exact methodology and an approximation. The approximation uses the [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) algorithm to approximate the cardinality and is especially useful for very large datasets where an approximation is good enough. 
+ +{{code_block('user-guide/expressions/functions','countunique',['n_unique','approx_n_unique'])}} + +```python exec="on" result="text" session="user-guide/functions" +--8<-- "python/user-guide/expressions/functions.py:countunique" +``` + +## Conditionals + +`Polars` supports if-else like conditions in expressions with the `when`, `then`, `otherwise` syntax. The predicate is placed in the `when` clause and when this evaluates to `true` the `then` expression is applied otherwise the `otherwise` expression is applied (row-wise). + +{{code_block('user-guide/expressions/functions','conditional',['when'])}} + +```python exec="on" result="text" session="user-guide/functions" +--8<-- "python/user-guide/expressions/functions.py:conditional" +``` diff --git a/docs/user-guide/expressions/lists.md b/docs/user-guide/expressions/lists.md new file mode 100644 index 000000000000..b7c508f11b90 --- /dev/null +++ b/docs/user-guide/expressions/lists.md @@ -0,0 +1,119 @@ +# Lists and Arrays + +`Polars` has first-class support for `List` columns: that is, columns where each row is a list of homogeneous elements, of varying lengths. `Polars` also has an `Array` datatype, which is analogous to `numpy`'s `ndarray` objects, where the length is identical across rows. + +Note: this is different from Python's `list` object, where the elements can be of any type. Polars can store these within columns, but as a generic `Object` datatype that doesn't have the special list manipulation features that we're about to discuss. + +## Powerful `List` manipulation + +Let's say we had the following data from different weather stations across a state. When the weather station is unable to get a result, an error code is recorded instead of the actual temperature at that time. + +{{code_block('user-guide/expressions/lists','weather_df',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/lists" +--8<-- "python/user-guide/expressions/lists.py:setup" +--8<-- "python/user-guide/expressions/lists.py:weather_df" +``` + +### Creating a `List` column + +For the `weather` `DataFrame` created above, it's very likely we need to run some analysis on the temperatures that are captured by each station. To make this happen, we need to first be able to get individual temperature measurements. This is done by: + +{{code_block('user-guide/expressions/lists','string_to_list',['str.split'])}} + +```python exec="on" result="text" session="user-guide/lists" +--8<-- "python/user-guide/expressions/lists.py:string_to_list" +``` + +One way we could go post this would be to convert each temperature measurement into its own row: + +{{code_block('user-guide/expressions/lists','explode_to_atomic',['DataFrame.explode'])}} + +```python exec="on" result="text" session="user-guide/lists" +--8<-- "python/user-guide/expressions/lists.py:explode_to_atomic" +``` + +However, in Polars, we often do not need to do this to operate on the `List` elements. + +### Operating on `List` columns + +Polars provides several standard operations on `List` columns. If we want the first three measurements, we can do a `head(3)`. The last three can be obtained via a `tail(3)`, or alternately, via `slice` (negative indexing is supported). We can also identify the number of observations via `lengths`. Let's see them in action: + +{{code_block('user-guide/expressions/lists','list_ops',['Expr.List'])}} + +```python exec="on" result="text" session="user-guide/lists" +--8<-- "python/user-guide/expressions/lists.py:list_ops" +``` + +!!! 
warning "`arr` then, `list` now"
+
+    If you find references to the `arr` API on Stackoverflow or other sources, just replace `arr` with `list`; this was the old accessor for the `List` datatype. `arr` now refers to the newly introduced `Array` datatype (see below).
+
+### Element-wise computation within `List`s
+
+If we need to identify the stations that are giving the most errors from the starting `DataFrame`, we need to:
+
+1. Parse the string input as a `List` of string values (already done).
+2. Identify those strings that can be converted to numbers.
+3. Identify the number of non-numeric values (i.e. `null` values) in the list, by row.
+4. Rename this output as `errors` so that we can easily identify the stations.
+
+The third step requires a casting (or alternatively, a regex pattern search) operation to be performed on each element of the list. We can do this by applying the operation to each element: first reference the elements with `pl.element()`, and then call a suitable Polars expression on them. Let's see how:
+
+{{code_block('user-guide/expressions/lists','count_errors',['Expr.List', 'element'])}}
+
+```python exec="on" result="text" session="user-guide/lists"
+--8<-- "python/user-guide/expressions/lists.py:count_errors"
+```
+
+What if we chose the regex route (i.e. recognizing the presence of _any_ alphabetical character)?
+
+{{code_block('user-guide/expressions/lists','count_errors_regex',['str.contains'])}}
+
+```python exec="on" result="text" session="user-guide/lists"
+--8<-- "python/user-guide/expressions/lists.py:count_errors_regex"
+```
+
+If you're unfamiliar with the `(?i)`, it's a good time to look at the documentation for the `str.contains` function in Polars! The Rust regex crate provides a lot of additional regex flags that might come in handy.
+
+## Row-wise computations
+
+This context is ideal for computing in row orientation.
+
+We can apply **any** Polars operations on the elements of the list with the `list.eval` (`list().eval` in Rust) expression! These expressions run entirely on Polars' query engine and can run in parallel, so will be well optimized. Let's say we have another set of weather data across three days, for different stations:
+
+{{code_block('user-guide/expressions/lists','weather_by_day',['DataFrame'])}}
+
+```python exec="on" result="text" session="user-guide/lists"
+--8<-- "python/user-guide/expressions/lists.py:weather_by_day"
+```
+
+Let's do something interesting, where we calculate the percentage rank of the temperatures by day, measured across stations. Pandas allows you to compute the percentages of the `rank` values. `Polars` doesn't provide a special function to do this directly, but because expressions are so versatile we can create our own percentage rank expression for the highest temperature. Let's try that!
+
+{{code_block('user-guide/expressions/lists','weather_by_day_rank',['list.eval'])}}
+
+```python exec="on" result="text" session="user-guide/lists"
+--8<-- "python/user-guide/expressions/lists.py:weather_by_day_rank"
+```
+
+## Polars `Array`s
+
+`Array`s are a new data type that was recently introduced, and are still pretty nascent in the features they offer. The major difference between a `List` and an `Array` is that the latter is limited to having the same number of elements per row, while a `List` can have a variable number of elements. Both still require that each element's data type is the same.
+ +We can define `Array` columns in this manner: + +{{code_block('user-guide/expressions/lists','array_df',['Array'])}} + +```python exec="on" result="text" session="user-guide/lists" +--8<-- "python/user-guide/expressions/lists.py:array_df" +``` + +Basic operations are available on it: + +{{code_block('user-guide/expressions/lists','array_ops',['arr'])}} + +```python exec="on" result="text" session="user-guide/lists" +--8<-- "python/user-guide/expressions/lists.py:array_ops" +``` + +Polars `Array`s are still being actively developed, so this section will likely change in the future. diff --git a/docs/user-guide/expressions/null.md b/docs/user-guide/expressions/null.md new file mode 100644 index 000000000000..5ded317ac2b5 --- /dev/null +++ b/docs/user-guide/expressions/null.md @@ -0,0 +1,140 @@ +# Missing data + +This page sets out how missing data is represented in `Polars` and how missing data can be filled. + +## `null` and `NaN` values + +Each column in a `DataFrame` (or equivalently a `Series`) is an Arrow array or a collection of Arrow arrays [based on the Apache Arrow format](https://arrow.apache.org/docs/format/Columnar.html#null-count). Missing data is represented in Arrow and `Polars` with a `null` value. This `null` missing value applies for all data types including numerical values. + +`Polars` also allows `NotaNumber` or `NaN` values for float columns. These `NaN` values are considered to be a type of floating point data rather than missing data. We discuss `NaN` values separately below. + +You can manually define a missing value with the python `None` value: + +{{code_block('user-guide/expressions/null','dataframe',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:setup" +--8<-- "python/user-guide/expressions/null.py:dataframe" +``` + +!!! info + + In `Pandas` the value for missing data depends on the dtype of the column. In `Polars` missing data is always represented as a `null` value. + +## Missing data metadata + +Each Arrow array used by `Polars` stores two kinds of metadata related to missing data. This metadata allows `Polars` to quickly show how many missing values there are and which values are missing. + +The first piece of metadata is the `null_count` - this is the number of rows with `null` values in the column: + +{{code_block('user-guide/expressions/null','count',['null_count'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:count" +``` + +The `null_count` method can be called on a `DataFrame`, a column from a `DataFrame` or a `Series`. The `null_count` method is a cheap operation as `null_count` is already calculated for the underlying Arrow array. + +The second piece of metadata is an array called a _validity bitmap_ that indicates whether each data value is valid or missing. +The validity bitmap is memory efficient as it is bit encoded - each value is either a 0 or a 1. This bit encoding means the memory overhead per array is only (array length / 8) bytes. The validity bitmap is used by the `is_null` method in `Polars`. 
+ +You can return a `Series` based on the validity bitmap for a column in a `DataFrame` or a `Series` with the `is_null` method: + +{{code_block('user-guide/expressions/null','isnull',['is_null'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:isnull" +``` + +The `is_null` method is a cheap operation that does not require scanning the full column for `null` values. This is because the validity bitmap already exists and can be returned as a Boolean array. + +## Filling missing data + +Missing data in a `Series` can be filled with the `fill_null` method. You have to specify how you want the `fill_null` method to fill the missing data. The main ways to do this are filling with: + +- a literal such as 0 or "0" +- a strategy such as filling forwards +- an expression such as replacing with values from another column +- interpolation + +We illustrate each way to fill nulls by defining a simple `DataFrame` with a missing value in `col2`: + +{{code_block('user-guide/expressions/null','dataframe2',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:dataframe2" +``` + +### Fill with specified literal value + +We can fill the missing data with a specified literal value with `pl.lit`: + +{{code_block('user-guide/expressions/null','fill',['fill_null'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:fill" +``` + +### Fill with a strategy + +We can fill the missing data with a strategy such as filling forward: + +{{code_block('user-guide/expressions/null','fillstrategy',['fill_null'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:fillstrategy" +``` + +You can find other fill strategies in the API docs. + +### Fill with an expression + +For more flexibility we can fill the missing data with an expression. For example, +to fill nulls with the median value from that column: + +{{code_block('user-guide/expressions/null','fillexpr',['fill_null'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:fillexpr" +``` + +In this case the column is cast from integer to float because the median is a float statistic. + +### Fill with interpolation + +In addition, we can fill nulls with interpolation (without using the `fill_null` function): + +{{code_block('user-guide/expressions/null','fillinterpolate',['interpolate'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:fillinterpolate" +``` + +## `NotaNumber` or `NaN` values + +Missing data in a `Series` has a `null` value. However, you can use `NotaNumber` or `NaN` values in columns with float datatypes. These `NaN` values can be created from Numpy's `np.nan` or the native python `float('nan')`: + +{{code_block('user-guide/expressions/null','nan',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/null" +--8<-- "python/user-guide/expressions/null.py:nan" +``` + +!!! info + + In `Pandas` by default a `NaN` value in an integer column causes the column to be cast to float. This does not happen in `Polars` - instead an exception is raised. + +`NaN` values are considered to be a type of floating point data and are **not considered to be missing data** in `Polars`. 
This means:
+
+- `NaN` values are **not** counted with the `null_count` method
+- `NaN` values are filled when you use the `fill_nan` method but are **not** filled with the `fill_null` method
+
+`Polars` has `is_nan` and `fill_nan` methods which work in a similar way to the `is_null` and `fill_null` methods. The underlying Arrow arrays do not have a pre-computed validity bitmask for `NaN` values so this has to be computed for the `is_nan` method.
+
+One further difference between `null` and `NaN` values is that taking the `mean` of a column with `null` values excludes the `null` values from the calculation, but with `NaN` values taking the mean results in a `NaN`. This behaviour can be avoided by replacing the `NaN` values with `null` values:
+
+{{code_block('user-guide/expressions/null','nanfill',['fill_nan'])}}
+
+```python exec="on" result="text" session="user-guide/null"
+--8<-- "python/user-guide/expressions/null.py:nanfill"
+```
diff --git a/docs/user-guide/expressions/numpy.md b/docs/user-guide/expressions/numpy.md
new file mode 100644
index 000000000000..6449ffd634bf
--- /dev/null
+++ b/docs/user-guide/expressions/numpy.md
@@ -0,0 +1,22 @@
+# Numpy
+
+`Polars` expressions support `NumPy` [ufuncs](https://numpy.org/doc/stable/reference/ufuncs.html). See [here](https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs)
+for a list of all supported numpy functions.
+
+This means that if a function is not provided by `Polars`, we can use `NumPy` and we still have fast columnar operations through the `NumPy` API.
+
+### Example
+
+{{code_block('user-guide/expressions/numpy-example',api_functions=['DataFrame','np.log'])}}
+
+```python exec="on" result="text" session="user-guide/numpy"
+--8<-- "python/user-guide/expressions/numpy-example.py"
+```
+
+### Interoperability
+
+Polars `Series` have support for NumPy universal functions (ufuncs). Element-wise functions such as `np.exp()`, `np.cos()`, `np.divide()`, etc. all work with almost zero overhead.
+
+However, as a Polars-specific remark: missing values are a separate bitmask and are not visible to NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results.
+
+Convert a Polars `Series` to a NumPy array with the `.to_numpy()` method. Missing values will be replaced by `np.nan` during the conversion. If the `Series` does not include missing values, or those values are not desired anymore, the `.view()` method can be used instead, providing a zero-copy NumPy array of the data.
diff --git a/docs/user-guide/expressions/operators.md b/docs/user-guide/expressions/operators.md
new file mode 100644
index 000000000000..24cb4e6834b8
--- /dev/null
+++ b/docs/user-guide/expressions/operators.md
@@ -0,0 +1,30 @@
+# Basic operators
+
+This section describes how to use basic operators (e.g. addition, subtraction) in conjunction with Expressions. We will provide various examples using different themes in the context of the following dataframe.
+
+!!! note "Operator Overloading"
+
+    In Rust and Python it is possible to use the operators directly (as in `+ - * / < > `) as the language allows operator overloading. For instance, the operator `+` translates to the `.add()` method. You can choose the one you prefer.
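+
+    As a small sketch (the column name below is hypothetical), both spellings build the same expression:
+
+    ```python
+    import polars as pl
+
+    expr_op = pl.col("nrs") + 5  # operator form
+    expr_method = pl.col("nrs").add(5)  # method form
+    ```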
+ +{{code_block('user-guide/expressions/operators','dataframe',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/operators" +--8<-- "python/user-guide/expressions/operators.py:setup" +--8<-- "python/user-guide/expressions/operators.py:dataframe" +``` + +### Numerical + +{{code_block('user-guide/expressions/operators','numerical',['operators'])}} + +```python exec="on" result="text" session="user-guide/operators" +--8<-- "python/user-guide/expressions/operators.py:numerical" +``` + +### Logical + +{{code_block('user-guide/expressions/operators','logical',['operators'])}} + +```python exec="on" result="text" session="user-guide/operators" +--8<-- "python/user-guide/expressions/operators.py:logical" +``` diff --git a/docs/user-guide/expressions/strings.md b/docs/user-guide/expressions/strings.md new file mode 100644 index 000000000000..ccb06de30f20 --- /dev/null +++ b/docs/user-guide/expressions/strings.md @@ -0,0 +1,62 @@ +# Strings + +The following section discusses operations performed on `Utf8` strings, which are a frequently used `DataType` when working with `DataFrames`. However, processing strings can often be inefficient due to their unpredictable memory size, causing the CPU to access many random memory locations. To address this issue, Polars utilizes `Arrow` as its backend, which stores all strings in a contiguous block of memory. As a result, string traversal is cache-optimal and predictable for the CPU. + +String processing functions are available in the `str` namespace. + +##### Accessing the string namespace + +The `str` namespace can be accessed through the `.str` attribute of a column with `Utf8` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `lengths` is recommended since it is faster. + +{{code_block('user-guide/expressions/strings','df',['lengths','n_chars'])}} + +```python exec="on" result="text" session="user-guide/strings" +--8<-- "python/user-guide/expressions/strings.py:setup" +--8<-- "python/user-guide/expressions/strings.py:df" +``` + +#### String parsing + +`Polars` offers multiple methods for checking and parsing elements of a string. Firstly, we can use the `contains` method to check whether a given pattern exists within a substring. Subsequently, we can extract these patterns and replace them using other methods, which will be demonstrated in upcoming examples. + +##### Check for existence of a pattern + +To check for the presence of a pattern within a string, we can use the contains method. The `contains` method accepts either a regular substring or a regex pattern, depending on the value of the `literal` parameter. If the pattern we're searching for is a simple substring located either at the beginning or end of the string, we can alternatively use the `starts_with` and `ends_with` functions. + +{{code_block('user-guide/expressions/strings','existence',['str.contains', 'starts_with','ends_with'])}} + +```python exec="on" result="text" session="user-guide/strings" +--8<-- "python/user-guide/expressions/strings.py:existence" +``` + +##### Extract a pattern + +The `extract` method allows us to extract a pattern from a specified string. This method takes a regex pattern containing one or more capture groups, which are defined by parentheses `()` in the pattern. 
The group index indicates which capture group to output.
+
+{{code_block('user-guide/expressions/strings','extract',['extract'])}}
+
+```python exec="on" result="text" session="user-guide/strings"
+--8<-- "python/user-guide/expressions/strings.py:extract"
+```
+
+To extract all occurrences of a pattern within a string, we can use the `extract_all` method. In the example below, we extract all numbers from a string using the regex pattern `(\d+)`, which matches one or more digits. The resulting output of the `extract_all` method is a list containing all instances of the matched pattern within the string.
+
+{{code_block('user-guide/expressions/strings','extract_all',['extract_all'])}}
+
+```python exec="on" result="text" session="user-guide/strings"
+--8<-- "python/user-guide/expressions/strings.py:extract_all"
+```
+
+##### Replace a pattern
+
+We have discussed two methods for pattern matching and extraction thus far, and now we will explore how to replace a pattern within a string. Similar to `extract` and `extract_all`, Polars provides the `replace` and `replace_all` methods for this purpose. In the example below we replace one match of `abc` at the end of a word (`\b`) with `ABC`, and we replace all occurrences of `a` with `-`.
+
+{{code_block('user-guide/expressions/strings','replace',['replace','replace_all'])}}
+
+```python exec="on" result="text" session="user-guide/strings"
+--8<-- "python/user-guide/expressions/strings.py:replace"
+```
+
+#### API documentation
+
+In addition to the examples covered above, Polars offers various other string manipulation methods for tasks such as formatting, stripping, splitting, and more. To explore these additional methods, you can refer to the Polars API documentation for your chosen programming language.
diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md
new file mode 100644
index 000000000000..9973e61d4c68
--- /dev/null
+++ b/docs/user-guide/expressions/structs.md
@@ -0,0 +1,99 @@
+# The Struct datatype
+
+Polars `Struct`s are the idiomatic way of working with multiple columns. It is also a free operation, i.e. moving columns into `Struct`s does not copy any data!
+
+For this section, let's start with a `DataFrame` that captures the average rating of a few movies across some states in the U.S.:
+
+{{code_block('user-guide/expressions/structs','ratings_df',['DataFrame'])}}
+
+```python exec="on" result="text" session="user-guide/structs"
+--8<-- "python/user-guide/expressions/structs.py:setup"
+--8<-- "python/user-guide/expressions/structs.py:ratings_df"
+```
+
+## Encountering the `Struct` type
+
+A common operation that will lead to a `Struct` column is the ever so popular `value_counts` function that is commonly used in exploratory data analysis. Checking the number of times a state appears in the data can be done like so:
+
+{{code_block('user-guide/expressions/structs','state_value_counts',['value_counts'])}}
+
+```python exec="on" result="text" session="user-guide/structs"
+--8<-- "python/user-guide/expressions/structs.py:state_value_counts"
+```
+
+Quite an unexpected output, especially if you are coming from tools that do not have such a data type. We're not in peril though; to get back to a more familiar output, all we need to do is `unnest` the `Struct` column into its constituent columns:
+
+{{code_block('user-guide/expressions/structs','struct_unnest',['unnest'])}}
+
+```python exec="on" result="text" session="user-guide/structs"
+--8<-- "python/user-guide/expressions/structs.py:struct_unnest"
+```
+
+!!! note "Why `value_counts` returns a `Struct`"
note "Why `value_counts` returns a `Struct`" + + Polars expressions always have a `Fn(Series) -> Series` signature and `Struct` is thus the data type that allows us to provide multiple columns as input/ouput of an expression. In other words, all expressions have to return a `Series` object, and `Struct` allows us to stay consistent with that requirement. + +## Structs as `dict`s + +Polars will interpret a `dict` sent to the `Series` constructor as a `Struct`: + +{{code_block('user-guide/expressions/structs','series_struct',['Series'])}} + +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:series_struct" +``` + +!!! note "Constructing `Series` objects" + + Note that `Series` here was constructed with the `name` of the series in the begninng, followed by the `values`. Providing the latter first + is considered an anti-pattern in Polars, and must be avoided. + +### Extracting individual values of a `Struct` + +Let's say that we needed to obtain just the `movie` value in the `Series` that we created above. We can use the `field` method to do so: + +{{code_block('user-guide/expressions/structs','series_struct_extract',['field'])}} + +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:series_struct_extract" +``` + +### Renaming individual keys of a `Struct` + +What if we need to rename individual `field`s of a `Struct` column? We first convert the `rating_Series` object to a `DataFrame` so that we can view the changes easily, and then use the `rename_fields` method: + +{{code_block('user-guide/expressions/structs','series_struct_rename',['rename_fields'])}} + +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:series_struct_rename" +``` + +## Practical use-cases of `Struct` columns + +### Identifying duplicate rows + +Let's get back to the `ratings` data. We want to identify cases where there are duplicates at a `Movie` and `Theatre` level. This is where the `Struct` datatype shines: + +{{code_block('user-guide/expressions/structs','struct_duplicates',['is_duplicated', 'struct'])}} + +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:struct_duplicates" +``` + +We can identify the unique cases at this level also with `is_unique`! + +### Multi-column ranking + +Suppose, given that we know there are duplicates, we want to choose which rank gets a higher priority. We define _Count_ of ratings to be more important than the actual `Avg_Rating` themselves, and only use it to break a tie. We can then do: + +{{code_block('user-guide/expressions/structs','struct_ranking',['is_duplicated', 'struct'])}} + +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:struct_ranking" +``` + +That's a pretty complex set of requirements done very elegantly in Polars! + +### Using multi-column apply + +This was discussed in the previous section on _User Defined Functions_. diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md new file mode 100644 index 000000000000..dd83cb13c382 --- /dev/null +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -0,0 +1,187 @@ +# User-defined functions + +!!! warning "Not updated for Python Polars `0.19.0`" + + This section of the user guide still needs to be updated for the latest Polars release. 
+ +You should be convinced by now that Polars expressions are so powerful and flexible that there is much less need for custom Python functions +than in other libraries. + +Still, you need to have the power to be able to pass an expression's state to a third party library or apply your black box function +over data in Polars. + +For this we provide the following expressions: + +- `map` +- `apply` + +## To `map` or to `apply`. + +These functions have an important distinction in how they operate and consequently what data they will pass to the user. + +A `map` passes the `Series` backed by the `expression` as is. + +`map` follows the same rules in both the `select` and the `group_by` context, this will +mean that the `Series` represents a column in a `DataFrame`. Note that in the `group_by` context, that column is not yet +aggregated! + +Use cases for `map` are for instance passing the `Series` in an expression to a third party library. Below we show how +we could use `map` to pass an expression column to a neural network model. + +=== ":fontawesome-brands-python: Python" +[:material-api: `map`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.map.html) + +```python +df.with_columns([ + pl.col("features").map(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations") +]) +``` + +=== ":fontawesome-brands-rust: Rust" + +```rust +df.with_columns([ + col("features").map(|s| Ok(my_nn.forward(s))).alias("activations") +]) +``` + +Use cases for `map` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why. + +{{code_block('user-guide/expressions/user-defined-functions','dataframe',['map'])}} + +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:setup" +--8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe" +``` + +In the snippet above we group by the `"keys"` column. That means we have the following groups: + +```c +"a" -> [10, 7] +"b" -> [1] +``` + +If we would then apply a `shift` operation to the right, we'd expect: + +```c +"a" -> [null, 10] +"b" -> [null] +``` + +Now, let's print and see what we've got. + +```python +print(out) +``` + +``` +shape: (2, 3) +┌──────┬────────────┬──────────────────┐ +│ keys ┆ shift_map ┆ shift_expression │ +│ --- ┆ --- ┆ --- │ +│ str ┆ list[i64] ┆ list[i64] │ +╞══════╪════════════╪══════════════════╡ +│ a ┆ [null, 10] ┆ [null, 10] │ +│ b ┆ [7] ┆ [null] │ +└──────┴────────────┴──────────────────┘ +``` + +Ouch.. we clearly get the wrong results here. Group `"b"` even got a value from group `"a"` 😵. + +This went horribly wrong, because the `map` applies the function before we aggregate! So that means the whole column `[10, 7, 1`\] got shifted to `[null, 10, 7]` and was then aggregated. + +So my advice is to never use `map` in the `group_by` context unless you know you need it and know what you are doing. + +## To `apply` + +Luckily we can fix previous example with `apply`. `apply` works on the smallest logical elements for that operation. + +That is: + +- `select context` -> single elements +- `group by context` -> single groups + +So with `apply` we should be able to fix our example: + +{{code_block('user-guide/expressions/user-defined-functions','apply',['apply'])}} + +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:apply" +``` + +And observe, a valid result! 
🎉 + +## `apply` in the `select` context + +In the `select` context, the `apply` expression passes elements of the column to the python function. + +_Note that you are now running Python, this will be slow._ + +Let's go through some examples to see what to expect. We will continue with the `DataFrame` we defined at the start of +this section and show an example with the `apply` function and a counter example where we use the expression API to +achieve the same goals. + +### Adding a counter + +In this example we create a global `counter` and then add the integer `1` to the global state at every element processed. +Every iteration the result of the increment will be added to the element value. + +> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this apply is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees. + +{{code_block('user-guide/expressions/user-defined-functions','counter',['apply'])}} + +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:counter" +``` + +### Combining multiple column values + +If we want to have access to values of different columns in a single `apply` function call, we can create `struct` data +type. This data type collects those columns as fields in the `struct`. So if we'd create a struct from the columns +`"keys"` and `"values"`, we would get the following struct elements: + +```python +[ + {"keys": "a", "values": 10}, + {"keys": "a", "values": 7}, + {"keys": "b", "values": 1}, +] +``` + +In Python, those would be passed as `dict` to the calling python function and can thus be indexed by `field: str`. In rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast. + +{{code_block('user-guide/expressions/user-defined-functions','combine',['apply','struct'])}} + +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:combine" +``` + +`Structs` are covered in detail in the next section. + +### Return types? + +Custom python functions are black boxes for polars. We really don't know what kind of black arts you are doing, so we have +to infer and try our best to understand what you meant. + +As a user it helps to understand what we do to better utilize custom functions. + +The data type is automatically inferred. We do that by waiting for the first non-null value. That value will then be used +to determine the type of the `Series`. 
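+
+As a rough illustration of that inference (a minimal sketch on a throwaway dataframe invented here, using the pre-`0.19.0` `apply` API that this section describes): a function that returns Python floats produces a `Float64` column, even though the input column holds integers.
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"values": [1, 2, 3]})
+
+# The lambda returns Python floats, so the first non-null result (1.5)
+# determines the dtype of the output Series: Float64.
+out = df.select(pl.col("values").apply(lambda v: v * 1.5).alias("scaled"))
+print(out.schema)  # {'scaled': Float64}
+```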
+
+The mapping of Python types to Polars data types is as follows:
+
+- `int` -> `Int64`
+- `float` -> `Float64`
+- `bool` -> `Boolean`
+- `str` -> `Utf8`
+- `list[tp]` -> `List[tp]` (where the inner type is inferred with the same rules)
+- `dict[str, [tp]]` -> `struct`
+- `Any` -> `object` (Prevent this at all times)
+
+Rust types map as follows:
+
+- `i32` or `i64` -> `Int64`
+- `f32` or `f64` -> `Float64`
+- `bool` -> `Boolean`
+- `String` or `str` -> `Utf8`
+- `Vec` -> `List[tp]` (where the inner type is inferred with the same rules)
diff --git a/docs/user-guide/expressions/window.md b/docs/user-guide/expressions/window.md
new file mode 100644
index 000000000000..7ea426ccb1b9
--- /dev/null
+++ b/docs/user-guide/expressions/window.md
@@ -0,0 +1,91 @@
+# Window functions
+
+Window functions are expressions with superpowers. They allow you to perform aggregations on groups in the
+`select` context. Let's get a feel for what that means. First we create a dataset. The dataset loaded in the
+snippet below contains information about pokemon:
+
+{{code_block('user-guide/expressions/window','pokemon',['read_csv'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:pokemon"
+```
+
+## Group by aggregations in selection
+
+Below we show how to use window functions to group over different columns and perform an aggregation on them.
+Doing so allows us to use multiple group by operations in parallel, using a single query. The results of the aggregation
+are projected back to the original rows. Therefore, a window function will almost always lead to a `DataFrame` with the same size as the original.
+
+We will discuss later the cases where a window function can change the number of rows in a `DataFrame`.
+
+Note how we call `.over("Type 1")` and `.over(["Type 1", "Type 2"])`. Using window functions we can aggregate over different groups in a single `select` call! Note that, in Rust, the type of the argument to `over()` must be a collection, so even when you're only using one column, you must provide it in an array.
+
+The best part is, this won't cost you anything. The computed groups are cached and shared between different `window` expressions.
+
+{{code_block('user-guide/expressions/window','group_by',['over'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:group_by"
+```
+
+## Operations per group
+
+Window functions can do more than aggregation. They can also be viewed as an operation within a group. If, for instance, you
+want to `sort` the values within a `group`, you can write `col("value").sort().over("group")` and voilà! We sorted by group!
+
+Let's filter out some rows to make this more clear.
+
+{{code_block('user-guide/expressions/window','operations',['filter'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:operations"
+```
+
+Observe that the group `Water` of column `Type 1` is not contiguous. There are two rows of `Grass` in between. Also note
+that the pokemon within each group are sorted by `Speed` in `ascending` order. Unfortunately, for this example we want them sorted in
+`descending` speed order. Luckily with window functions this is easy to accomplish.
+
+{{code_block('user-guide/expressions/window','sort',['over'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:sort"
+```
+
+`Polars` keeps track of each group's location and maps the expressions to the proper row locations. This will also work over different groups in a single `select`.
+
+The power of window expressions is that you often don't need a `group_by -> explode` combination, but you can put the logic in a single expression. It also makes the API cleaner. When used properly:
+
+- `group_by` -> marks that groups are aggregated and we expect a `DataFrame` of size `n_groups`
+- `over` -> marks that we want to compute something within a group, and doesn't modify the original size of the `DataFrame` except in specific cases
+
+## Map the expression result to the DataFrame rows
+
+In cases where the expression results in multiple values per group, the window function has 3 strategies for linking the values back to the `DataFrame` rows:
+
+- `mapping_strategy = 'group_to_rows'` -> each value is assigned back to one row. The number of values returned should match the number of rows.
+
+- `mapping_strategy = 'join'` -> the values are imploded in a list, and the list is repeated on all rows. This can be memory intensive.
+
+- `mapping_strategy = 'explode'` -> the values are exploded to new rows. This operation changes the number of rows.
+
+## Window expression rules
+
+The evaluation of window expressions is as follows (assuming we apply them to a `pl.Int32` column):
+
+{{code_block('user-guide/expressions/window','rules',['over'])}}
+
+## More examples
+
+For more practice, below are some window functions for us to compute:
+
+- sort all pokemon by type
+- select the first `3` pokemon per type as `"Type 1"`
+- sort the pokemon within a type by speed in descending order and select the first `3` as `"fastest/group"`
+- sort the pokemon within a type by attack in descending order and select the first `3` as `"strongest/group"`
+- sort the pokemon within a type by name and select the first `3` as `"sorted_by_alphabet"`
+
+{{code_block('user-guide/expressions/window','examples',['over','implode'])}}
+
+```python exec="on" result="text" session="user-guide/window"
+--8<-- "python/user-guide/expressions/window.py:examples"
+```
diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md
new file mode 100644
index 000000000000..8fb27a98c743
--- /dev/null
+++ b/docs/user-guide/index.md
@@ -0,0 +1,31 @@
+# Introduction
+
+This User Guide is an introduction to the [`Polars` DataFrame library](https://github.com/pola-rs/polars). Its goal is to introduce you to `Polars` by going through examples and comparing it to other
+solutions. Some design choices are introduced here. The guide will also introduce you to optimal usage of `Polars`.
+
+Even though `Polars` is completely written in [`Rust`](https://www.rust-lang.org/) (no runtime overhead!) and uses [`Arrow`](https://arrow.apache.org/) -- the
+[native arrow2 `Rust` implementation](https://github.com/jorgecarleitao/arrow2) -- as its foundation, the examples presented in this guide will be mostly using its higher-level language
+bindings. Higher-level bindings only serve as a thin wrapper for functionality implemented in the core library.
+
+For [`Pandas`](https://pandas.pydata.org/) users, our [Python package](https://pypi.org/project/polars/) will offer the easiest way to get started with `Polars`.
+ +### Philosophy + +The goal of `Polars` is to provide a lightning fast `DataFrame` library that: + +- Utilizes all available cores on your machine. +- Optimizes queries to reduce unneeded work/memory allocations. +- Handles datasets much larger than your available RAM. +- Has an API that is consistent and predictable. +- Has a strict schema (data-types should be known before running the query). + +Polars is written in Rust which gives it C/C++ performance and allows it to fully control performance critical parts +in a query engine. + +As such `Polars` goes to great lengths to: + +- Reduce redundant copies. +- Traverse memory cache efficiently. +- Minimize contention in parallelism. +- Process data in chunks. +- Reuse memory allocations. diff --git a/docs/user-guide/installation.md b/docs/user-guide/installation.md new file mode 100644 index 000000000000..a732d9b9be04 --- /dev/null +++ b/docs/user-guide/installation.md @@ -0,0 +1,174 @@ +# Installation + +Polars is a library and installation is as simple as invoking the package manager of the corresponding programming language. + +=== ":fontawesome-brands-python: Python" + + ``` bash + pip install polars + ``` + +=== ":fontawesome-brands-rust: Rust" + + ``` shell + cargo add polars -F lazy + + # Or Cargo.toml + [dependencies] + polars = { version = "x", features = ["lazy", ...]} + ``` + +## Importing + +To use the library import it into your project + +=== ":fontawesome-brands-python: Python" + + ``` python + import polars as pl + ``` + +=== ":fontawesome-brands-rust: Rust" + + ``` rust + use polars::prelude::*; + ``` + +## Feature Flags + +By using the above command you install the core of `Polars` onto your system. However depending on your use case you might want to install the optional dependencies as well. These are made optional to minimize the footprint. The flags are different depending on the programming language. Throughout the user guide we will mention when a functionality is used that requires an additional dependency. + +### Python + +```text +# For example +pip install polars[numpy, fsspec] +``` + +| Tag | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| all | Install all optional dependencies (all of the following) | +| pandas | Install with Pandas for converting data to and from Pandas Dataframes/Series | +| numpy | Install with numpy for converting data to and from numpy arrays | +| pyarrow | Reading data formats using PyArrow | +| fsspec | Support for reading from remote file systems | +| connectorx | Support for reading from SQL databases | +| xlsx2csv | Support for reading from Excel files | +| deltalake | Support for reading from Delta Lake Tables | +| timezone | Timezone support, only needed if 1. you are on Python < 3.9 and/or 2. you are on Windows, otherwise no dependencies will be installed | + +### Rust + +```toml +# Cargo.toml +[dependencies] +polars = { version = "0.26.1", features = ["lazy", "temporal", "describe", "json", "parquet", "dtype-datetime"] } +``` + +The opt-in features are: + +- Additional data types: + - `dtype-date` + - `dtype-datetime` + - `dtype-time` + - `dtype-duration` + - `dtype-i8` + - `dtype-i16` + - `dtype-u8` + - `dtype-u16` + - `dtype-categorical` + - `dtype-struct` +- `performant` - Longer compile times more fast paths. 
+- `lazy` - Lazy API + - `lazy_regex` - Use regexes in [column selection](crate::lazy::dsl::col) + - `dot_diagram` - Create dot diagrams from lazy logical plans. +- `sql` - Pass SQL queries to polars. +- `streaming` - Be able to process datasets that are larger than RAM. +- `random` - Generate arrays with randomly sampled values +- `ndarray`- Convert from `DataFrame` to `ndarray` +- `temporal` - Conversions between [Chrono](https://docs.rs/chrono/) and Polars for temporal data types +- `timezones` - Activate timezone support. +- `strings` - Extra string utilities for `Utf8Chunked` + - `string_justify` - `zfill`, `ljust`, `rjust` + - `string_from_radix` - `parse_int` +- `object` - Support for generic ChunkedArrays called `ObjectChunked` (generic over `T`). + These are downcastable from Series through the [Any](https://doc.rust-lang.org/std/any/index.html) trait. +- Performance related: + - `nightly` - Several nightly only features such as SIMD and specialization. + - `performant` - more fast paths, slower compile times. + - `bigidx` - Activate this feature if you expect >> 2^32 rows. This has not been needed by anyone. + This allows polars to scale up way beyond that by using `u64` as an index. + Polars will be a bit slower with this feature activated as many data structures + are less cache efficient. + - `cse` - Activate common subplan elimination optimization +- IO related: + + - `serde` - Support for [serde](https://crates.io/crates/serde) serialization and deserialization. + Can be used for JSON and more serde supported serialization formats. + - `serde-lazy` - Support for [serde](https://crates.io/crates/serde) serialization and deserialization. + Can be used for JSON and more serde supported serialization formats. + + - `parquet` - Read Apache Parquet format + - `json` - JSON serialization + - `ipc` - Arrow's IPC format serialization + - `decompress` - Automatically infer compression of csvs and decompress them. + Supported compressions: + - zip + - gzip + +- `DataFrame` operations: + - `dynamic_group_by` - Group by based on a time window instead of predefined keys. + Also activates rolling window group by operations. + - `sort_multiple` - Allow sorting a `DataFrame` on multiple columns + - `rows` - Create `DataFrame` from rows and extract rows from `DataFrames`. + And activates `pivot` and `transpose` operations + - `join_asof` - Join ASOF, to join on nearest keys instead of exact equality match. + - `cross_join` - Create the cartesian product of two DataFrames. + - `semi_anti_join` - SEMI and ANTI joins. + - `group_by_list` - Allow group by operation on keys of type List. + - `row_hash` - Utility to hash DataFrame rows to UInt64Chunked + - `diagonal_concat` - Concat diagonally thereby combining different schemas. + - `horizontal_concat` - Concat horizontally and extend with null values if lengths don't match + - `dataframe_arithmetic` - Arithmetic on (Dataframe and DataFrames) and (DataFrame on Series) + - `partition_by` - Split into multiple DataFrames partitioned by groups. +- `Series`/`Expression` operations: + - `is_in` - [Check for membership in `Series`](crate::chunked_array::ops::IsIn) + - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip) + - `round_series` - round underlying float types of `Series`. + - `repeat_by` - [Repeat element in an Array N times, where N is given by another array. + - `is_first` - Check if element is first unique value. + - `is_last` - Check if element is last unique value. 
+ - `checked_arithmetic` - checked arithmetic/ returning `None` on invalid operations. + - `dot_product` - Dot/inner product on Series and Expressions. + - `concat_str` - Concat string data in linear time. + - `reinterpret` - Utility to reinterpret bits to signed/unsigned + - `take_opt_iter` - Take from a Series with `Iterator>` + - `mode` - [Return the most occurring value(s)](crate::chunked_array::ops::ChunkUnique::mode) + - `cum_agg` - cumsum, cummin, cummax aggregation. + - `rolling_window` - rolling window functions, like rolling_mean + - `interpolate` [interpolate None values](crate::chunked_array::ops::Interpolate) + - `extract_jsonpath` - [Run jsonpath queries on Utf8Chunked](https://goessner.net/articles/JsonPath/) + - `list` - List utils. + - `list_take` take sublist by multiple indices + - `rank` - Ranking algorithms. + - `moment` - kurtosis and skew statistics + - `ewma` - Exponential moving average windows + - `abs` - Get absolute values of Series + - `arange` - Range operation on Series + - `product` - Compute the product of a Series. + - `diff` - `diff` operation. + - `pct_change` - Compute change percentages. + - `unique_counts` - Count unique values in expressions. + - `log` - Logarithms for `Series`. + - `list_to_struct` - Convert `List` to `Struct` dtypes. + - `list_count` - Count elements in lists. + - `list_eval` - Apply expressions over list elements. + - `cumulative_eval` - Apply expressions over cumulatively increasing windows. + - `arg_where` - Get indices where condition holds. + - `search_sorted` - Find indices where elements should be inserted to maintain order. + - `date_offset` Add an offset to dates that take months and leap years into account. + - `trigonometry` Trigonometric functions. + - `sign` Compute the element-wise sign of a Series. + - `propagate_nans` NaN propagating min/max aggregations. +- `DataFrame` pretty printing + - `fmt` - Activate DataFrame formatting diff --git a/docs/user-guide/io/aws.md b/docs/user-guide/io/aws.md new file mode 100644 index 000000000000..e19efc74b580 --- /dev/null +++ b/docs/user-guide/io/aws.md @@ -0,0 +1,20 @@ +# AWS + +--8<-- "docs/_build/snippets/under_construction.md" + +To read from or write to an AWS bucket, additional dependencies are needed in Rust: + +=== ":fontawesome-brands-rust: Rust" + +```shell +$ cargo add aws_sdk_s3 aws_config tokio --features tokio/full +``` + +In the next few snippets we'll demonstrate interacting with a `Parquet` file +located on an AWS bucket. 
+ +## Read + +Load a `.parquet` file using: + +{{code_block('user-guide/io/aws','bucket',['from_arrow'])}} diff --git a/docs/user-guide/io/bigquery.md b/docs/user-guide/io/bigquery.md new file mode 100644 index 000000000000..21287cd448d2 --- /dev/null +++ b/docs/user-guide/io/bigquery.md @@ -0,0 +1,19 @@ +# Google BigQuery + +To read or write from GBQ, additional dependencies are needed: + +=== ":fontawesome-brands-python: Python" + +```shell +$ pip install google-cloud-bigquery +``` + +## Read + +We can load a query into a `DataFrame` like this: + +{{code_block('user-guide/io/bigquery','read',['from_arrow'])}} + +## Write + +{{code_block('user-guide/io/bigquery','write',[])}} diff --git a/docs/user-guide/io/csv.md b/docs/user-guide/io/csv.md new file mode 100644 index 000000000000..eeb209dfb34e --- /dev/null +++ b/docs/user-guide/io/csv.md @@ -0,0 +1,21 @@ +# CSV + +## Read & write + +Reading a CSV file should look familiar: + +{{code_block('user-guide/io/csv','read',['read_csv'])}} + +Writing a CSV file is similar with the `write_csv` function: + +{{code_block('user-guide/io/csv','write',['write_csv'])}} + +## Scan + +`Polars` allows you to _scan_ a CSV input. Scanning delays the actual parsing of the +file and instead returns a lazy computation holder called a `LazyFrame`. + +{{code_block('user-guide/io/csv','scan',['scan_csv'])}} + +If you want to know why this is desirable, you can read more about these `Polars` +optimizations [here](../concepts/lazy-vs-eager.md). diff --git a/docs/user-guide/io/database.md b/docs/user-guide/io/database.md new file mode 100644 index 000000000000..4444e7be799e --- /dev/null +++ b/docs/user-guide/io/database.md @@ -0,0 +1,70 @@ +# Databases + +## Read from a database + +We can read from a database with Polars using the `pl.read_database` function. To use this function you need an SQL query string and a connection string called a `connection_uri`. + +For example, the following snippet shows the general patterns for reading all columns from the `foo` table in a Postgres database: + +{{code_block('user-guide/io/database','read',['read_database_connectorx'])}} + +### Engines + +Polars doesn't manage connections and data transfer from databases by itself. Instead external libraries (known as _engines_) handle this. At present Polars can use two engines to read from databases: + +- [ConnectorX](https://github.com/sfu-db/connector-x) and +- [ADBC](https://arrow.apache.org/docs/format/ADBC.html) + +#### ConnectorX + +ConnectorX is the default engine and [supports numerous databases](https://github.com/sfu-db/connector-x#sources) including Postgres, Mysql, SQL Server and Redshift. ConnectorX is written in Rust and stores data in Arrow format to allow for zero-copy to Polars. + +To read from one of the supported databases with `ConnectorX` you need to activate the additional dependency `ConnectorX` when installing Polars or install it manually with + +```shell +$ pip install connectorx +``` + +#### ADBC + +ADBC (Arrow Database Connectivity) is an engine supported by the Apache Arrow project. ADBC aims to be both an API standard for connecting to databases and libraries implementing this standard in a range of languages. + +It is still early days for ADBC so support for different databases is still limited. At present drivers for ADBC are only available for [Postgres and SQLite](https://arrow.apache.org/adbc/0.1.0/driver/cpp/index.html). To install ADBC you need to install the driver for your database. 
For example to install the driver for SQLite you run + +```shell +$ pip install adbc-driver-sqlite +``` + +As ADBC is not the default engine you must specify the engine as an argument to `pl.read_database` + +{{code_block('user-guide/io/database','adbc',['read_database'])}} + +## Write to a database + +We can write to a database with Polars using the `pl.write_database` function. + +### Engines + +As with reading from a database above Polars uses an _engine_ to write to a database. The currently supported engines are: + +- [SQLAlchemy](https://www.sqlalchemy.org/) and +- Arrow Database Connectivity (ADBC) + +#### SQLAlchemy + +With the default engine SQLAlchemy you can write to any database supported by SQLAlchemy. To use this engine you need to install SQLAlchemy and Pandas + +```shell +$ pip install SQLAlchemy pandas +``` + +In this example, we write the `DataFrame` to a table called `records` in the database + +{{code_block('user-guide/io/database','write',['write_database'])}} + +In the SQLAlchemy approach Polars converts the `DataFrame` to a Pandas `DataFrame` backed by PyArrow and then uses SQLAlchemy methods on a Pandas `DataFrame` to write to the database. + +#### ADBC + +As with reading from a database you can also use ADBC to write to a SQLite or Posgres database. As shown above you need to install the appropriate ADBC driver for your database. +{{code_block('user-guide/io/database','write_adbc',['write_database'])}} diff --git a/docs/user-guide/io/json_file.md b/docs/user-guide/io/json_file.md new file mode 100644 index 000000000000..352904829c7b --- /dev/null +++ b/docs/user-guide/io/json_file.md @@ -0,0 +1,26 @@ +# JSON files + +## Read & write + +### JSON + +Reading a JSON file should look familiar: + +{{code_block('user-guide/io/json-file','read',['read_json'])}} + +### Newline Delimited JSON + +JSON objects that are delimited by newlines can be read into polars in a much more performant way than standard json. + +{{code_block('user-guide/io/json-file','readnd',['read_ndjson'])}} + +## Write + +{{code_block('user-guide/io/json-file','write',['write_json','write_ndjson'])}} + +## Scan + +`Polars` allows you to _scan_ a JSON input **only for newline delimited json**. Scanning delays the actual parsing of the +file and instead returns a lazy computation holder called a `LazyFrame`. + +{{code_block('user-guide/io/json-file','scan',['scan_ndjson'])}} diff --git a/docs/user-guide/io/multiple.md b/docs/user-guide/io/multiple.md new file mode 100644 index 000000000000..c5a66b03940f --- /dev/null +++ b/docs/user-guide/io/multiple.md @@ -0,0 +1,40 @@ +## Dealing with multiple files. + +Polars can deal with multiple files differently depending on your needs and memory strain. + +Let's create some files to give us some context: + +{{code_block('user-guide/io/multiple','create',['write_csv'])}} + +## Reading into a single `DataFrame` + +To read multiple files into a single `DataFrame`, we can use globbing patterns: + +{{code_block('user-guide/io/multiple','read',['read_csv'])}} + +```python exec="on" result="text" session="user-guide/io/multiple" +--8<-- "python/user-guide/io/multiple.py:create" +--8<-- "python/user-guide/io/multiple.py:read" +``` + +To see how this works we can take a look at the query plan. Below we see that all files are read separately and +concatenated into a single `DataFrame`. `Polars` will try to parallelize the reading. 
+ +{{code_block('user-guide/io/multiple','graph',['show_graph'])}} + +```python exec="on" session="user-guide/io/multiple" +--8<-- "python/user-guide/io/multiple.py:creategraph" +``` + +## Reading and processing in parallel + +If your files don't have to be in a single table you can also build a query plan for each file and execute them in parallel +on the `Polars` thread pool. + +All query plan execution is embarrassingly parallel and doesn't require any communication. + +{{code_block('user-guide/io/multiple','glob',['scan_csv'])}} + +```python exec="on" result="text" session="user-guide/io/multiple" +--8<-- "python/user-guide/io/multiple.py:glob" +``` diff --git a/docs/user-guide/io/parquet.md b/docs/user-guide/io/parquet.md new file mode 100644 index 000000000000..71a5399bb393 --- /dev/null +++ b/docs/user-guide/io/parquet.md @@ -0,0 +1,24 @@ +# Parquet + +Loading or writing [`Parquet` files](https://parquet.apache.org/) is lightning fast. +`Pandas` uses [`PyArrow`](https://arrow.apache.org/docs/python/) -`Python` bindings +exposed by `Arrow`- to load `Parquet` files into memory, but it has to copy that data into +`Pandas` memory. With `Polars` there is no extra cost due to +copying as we read `Parquet` directly into `Arrow` memory and _keep it there_. + +## Read + +{{code_block('user-guide/io/parquet','read',['read_parquet'])}} + +## Write + +{{code_block('user-guide/io/parquet','write',['write_parquet'])}} + +## Scan + +`Polars` allows you to _scan_ a `Parquet` input. Scanning delays the actual parsing of the +file and instead returns a lazy computation holder called a `LazyFrame`. + +{{code_block('user-guide/io/parquet','scan',['scan_parquet'])}} + +If you want to know why this is desirable, you can read more about those `Polars` optimizations [here](../concepts/lazy-vs-eager.md). diff --git a/docs/user-guide/lazy/execution.md b/docs/user-guide/lazy/execution.md new file mode 100644 index 000000000000..975f52a0ac4a --- /dev/null +++ b/docs/user-guide/lazy/execution.md @@ -0,0 +1,79 @@ +# Query execution + +Our example query on the Reddit dataset is: + +{{code_block('user-guide/lazy/execution','df',['scan_csv'])}} + +If we were to run the code above on the Reddit CSV the query would not be evaluated. Instead Polars takes each line of code, adds it to the internal query graph and optimizes the query graph. + +When we execute the code Polars executes the optimized query graph by default. + +### Execution on the full dataset + +We can execute our query on the full dataset by calling the `.collect` method on the query. 
+ +{{code_block('user-guide/lazy/execution','collect',['scan_csv','collect'])}} + +```text +shape: (14_029, 6) +┌─────────┬───────────────────────────┬─────────────┬────────────┬───────────────┬────────────┐ +│ id ┆ name ┆ created_utc ┆ updated_on ┆ comment_karma ┆ link_karma │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ +╞═════════╪═══════════════════════════╪═════════════╪════════════╪═══════════════╪════════════╡ +│ 6 ┆ TAOJIANLONG_JASONBROKEN ┆ 1397113510 ┆ 1536527864 ┆ 4 ┆ 0 │ +│ 17 ┆ SSAIG_JASONBROKEN ┆ 1397113544 ┆ 1536527864 ┆ 1 ┆ 0 │ +│ 19 ┆ FDBVFDSSDGFDS_JASONBROKEN ┆ 1397113552 ┆ 1536527864 ┆ 3 ┆ 0 │ +│ 37 ┆ IHATEWHOWEARE_JASONBROKEN ┆ 1397113636 ┆ 1536527864 ┆ 61 ┆ 0 │ +│ … ┆ … ┆ … ┆ … ┆ … ┆ … │ +│ 1229384 ┆ DSFOX ┆ 1163177415 ┆ 1536497412 ┆ 44411 ┆ 7917 │ +│ 1229459 ┆ NEOCARTY ┆ 1163177859 ┆ 1536533090 ┆ 40 ┆ 0 │ +│ 1229587 ┆ TEHSMA ┆ 1163178847 ┆ 1536497412 ┆ 14794 ┆ 5707 │ +│ 1229621 ┆ JEREMYLOW ┆ 1163179075 ┆ 1536497412 ┆ 411 ┆ 1063 │ +└─────────┴───────────────────────────┴─────────────┴────────────┴───────────────┴────────────┘ +``` + +Above we see that from the 10 million rows there are 14,029 rows that match our predicate. + +With the default `collect` method Polars processes all of your data as one batch. This means that all the data has to fit into your available memory at the point of peak memory usage in your query. + +!!! warning "Reusing `LazyFrame` objects" + + Remember that `LazyFrame`s are query plans i.e. a promise on computation and is not guaranteed to cache common subplans. This means that every time you reuse it in separate downstream queries after it is defined, it is computed all over again. If you define an operation on a `LazyFrame` that doesn't maintain row order (such as a `group_by`), then the order will also change every time it is run. To avoid this, use `maintain_order=True` arguments for such operations. + +### Execution on larger-than-memory data + +If your data requires more memory than you have available Polars may be able to process the data in batches using _streaming_ mode. To use streaming mode you simply pass the `streaming=True` argument to `collect` + +{{code_block('user-guide/lazy/execution','stream',['scan_csv','collect'])}} + +We look at [streaming in more detail here](streaming.md). + +### Execution on a partial dataset + +While you're writing, optimizing or checking your query on a large dataset, querying all available data may lead to a slow development process. + +You can instead execute the query with the `.fetch` method. The `.fetch` method takes a parameter `n_rows` and tries to 'fetch' that number of rows at the data source. The number of rows cannot be guaranteed, however, as the lazy API does not count how many rows there are at each stage of the query. + +Here we "fetch" 100 rows from the source file and apply the predicates. 
+ +{{code_block('user-guide/lazy/execution','partial',['scan_csv','collect','fetch'])}} + +```text +shape: (27, 6) +┌───────┬───────────────────────────┬─────────────┬────────────┬───────────────┬────────────┐ +│ id ┆ name ┆ created_utc ┆ updated_on ┆ comment_karma ┆ link_karma │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ +╞═══════╪═══════════════════════════╪═════════════╪════════════╪═══════════════╪════════════╡ +│ 6 ┆ TAOJIANLONG_JASONBROKEN ┆ 1397113510 ┆ 1536527864 ┆ 4 ┆ 0 │ +│ 17 ┆ SSAIG_JASONBROKEN ┆ 1397113544 ┆ 1536527864 ┆ 1 ┆ 0 │ +│ 19 ┆ FDBVFDSSDGFDS_JASONBROKEN ┆ 1397113552 ┆ 1536527864 ┆ 3 ┆ 0 │ +│ 37 ┆ IHATEWHOWEARE_JASONBROKEN ┆ 1397113636 ┆ 1536527864 ┆ 61 ┆ 0 │ +│ … ┆ … ┆ … ┆ … ┆ … ┆ … │ +│ 77763 ┆ LUNCHY ┆ 1137599510 ┆ 1536528275 ┆ 65 ┆ 0 │ +│ 77765 ┆ COMPOSTELLAS ┆ 1137474000 ┆ 1536528276 ┆ 6 ┆ 0 │ +│ 77766 ┆ GENERICBOB ┆ 1137474000 ┆ 1536528276 ┆ 291 ┆ 14 │ +│ 77768 ┆ TINHEADNED ┆ 1139665457 ┆ 1536497404 ┆ 4434 ┆ 103 │ +└───────┴───────────────────────────┴─────────────┴────────────┴───────────────┴────────────┘ +``` diff --git a/docs/user-guide/lazy/optimizations.md b/docs/user-guide/lazy/optimizations.md new file mode 100644 index 000000000000..576413833a3a --- /dev/null +++ b/docs/user-guide/lazy/optimizations.md @@ -0,0 +1,17 @@ +# Optimizations + +If you use `Polars`' lazy API, `Polars` will run several optimizations on your query. Some of them are executed up front, +others are determined just in time as the materialized data comes in. + +Here is a non-complete overview of optimizations done by polars, what they do and how often they run. + +| Optimization | Explanation | runs | +| -------------------------- | ------------------------------------------------------------------------------------------------------------ | ----------------------------- | +| Predicate pushdown | Applies filters as early as possible/ at scan level. | 1 time | +| Projection pushdown | Select only the columns that are needed at the scan level. | 1 time | +| Slice pushdown | Only load the required slice from the scan level. Don't materialize sliced outputs (e.g. join.head(10)). | 1 time | +| Common subplan elimination | Cache subtrees/file scans that are used by multiple subtrees in the query plan. | 1 time | +| Simplify expressions | Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. | until fixed point | +| Join ordering | Estimates the branches of joins that should be executed first in order to reduce memory pressure. | 1 time | +| Type coercion | Coerce types such that operations succeed and run on minimal required memory. | until fixed point | +| Cardinality estimation | Estimates cardinality in order to determine optimal group by strategy. | 0/n times; dependent on query | diff --git a/docs/user-guide/lazy/query_plan.md b/docs/user-guide/lazy/query_plan.md new file mode 100644 index 000000000000..bb57a74168de --- /dev/null +++ b/docs/user-guide/lazy/query_plan.md @@ -0,0 +1,96 @@ +# Query plan + +For any lazy query `Polars` has both: + +- a non-optimized plan with the set of steps code as we provided it and +- an optimized plan with changes made by the query optimizer + +We can understand both the non-optimized and optimized query plans with visualization and by printing them as text. + +
+```python exec="on" result="text" session="user-guide/lazy/query_plan" +--8<-- "python/user-guide/lazy/query_plan.py:setup" +``` +
+ +Below we consider the following query: + +{{code_block('user-guide/lazy/query_plan','plan',[])}} + +```python exec="on" session="user-guide/lazy/query_plan" +--8<-- "python/user-guide/lazy/query_plan.py:plan" +``` + +## Non-optimized query plan + +### Graphviz visualization + +First we visualise the non-optimized plan by setting `optimized=False`. + +{{code_block('user-guide/lazy/query_plan','showplan',['show_graph'])}} + +```python exec="on" session="user-guide/lazy/query_plan" +--8<-- "python/user-guide/lazy/query_plan.py:createplan" +``` + +The query plan visualization should be read from bottom to top. In the visualization: + +- each box corresponds to a stage in the query plan +- the `sigma` stands for `SELECTION` and indicates any filter conditions +- the `pi` stands for `PROJECTION` and indicates choosing a subset of columns + +### Printed query plan + +We can also print the non-optimized plan with `explain(optimized=False)` + +{{code_block('user-guide/lazy/query_plan','describe',['explain'])}} + +```python exec="on" session="user-guide/lazy/query_plan" +--8<-- "python/user-guide/lazy/query_plan.py:describe" +``` + +```text +FILTER [(col("comment_karma")) > (0)] FROM WITH_COLUMNS: + [col("name").str.uppercase()] + + CSV SCAN data/reddit.csv + PROJECT */6 COLUMNS +``` + +The printed plan should also be read from bottom to top. This non-optimized plan is roughly equal to: + +- read from the `data/reddit.csv` file +- read all 6 columns (where the * wildcard in PROJECT \*/6 COLUMNS means take all columns) +- transform the `name` column to uppercase +- apply a filter on the `comment_karma` column + +## Optimized query plan + +Now we visualize the optimized plan with `show_graph`. + +{{code_block('user-guide/lazy/query_plan','show',['show_graph'])}} + +```python exec="on" session="user-guide/lazy/query_plan" +--8<-- "python/user-guide/lazy/query_plan.py:createplan2" +``` + +We can also print the optimized plan with `explain` + +{{code_block('user-guide/lazy/query_plan','optimized',['explain'])}} + +```text + WITH_COLUMNS: + [col("name").str.uppercase()] + + CSV SCAN data/reddit.csv + PROJECT */6 COLUMNS + SELECTION: [(col("comment_karma")) > (0)] +``` + +The optimized plan is to: + +- read the data from the Reddit CSV +- apply the filter on the `comment_karma` column while the CSV is being read line-by-line +- transform the `name` column to uppercase + +In this case the query optimizer has identified that the `filter` can be applied while the CSV is read from disk rather than reading the whole file into memory and then applying the filter. This optimization is called _Predicate Pushdown_. diff --git a/docs/user-guide/lazy/schemas.md b/docs/user-guide/lazy/schemas.md new file mode 100644 index 000000000000..77d2be54b722 --- /dev/null +++ b/docs/user-guide/lazy/schemas.md @@ -0,0 +1,60 @@ +# Schema + +The schema of a Polars `DataFrame` or `LazyFrame` sets out the names of the columns and their datatypes. You can see the schema with the `.schema` method on a `DataFrame` or `LazyFrame` + +{{code_block('user-guide/lazy/schema','schema',['DataFrame','lazy'])}} + +```python exec="on" result="text" session="user-guide/lazy/schemas" +--8<-- "python/user-guide/lazy/schema.py:setup" +--8<-- "python/user-guide/lazy/schema.py:schema" +``` + +The schema plays an important role in the lazy API. + +## Type checking in the lazy API + +One advantage of the lazy API is that Polars will check the schema before any data is processed. This check happens when you execute your lazy query. 
+ +We see how this works in the following simple example where we call the `.round` expression on the integer `bar` column. + +{{code_block('user-guide/lazy/schema','typecheck',['lazy','with_columns'])}} + +The `.round` expression is only valid for columns with a floating point dtype. Calling `.round` on an integer column means the operation will raise an `InvalidOperationError` when we evaluate the query with `collect`. This schema check happens before the data is processed when we call `collect`. + +`python exec="on" result="text" session="user-guide/lazy/schemas"` + +If we executed this query in eager mode the error would only be found once the data had been processed in all earlier steps. + +When we execute a lazy query Polars checks for any potential `InvalidOperationError` before the time-consuming step of actually processing the data in the pipeline. + +## The lazy API must know the schema + +In the lazy API the Polars query optimizer must be able to infer the schema at every step of a query plan. This means that operations where the schema is not knowable in advance cannot be used with the lazy API. + +The classic example of an operation where the schema is not knowable in advance is a `.pivot` operation. In a `.pivot` the new column names come from data in one of the columns. As these column names cannot be known in advance a `.pivot` is not available in the lazy API. + +## Dealing with operations not available in the lazy API + +If your pipeline includes an operation that is not available in the lazy API it is normally best to: + +- run the pipeline in lazy mode up until that point +- execute the pipeline with `.collect` to materialize a `DataFrame` +- do the non-lazy operation on the `DataFrame` +- convert the output back to a `LazyFrame` with `.lazy` and continue in lazy mode + +We show how to deal with a non-lazy operation in this example where we: + +- create a simple `DataFrame` +- convert it to a `LazyFrame` with `.lazy` +- do a transformation using `.with_columns` +- execute the query before the pivot with `.collect` to get a `DataFrame` +- do the `.pivot` on the `DataFrame` +- convert back in lazy mode +- do a `.filter` +- finish by executing the query with `.collect` to get a `DataFrame` + +{{code_block('user-guide/lazy/schema','lazyeager',['collect','pivot','filter'])}} + +```python exec="on" result="text" session="user-guide/lazy/schemas" +--8<-- "python/user-guide/lazy/schema.py:lazyeager" +``` diff --git a/docs/user-guide/lazy/streaming.md b/docs/user-guide/lazy/streaming.md new file mode 100644 index 000000000000..3f9d268443ca --- /dev/null +++ b/docs/user-guide/lazy/streaming.md @@ -0,0 +1,3 @@ +# Streaming + +--8<-- "docs/_build/snippets/under_construction.md" diff --git a/docs/user-guide/lazy/using.md b/docs/user-guide/lazy/using.md new file mode 100644 index 000000000000..d777557da550 --- /dev/null +++ b/docs/user-guide/lazy/using.md @@ -0,0 +1,37 @@ +# Usage + +With the lazy API, Polars doesn't run each query line-by-line but instead processes the full query end-to-end. To get the most out of Polars it is important that you use the lazy API because: + +- the lazy API allows Polars to apply automatic query optimization with the query optimizer +- the lazy API allows you to work with larger than memory datasets using streaming +- the lazy API can catch schema errors before processing the data + +Here we see how to use the lazy API starting from either a file or an existing `DataFrame`. 
+ +## Using the lazy API from a file + +In the ideal case we would use the lazy API right from a file as the query optimizer may help us to reduce the amount of data we read from the file. + +We create a lazy query from the Reddit CSV data and apply some transformations. + +By starting the query with `pl.scan_csv` we are using the lazy API. + +{{code_block('user-guide/lazy/using','dataframe',['scan_csv','with_columns','filter','col'])}} + +A `pl.scan_` function is available for a number of file types including CSV, IPC, Parquet and JSON. + +In this query we tell Polars that we want to: + +- load data from the Reddit CSV file +- convert the `name` column to uppercase +- apply a filter to the `comment_karma` column + +The lazy query will not be executed at this point. See this page on [executing lazy queries](execution.md) for more on running lazy queries. + +## Using the lazy API from a `DataFrame` + +An alternative way to access the lazy API is to call `.lazy` on a `DataFrame` that has already been created in memory. + +{{code_block('user-guide/lazy/using','fromdf',['lazy'])}} + +By calling `.lazy` we convert the `DataFrame` to a `LazyFrame`. diff --git a/docs/user-guide/migration/pandas.md b/docs/user-guide/migration/pandas.md new file mode 100644 index 000000000000..d781ae290f96 --- /dev/null +++ b/docs/user-guide/migration/pandas.md @@ -0,0 +1,328 @@ +# Coming from Pandas + +Here we set out the key points that anyone who has experience with `Pandas` and wants to +try `Polars` should know. We include both differences in the concepts the libraries are +built on and differences in how you should write `Polars` code compared to `Pandas` +code. + +## Differences in concepts between `Polars` and `Pandas` + +### `Polars` does not have a multi-index/index + +`Pandas` gives a label to each row with an index. `Polars` does not use an index and +each row is indexed by its integer position in the table. + +Polars aims to have predictable results and readable queries, as such we think an index does not help us reach that +objective. We believe the semantics of a query should not change by the state of an index or a `reset_index` call. + +In Polars a DataFrame will always be a 2D table with heterogeneous data-types. The data-types may have nesting, but the +table itself will not. +Operations like resampling will be done by specialized functions or methods that act like 'verbs' on a table explicitly +stating the columns that that 'verb' operates on. As such, it is our conviction that not having indices make things simpler, +more explicit, more readable and less error-prone. + +Note that an 'index' data structure as known in databases will be used by polars as an optimization technique. + +### `Polars` uses Apache Arrow arrays to represent data in memory while `Pandas` uses `Numpy` arrays + +`Polars` represents data in memory with Arrow arrays while `Pandas` represents data in +memory with `Numpy` arrays. Apache Arrow is an emerging standard for in-memory columnar +analytics that can accelerate data load times, reduce memory usage and accelerate +calculations. + +`Polars` can convert data to `Numpy` format with the `to_numpy` method. + +### `Polars` has more support for parallel operations than `Pandas` + +`Polars` exploits the strong support for concurrency in Rust to run many operations in +parallel. While some operations in `Pandas` are multi-threaded the core of the library +is single-threaded and an additional library such as `Dask` must be used to parallelize +operations. 
+ +### `Polars` can lazily evaluate queries and apply query optimization + +Eager evaluation is when code is evaluated as soon as you run the code. Lazy evaluation +is when running a line of code means that the underlying logic is added to a query plan +rather than being evaluated. + +`Polars` supports eager evaluation and lazy evaluation whereas `Pandas` only supports +eager evaluation. The lazy evaluation mode is powerful because `Polars` carries out +automatic query optimization when it examines the query plan and looks for ways to +accelerate the query or reduce memory usage. + +`Dask` also supports lazy evaluation when it generates a query plan. However, `Dask` +does not carry out query optimization on the query plan. + +## Key syntax differences + +Users coming from `Pandas` generally need to know one thing... + +``` +polars != pandas +``` + +If your `Polars` code looks like it could be `Pandas` code, it might run, but it likely +runs slower than it should. + +Let's go through some typical `Pandas` code and see how we might rewrite it in `Polars`. + +### Selecting data + +As there is no index in `Polars` there is no `.loc` or `iloc` method in `Polars` - and +there is also no `SettingWithCopyWarning` in `Polars`. + +However, the best way to select data in `Polars` is to use the expression API. For +example, if you want to select a column in `Pandas` you can do one of the following: + +```python +df['a'] +df.loc[:,'a'] +``` + +but in `Polars` you would use the `.select` method: + +```python +df.select('a') +``` + +If you want to select rows based on the values then in `Polars` you use the `.filter` +method: + +```python +df.filter(pl.col('a') < 10) +``` + +As noted in the section on expressions below, `Polars` can run operations in `.select` +and `filter` in parallel and `Polars` can carry out query optimization on the full set +of data selection criteria. + +### Be lazy + +Working in lazy evaluation mode is straightforward and should be your default in +`Polars` as the lazy mode allows `Polars` to do query optimization. + +We can run in lazy mode by either using an implicitly lazy function (such as `scan_csv`) +or explicitly using the `lazy` method. + +Take the following simple example where we read a CSV file from disk and do a group by. +The CSV file has numerous columns but we just want to do a group by on one of the id +columns (`id1`) and then sum by a value column (`v1`). In `Pandas` this would be: + +```python +df = pd.read_csv(csv_file, usecols=['id1','v1']) +grouped_df = df.loc[:,['id1','v1']].groupby('id1').sum('v1') +``` + +In `Polars` you can build this query in lazy mode with query optimization and evaluate +it by replacing the eager `Pandas` function `read_csv` with the implicitly lazy `Polars` +function `scan_csv`: + +```python +df = pl.scan_csv(csv_file) +grouped_df = df.group_by('id1').agg(pl.col('v1').sum()).collect() +``` + +`Polars` optimizes this query by identifying that only the `id1` and `v1` columns are +relevant and so will only read these columns from the CSV. By calling the `.collect` +method at the end of the second line we instruct `Polars` to eagerly evaluate the query. + +If you do want to run this query in eager mode you can just replace `scan_csv` with +`read_csv` in the `Polars` code. + +Read more about working with lazy evaluation in the +[lazy API](../lazy/using.md) section. + +### Express yourself + +A typical `Pandas` script consists of multiple data transformations that are executed +sequentially. 
However, in `Polars` these transformations can be executed in parallel +using expressions. + +#### Column assignment + +We have a dataframe `df` with a column called `value`. We want to add two new columns, a +column called `tenXValue` where the `value` column is multiplied by 10 and a column +called `hundredXValue` where the `value` column is multiplied by 100. + +In `Pandas` this would be: + +```python +df["tenXValue"] = df["value"] * 10 +df["hundredXValue"] = df["value"] * 100 +``` + +These column assignments are executed sequentially. + +In `Polars` we add columns to `df` using the `.with_columns` method and name them with +the `.alias` method: + +```python +df.with_columns( + (pl.col("value") * 10).alias("tenXValue"), + (pl.col("value") * 100).alias("hundredXValue"), +) +``` + +These column assignments are executed in parallel. + +#### Column assignment based on predicate + +In this case we have a dataframe `df` with columns `a`,`b` and `c`. We want to re-assign +the values in column `a` based on a condition. When the value in column `c` is equal to +2 then we replace the value in `a` with the value in `b`. + +In `Pandas` this would be: + +```python +df.loc[df["c"] == 2, "a"] = df.loc[df["c"] == 2, "b"] +``` + +while in `Polars` this would be: + +```python +df.with_columns( + pl.when(pl.col("c") == 2) + .then(pl.col("b")) + .otherwise(pl.col("a")).alias("a") +) +``` + +The `Polars` way is pure in that the original `DataFrame` is not modified. The `mask` is +also not computed twice as in `Pandas` (you could prevent this in `Pandas`, but that +would require setting a temporary variable). + +Additionally `Polars` can compute every branch of an `if -> then -> otherwise` in +parallel. This is valuable, when the branches get more expensive to compute. + +#### Filtering + +We want to filter the dataframe `df` with housing data based on some criteria. + +In `Pandas` you filter the dataframe by passing Boolean expressions to the `loc` method: + +```python +df.loc[(df['sqft_living'] > 2500) & (df['price'] < 300000)] +``` + +while in `Polars` you call the `filter` method: + +```python +df.filter( + (pl.col("m2_living") > 2500) & (pl.col("price") < 300000) +) +``` + +The query optimizer in `Polars` can also detect if you write multiple filters separately +and combine them into a single filter in the optimized plan. + +## `Pandas` transform + +The `Pandas` documentation demonstrates an operation on a group by called `transform`. In +this case we have a dataframe `df` and we want a new column showing the number of rows +in each group. 
+ +In `Pandas` we have: + +```python +df = pd.DataFrame({ + "type": ["m", "n", "o", "m", "m", "n", "n"], + "c": [1, 1, 1, 2, 2, 2, 2], +}) + +df["size"] = df.groupby("c")["type"].transform(len) +``` + +Here `Pandas` does a group by on `"c"`, takes column `"type"`, computes the group length +and then joins the result back to the original `DataFrame` producing: + +``` + c type size +0 1 m 3 +1 1 n 3 +2 1 o 3 +3 2 m 4 +4 2 m 4 +5 2 n 4 +6 2 n 4 +``` + +In `Polars` the same can be achieved with `window` functions: + +```python +df.select( + pl.all(), + pl.col("type").count().over("c").alias("size") +) +``` + +``` +shape: (7, 3) +┌─────┬──────┬──────┐ +│ c ┆ type ┆ size │ +│ --- ┆ --- ┆ --- │ +│ i64 ┆ str ┆ u32 │ +╞═════╪══════╪══════╡ +│ 1 ┆ m ┆ 3 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ +│ 1 ┆ n ┆ 3 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ +│ 1 ┆ o ┆ 3 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ +│ 2 ┆ m ┆ 4 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ +│ 2 ┆ m ┆ 4 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ +│ 2 ┆ n ┆ 4 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ +│ 2 ┆ n ┆ 4 │ +└─────┴──────┴──────┘ +``` + +Because we can store the whole operation in a single expression, we can combine several +`window` functions and even combine different groups! + +`Polars` will cache window expressions that are applied over the same group, so storing +them in a single `select` is both convenient **and** optimal. In the following example +we look at a case where we are calculating group statistics over `"c"` twice: + +```python +df.select( + pl.all(), + pl.col("c").count().over("c").alias("size"), + pl.col("c").sum().over("type").alias("sum"), + pl.col("c").reverse().over("c").flatten().alias("reverse_type") +) +``` + +``` +shape: (7, 5) +┌─────┬──────┬──────┬─────┬──────────────┐ +│ c ┆ type ┆ size ┆ sum ┆ reverse_type │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ str ┆ u32 ┆ i64 ┆ i64 │ +╞═════╪══════╪══════╪═════╪══════════════╡ +│ 1 ┆ m ┆ 3 ┆ 5 ┆ 2 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ +│ 1 ┆ n ┆ 3 ┆ 5 ┆ 2 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ +│ 1 ┆ o ┆ 3 ┆ 1 ┆ 2 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ +│ 2 ┆ m ┆ 4 ┆ 5 ┆ 2 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ +│ 2 ┆ m ┆ 4 ┆ 5 ┆ 1 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ +│ 2 ┆ n ┆ 4 ┆ 5 ┆ 1 │ +├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ +│ 2 ┆ n ┆ 4 ┆ 5 ┆ 1 │ +└─────┴──────┴──────┴─────┴──────────────┘ +``` + +## Missing data + +`Pandas` uses `NaN` and/or `None` values to indicate missing values depending on the dtype of the column. In addition the behaviour in `Pandas` varies depending on whether the default dtypes or optional nullable arrays are used. In `Polars` missing data corresponds to a `null` value for all data types. + +For float columns `Polars` permits the use of `NaN` values. These `NaN` values are not considered to be missing data but instead a special floating point value. + +In `Pandas` an integer column with missing values is cast to be a float column with `NaN` values for the missing values (unless using optional nullable integer dtypes). In `Polars` any missing values in an integer column are simply `null` values and the column remains an integer column. + +See the [missing data](../expressions/null.md) section for more details. diff --git a/docs/user-guide/migration/spark.md b/docs/user-guide/migration/spark.md new file mode 100644 index 000000000000..ea1a41abbd71 --- /dev/null +++ b/docs/user-guide/migration/spark.md @@ -0,0 +1,158 @@ +# Coming from Apache Spark + +## Column-based API vs. 
Row-based API + +Whereas the `Spark` `DataFrame` is analogous to a collection of rows, a `Polars` `DataFrame` is closer to a collection of columns. This means that you can combine columns in `Polars` in ways that are not possible in `Spark`, because `Spark` preserves the relationship of the data in each row. + +Consider this sample dataset: + +```python +import polars as pl + +df = pl.DataFrame({ + "foo": ["a", "b", "c", "d", "d"], + "bar": [1, 2, 3, 4, 5], +}) + +dfs = spark.createDataFrame( + [ + ("a", 1), + ("b", 2), + ("c", 3), + ("d", 4), + ("d", 5), + ], + schema=["foo", "bar"], +) +``` + +### Example 1: Combining `head` and `sum` + +In `Polars` you can write something like this: + +```python +df.select( + pl.col("foo").sort().head(2), + pl.col("bar").filter(pl.col("foo") == "d").sum() +) +``` + +Output: + +``` +shape: (2, 2) +┌─────┬─────┐ +│ foo ┆ bar │ +│ --- ┆ --- │ +│ str ┆ i64 │ +╞═════╪═════╡ +│ a ┆ 9 │ +├╌╌╌╌╌┼╌╌╌╌╌┤ +│ b ┆ 9 │ +└─────┴─────┘ +``` + +The expressions on columns `foo` and `bar` are completely independent. Since the expression on `bar` returns a single value, that value is repeated for each value output by the expression on `foo`. But `a` and `b` have no relation to the data that produced the sum of `9`. + +To do something similar in `Spark`, you'd need to compute the sum separately and provide it as a literal: + +```python +from pyspark.sql.functions import col, sum, lit + +bar_sum = ( + dfs + .where(col("foo") == "d") + .groupBy() + .agg(sum(col("bar"))) + .take(1)[0][0] +) + +( + dfs + .orderBy("foo") + .limit(2) + .withColumn("bar", lit(bar_sum)) + .show() +) +``` + +Output: + +``` ++---+---+ +|foo|bar| ++---+---+ +| a| 9| +| b| 9| ++---+---+ +``` + +### Example 2: Combining Two `head`s + +In `Polars` you can combine two different `head` expressions on the same DataFrame, provided that they return the same number of values. + +```python +df.select( + pl.col("foo").sort().head(2), + pl.col("bar").sort(descending=True).head(2), +) +``` + +Output: + +``` +shape: (3, 2) +┌─────┬─────┐ +│ foo ┆ bar │ +│ --- ┆ --- │ +│ str ┆ i64 │ +╞═════╪═════╡ +│ a ┆ 5 │ +├╌╌╌╌╌┼╌╌╌╌╌┤ +│ b ┆ 4 │ +└─────┴─────┘ +``` + +Again, the two `head` expressions here are completely independent, and the pairing of `a` to `5` and `b` to `4` results purely from the juxtaposition of the two columns output by the expressions. + +To accomplish something similar in `Spark`, you would need to generate an artificial key that enables you to join the values in this way. + +```python +from pyspark.sql import Window +from pyspark.sql.functions import row_number + +foo_dfs = ( + dfs + .withColumn( + "rownum", + row_number().over(Window.orderBy("foo")) + ) +) + +bar_dfs = ( + dfs + .withColumn( + "rownum", + row_number().over(Window.orderBy(col("bar").desc())) + ) +) + +( + foo_dfs.alias("foo") + .join(bar_dfs.alias("bar"), on="rownum") + .select("foo.foo", "bar.bar") + .limit(2) + .show() +) +``` + +Output: + +``` ++---+---+ +|foo|bar| ++---+---+ +| a| 5| +| b| 4| ++---+---+ +``` diff --git a/docs/user-guide/misc/alternatives.md b/docs/user-guide/misc/alternatives.md new file mode 100644 index 000000000000..a5544e7db354 --- /dev/null +++ b/docs/user-guide/misc/alternatives.md @@ -0,0 +1,66 @@ +# Alternatives + +These are some tools that share similar functionality to what polars does. + +- Pandas + + A very versatile tool for small data. Read [10 things I hate about pandas](https://wesmckinney.com/blog/apache-arrow-pandas-internals/) + written by the author himself. 
Polars has solved all those 10 things.
+  Polars is a versatile tool for small and large data with a more predictable, less ambiguous, and stricter API.
+
+- Pandas the API
+
+  The API of pandas was designed for in-memory data. This makes it a poor fit for performant analysis on large data
+  (read: anything that does not fit into RAM). Any tool that tries to distribute that API will likely have a
+  suboptimal query plan compared to plans that follow from a declarative API like SQL or Polars' API.
+
+- Dask
+
+  Parallelizes existing single-threaded libraries like `NumPy` and `Pandas`. As a consumer of those libraries, Dask
+  therefore has less control over low-level performance and semantics.
+  Those libraries are treated like a black box.
+  On a single machine the parallelization effort can also be seriously stalled by pandas strings.
+  Pandas strings, by default, are stored as Python objects in
+  NumPy arrays, meaning that any operation on them is GIL-bound and therefore single-threaded. This can be circumvented
+  by multiprocessing, but that has a non-trivial cost.
+
+- Modin
+
+  Similar to Dask.
+
+- Vaex
+
+  Vaex's method of out-of-core analysis is memory mapping files. This works until it doesn't. For instance, Parquet
+  or CSV files first need to be read and converted to a file format that can be memory mapped. Another downside is
+  that the OS determines when pages will be swapped. Operations that need a full data shuffle, such as
+  sorts, have terrible performance on memory-mapped data.
+  Polars' out-of-core processing is not based on memory mapping but on streaming data in batches (and spilling to disk
+  if needed). We control which data must be held in memory, not the OS, meaning that we don't have unexpected IO stalls.
+
+- DuckDB
+
+  Polars and DuckDB have many similarities. DuckDB is focused on providing an in-process OLAP SQLite alternative, while
+  Polars is focused on providing a scalable `DataFrame` interface to many languages. Those different front-ends lead to
+  different optimization strategies and different algorithm prioritization. The interoperability between both is zero-copy.
+  See more: https://duckdb.org/docs/guides/python/polars
+
+- Spark
+
+  Spark is designed for distributed workloads and uses the JVM. The setup for Spark is complicated and the startup time
+  is slow. On a single machine Polars has much better performance characteristics. If you need to process TBs of data
+  Spark is a better choice.
+
+- CuDF
+
+  GPUs and CuDF are fast!
+  However, GPUs are not readily available and are expensive in production. The amount of memory available on a GPU
+  is often a fraction of the available RAM.
+  This (together with out-of-core processing) means that Polars can handle much larger datasets.
+  Next to that, Polars can be close in [performance to CuDF](https://zakopilo.hatenablog.jp/entry/2023/02/04/220552).
+  CuDF doesn't optimize your query, so it is not uncommon that on ETL jobs Polars will be faster because it can elide
+  unneeded work and materializations.
+
+- Any
+
+  Polars is written in Rust. This gives it strong safety, performance and concurrency guarantees.
+  Polars is written in a modular manner. Parts of Polars can be used in other query programs and can be added as a library.
diff --git a/docs/user-guide/misc/contributing.md b/docs/user-guide/misc/contributing.md new file mode 100644 index 000000000000..abd4d4d229be --- /dev/null +++ b/docs/user-guide/misc/contributing.md @@ -0,0 +1,11 @@ +# Contributing + +See the [`CONTRIBUTING.md`](https://github.com/pola-rs/polars/blob/master/CONTRIBUTING.md) if you would like to contribute to the `Polars` project. + +If you're new to this we recommend starting out with contributing examples to the Python API documentation. The Python API docs are generated from the docstrings of the Python wrapper located in `polars/py-polars`. + +Here is an example [commit](https://github.com/pola-rs/polars/pull/3567/commits/5db9e335f3f2777dd1d6f80df765c6bca8f307b0) that adds a docstring. + +If you spot any gaps in this User Guide you can submit fixes to the [`pola-rs/polars`](https://github.com/pola-rs/polars) repo. + +Happy hunting! diff --git a/docs/user-guide/misc/multiprocessing.md b/docs/user-guide/misc/multiprocessing.md new file mode 100644 index 000000000000..4973da8c0155 --- /dev/null +++ b/docs/user-guide/misc/multiprocessing.md @@ -0,0 +1,104 @@ +# Multiprocessing + +TLDR: if you find that using Python's built-in `multiprocessing` module together with Polars results in a Polars error about multiprocessing methods, you should make sure you are using `spawn`, not `fork`, as the starting method: + +{{code_block('user-guide/misc/multiprocess','recommendation',[])}} + +## When not to use multiprocessing + +Before we dive into the details, it is important to emphasize that Polars has been built from the start to use all your CPU cores. +It does this by executing computations which can be done in parallel in separate threads. +For example, requesting two expressions in a `select` statement can be done in parallel, with the results only being combined at the end. +Another example is aggregating a value within groups using `group_by().agg()`, each group can be evaluated separately. +It is very unlikely that the `multiprocessing` module can improve your code performance in these cases. + +See [the optimizations section](../lazy/optimizations.md) for more optimizations. + +## When to use multiprocessing + +Although Polars is multithreaded, other libraries may be single-threaded. +When the other library is the bottleneck, and the problem at hand is parallelizable, it makes sense to use multiprocessing to gain a speed up. + +## The problem with the default multiprocessing config + +### Summary + +The [Python multiprocessing documentation](https://docs.python.org/3/library/multiprocessing.html) lists the three methods to create a process pool: + +1. spawn +1. fork +1. forkserver + +The description of fork is (as of 2022-10-15): + +> The parent process uses os.fork() to fork the Python interpreter. The child process, when it begins, is effectively identical to the parent process. All resources of the parent are inherited by the child process. Note that safely forking a multithreaded process is problematic. + +> Available on Unix only. The default on Unix. + +The short summary is: Polars is multithreaded as to provide strong performance out-of-the-box. +Thus, it cannot be combined with `fork`. +If you are on Unix (Linux, BSD, etc), you are using `fork`, unless you explicitly override it. + +The reason you may not have encountered this before is that pure Python code, and most Python libraries, are (mostly) single threaded. 
+
+Alternatively, you are on Windows or MacOS, where `fork` is not the default start method (on Windows it is not available at all, and on MacOS it was the default only up to Python 3.7).
+
+Thus one should use `spawn` or `forkserver` instead. `spawn` is available on all platforms and is the safest choice, hence it is the recommended method.
+
+### Example
+
+The problem with `fork` lies in the copying of the parent process.
+Consider the example below, which is a slightly modified example posted on the [Polars issue tracker](https://github.com/pola-rs/polars/issues/3144):
+
+{{code_block('user-guide/misc/multiprocess','example1',[])}}
+
+Using `fork` as the method, instead of `spawn`, will cause a deadlock.
+Please note: Polars will refuse to start and raise an error about the multiprocessing method being set incorrectly, but if that check were not there, the deadlock would occur.
+
+The fork method is equivalent to calling `os.fork()`, which is a system call as defined in [the POSIX standard](https://pubs.opengroup.org/onlinepubs/9699919799/functions/fork.html):
+
+> A process shall be created with a single thread. If a multi-threaded process calls fork(), the new process shall contain a replica of the calling thread and its entire address space, possibly including the states of mutexes and other resources. Consequently, to avoid errors, the child process may only execute async-signal-safe operations until such time as one of the exec functions is called.
+
+In contrast, `spawn` will create a completely fresh Python interpreter that does not inherit the state of mutexes.
+
+So what happens in the code example?
+To read the file with `pl.read_parquet`, the file has to be locked.
+Then `os.fork()` is called, copying the state of the parent process, including mutexes.
+Thus all child processes will copy the file lock in an acquired state, leaving them hanging indefinitely waiting for the file lock to be released, which never happens.
+
+What makes debugging these issues tricky is that `fork` can work.
+Change the example so that it does not call `pl.read_parquet`:
+
+{{code_block('user-guide/misc/multiprocess','example2',[])}}
+
+This works fine.
+Therefore, debugging these issues in larger code bases (i.e. not the small toy examples used here) can be a real pain, as a seemingly unrelated change can break your multiprocessing code.
+In general, one should therefore never use the `fork` start method with multithreaded libraries unless there are very specific requirements that cannot be met otherwise.
+
+### Pros and cons of fork
+
+Based on the example, you may wonder why `fork` is available in Python at all.
+
+First, probably for historical reasons: `spawn` was added to Python in version 3.4, whilst `fork` has been part of Python from the 2.x series.
+
+Second, there are several limitations for `spawn` and `forkserver` that do not apply to `fork`, in particular that all arguments should be picklable.
+See the [Python multiprocessing docs](https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods) for more information.
+
+Third, because `fork` is faster at creating new processes than `spawn`: `spawn` is effectively `fork` plus the creation of a brand new Python process without the inherited locks, by calling [execv](https://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html).
+Hence the warning in the Python docs that it is slower: there is more overhead to `spawn`.
+However, in almost all cases, one would like to use multiple processes to speed up computations that take multiple minutes or even hours, meaning the overhead is negligible in the grand scheme of things. +And more importantly, it actually works in combination with multithreaded libraries. + +Fourth, `spawn` starts a new process, and therefore it requires code to be importable, in contrast to `fork`. +In particular, this means that when using `spawn` the relevant code should not be in the global scope, such as in Jupyter notebooks or in plain scripts. +Hence in the examples above, we define functions where we spawn within, and run those functions from a `__main__` clause. +This is not an issue for typical projects, but during quick experimentation in notebooks it could fail. + +## References + +1. https://docs.python.org/3/library/multiprocessing.html + +1. https://pythonspeed.com/articles/python-multiprocessing/ + +1. https://pubs.opengroup.org/onlinepubs/9699919799/functions/fork.html + +1. https://bnikolic.co.uk/blog/python/parallelism/2019/11/13/python-forkserver-preload.html diff --git a/docs/user-guide/misc/reference-guides.md b/docs/user-guide/misc/reference-guides.md new file mode 100644 index 000000000000..c0e082d08447 --- /dev/null +++ b/docs/user-guide/misc/reference-guides.md @@ -0,0 +1,6 @@ +# Reference guides + +The api documentations with details on function / object signatures can be found here: + +- [Python](https://pola-rs.github.io/polars/py-polars/html/reference/index.html) +- [Rust](https://docs.rs/polars/latest/polars/) diff --git a/docs/user-guide/sql/create.md b/docs/user-guide/sql/create.md new file mode 100644 index 000000000000..a5a1922b7f23 --- /dev/null +++ b/docs/user-guide/sql/create.md @@ -0,0 +1,28 @@ +# CREATE + +In Polars, the `SQLContext` provides a way to execute SQL statements against `LazyFrames` and `DataFrames` using SQL syntax. One of the SQL statements that can be executed using `SQLContext` is the `CREATE TABLE` statement, which is used to create a new table. + +The syntax for the `CREATE TABLE` statement in Polars is as follows: + +``` +CREATE TABLE table_name +AS +SELECT ... +``` + +In this syntax, `table_name` is the name of the new table that will be created, and `SELECT ...` is a SELECT statement that defines the data that will be inserted into the table. + +Here's an example of how to use the `CREATE TABLE` statement in Polars: + +{{code_block('user-guide/sql/create','create',['SQLregister','SQLexecute'])}} + +```python exec="on" result="text" session="user-guide/sql" +--8<-- "python/user-guide/sql/create.py:setup" +--8<-- "python/user-guide/sql/create.py:create" +``` + +In this example, we use the `execute()` method of the `SQLContext` to execute a `CREATE TABLE` statement that creates a new table called `older_people` based on a SELECT statement that selects all rows from the `my_table` DataFrame where the `age` column is greater than 30. + +!!! note Result + + Note that the result of a `CREATE TABLE` statement is not the table itself. The table is registered in the `SQLContext`. 
In case you want to turn the table back to a `DataFrame` you can use a `SELECT * FROM ...` statement diff --git a/docs/user-guide/sql/cte.md b/docs/user-guide/sql/cte.md new file mode 100644 index 000000000000..1129f6d19230 --- /dev/null +++ b/docs/user-guide/sql/cte.md @@ -0,0 +1,27 @@ +# Common Table Expressions + +Common Table Expressions (CTEs) are a feature of SQL that allow you to define a temporary named result set that can be referenced within a SQL statement. CTEs provide a way to break down complex SQL queries into smaller, more manageable pieces, making them easier to read, write, and maintain. + +A CTE is defined using the `WITH` keyword followed by a comma-separated list of subqueries, each of which defines a named result set that can be used in subsequent queries. The syntax for a CTE is as follows: + +``` +WITH cte_name AS ( + subquery +) +SELECT ... +``` + +In this syntax, `cte_name` is the name of the CTE, and `subquery` is the subquery that defines the result set. The CTE can then be referenced in subsequent queries as if it were a table or view. + +CTEs are particularly useful when working with complex queries that involve multiple levels of subqueries, as they allow you to break down the query into smaller, more manageable pieces that are easier to understand and debug. Additionally, CTEs can help improve query performance by allowing the database to optimize and cache the results of subqueries, reducing the number of times they need to be executed. + +Polars supports Common Table Expressions (CTEs) using the WITH clause in SQL syntax. Below is an example + +{{code_block('user-guide/sql/cte','cte',['SQLregister','SQLexecute'])}} + +```python exec="on" result="text" session="user-guide/sql/cte" +--8<-- "python/user-guide/sql/cte.py:setup" +--8<-- "python/user-guide/sql/cte.py:cte" +``` + +In this example, we use the `execute()` method of the `SQLContext` to execute a SQL query that includes a CTE. The CTE selects all rows from the `my_table` LazyFrame where the `age` column is greater than 30 and gives it the alias `older_people`. We then execute a second SQL query that selects all rows from the `older_people` CTE where the `name` column starts with the letter 'C'. diff --git a/docs/user-guide/sql/intro.md b/docs/user-guide/sql/intro.md new file mode 100644 index 000000000000..815231e3d59c --- /dev/null +++ b/docs/user-guide/sql/intro.md @@ -0,0 +1,106 @@ +# Introduction + +While Polars does support writing queries in SQL, it's recommended that users familiarize themselves with the [expression syntax](../concepts/expressions.md) for more readable and expressive code. As a primarily DataFrame library, new features will typically be added to the expression API first. However, if you already have an existing SQL codebase or prefer to use SQL, Polars also offers support for SQL queries. + +!!! note Execution + + In Polars, there is no separate SQL engine because Polars translates SQL queries into [expressions](../concepts/expressions.md), which are then executed using its built-in execution engine. This approach ensures that Polars maintains its performance and scalability advantages as a native DataFrame library while still providing users with the ability to work with SQL queries. + +## Context + +Polars uses the `SQLContext` to manage SQL queries . The context contains a dictionary mapping `DataFrames` and `LazyFrames` names to their corresponding datasets[^1]. 
The example below starts a `SQLContext`:
+
+{{code_block('user-guide/sql/intro','context',['SQLContext'])}}
+
+```python exec="on" session="user-guide/sql"
+--8<-- "python/user-guide/sql/intro.py:setup"
+--8<-- "python/user-guide/sql/intro.py:context"
+```
+
+## Register DataFrames
+
+There are two ways to register DataFrames in the `SQLContext`:
+
+- register all `LazyFrames` and `DataFrames` in the global namespace
+- register them one by one
+
+{{code_block('user-guide/sql/intro','register_context',['SQLContext'])}}
+
+```python exec="on" session="user-guide/sql"
+--8<-- "python/user-guide/sql/intro.py:register_context"
+```
+
+We can also register Pandas DataFrames by converting them to Polars first.
+
+{{code_block('user-guide/sql/intro','register_pandas',['SQLContext'])}}
+
+```python exec="on" session="user-guide/sql"
+--8<-- "python/user-guide/sql/intro.py:register_pandas"
+```
+
+!!! note Pandas
+
+    Converting a Pandas DataFrame backed by Numpy to Polars triggers a conversion to the Arrow format. This conversion has a computation cost. Converting a Pandas DataFrame backed by Arrow, on the other hand, will be free or almost free.
+
+Once the `SQLContext` is initialized, we can register additional DataFrames or unregister existing DataFrames with:
+
+- `register`
+- `register_globals`
+- `register_many`
+- `unregister`
+
+## Execute queries and collect results
+
+SQL queries are always executed in lazy mode to benefit from lazy optimizations, so we have two options to collect the result:
+
+- Set the parameter `eager_execution` to True in `SQLContext`. With this parameter, Polars will automatically collect SQL results.
+- Set the parameter `eager` to True when executing a query with `execute`, or collect the result with `collect`.
+
+We execute SQL queries by calling `execute` on a `SQLContext`.
+
+{{code_block('user-guide/sql/intro','execute',['SQLregister','SQLexecute'])}}
+
+```python exec="on" result="text" session="user-guide/sql"
+--8<-- "python/user-guide/sql/intro.py:execute"
+```
+
+## Execute queries from multiple sources
+
+SQL queries can be executed just as easily from multiple sources.
+In the example below, we register:
+
+- a CSV file loaded lazily
+- an NDJSON file loaded lazily
+- a Pandas DataFrame
+
+And we join them together with SQL.
+Lazy reading allows us to load only the necessary rows and columns from the files.
+
+In the same way, it's possible to register cloud datalakes (S3, Azure Data Lake). A PyArrow dataset can point to the datalake, and Polars can then read it with `scan_pyarrow_dataset`.
+
+{{code_block('user-guide/sql/intro','execute_multiple_sources',['SQLregister','SQLexecute'])}}
+
+```python exec="on" result="text" session="user-guide/sql"
+--8<-- "python/user-guide/sql/intro.py:prepare_multiple_sources"
+--8<-- "python/user-guide/sql/intro.py:execute_multiple_sources"
+--8<-- "python/user-guide/sql/intro.py:clean_multiple_sources"
+```
+
+[^1]: Additionally it also tracks the [common table expressions](./cte.md).
+
+## Compatibility
+
+Polars does not support the full SQL language; in Polars you are allowed to:
+
+- Write a `CREATE` statement: `CREATE TABLE xxx AS ...`
+- Write a `SELECT` statement with all generic elements (`GROUP BY`, `WHERE`, `ORDER`, `LIMIT`, `JOIN`, ...)
+- Write Common Table Expressions (CTEs) (`WITH tablename AS`)
+- Show an overview of all tables with `SHOW TABLES`
+
+The following is not yet supported:
+
+- `INSERT`, `UPDATE` or `DELETE` statements
+- Table aliasing (e.g. `SELECT p.Name from pokemon AS p`)
+- Meta queries such as `ANALYZE`, `EXPLAIN`
+
+In the upcoming sections we will cover each of the statements in more detail.
diff --git a/docs/user-guide/sql/select.md b/docs/user-guide/sql/select.md
new file mode 100644
index 000000000000..1c643895dec7
--- /dev/null
+++ b/docs/user-guide/sql/select.md
@@ -0,0 +1,72 @@
+# SELECT
+
+In Polars SQL, the `SELECT` statement is used to retrieve data from a table into a `DataFrame`. The basic syntax of a `SELECT` statement in Polars SQL is as follows:
+
+```sql
+SELECT column1, column2, ...
+FROM table_name;
+```
+
+Here, `column1`, `column2`, etc. are the columns that you want to select from the table. You can also use the wildcard `*` to select all columns. `table_name` is the name of the table that you want to retrieve data from. In the sections below we will cover some of the more common `SELECT` variants.
+
+{{code_block('user-guide/sql/sql_select','df',['SQLregister','SQLexecute'])}}
+
+```python exec="on" result="text" session="user-guide/sql/select"
+--8<-- "python/user-guide/sql/sql_select.py:setup"
+--8<-- "python/user-guide/sql/sql_select.py:df"
+```
+
+### GROUP BY
+
+The `GROUP BY` statement is used to group rows in a table by one or more columns and compute aggregate functions on each group.
+
+{{code_block('user-guide/sql/sql_select','group_by',['SQLexecute'])}}
+
+```python exec="on" result="text" session="user-guide/sql/select"
+--8<-- "python/user-guide/sql/sql_select.py:group_by"
+```
+
+### ORDER BY
+
+The `ORDER BY` statement is used to sort the result set of a query by one or more columns in ascending or descending order.
+
+{{code_block('user-guide/sql/sql_select','orderby',['SQLexecute'])}}
+
+```python exec="on" result="text" session="user-guide/sql/select"
+--8<-- "python/user-guide/sql/sql_select.py:orderby"
+```
+
+### JOIN
+
+{{code_block('user-guide/sql/sql_select','join',['SQLregister_many','SQLexecute'])}}
+
+```python exec="on" result="text" session="user-guide/sql/select"
+--8<-- "python/user-guide/sql/sql_select.py:join"
+```
+
+### Functions
+
+Polars provides a wide range of SQL functions, including:
+
+- Mathematical functions: `ABS`, `EXP`, `LOG`, `ASIN`, `ACOS`, `ATAN`, etc.
+- String functions: `LOWER`, `UPPER`, `LTRIM`, `RTRIM`, `STARTS_WITH`, `ENDS_WITH`.
+- Aggregation functions: `SUM`, `AVG`, `MIN`, `MAX`, `COUNT`, `STDDEV`, `FIRST`, etc.
+- Array functions: `EXPLODE`, `UNNEST`, `ARRAY_SUM`, `ARRAY_REVERSE`, etc.
+
+For a full list of supported functions go to the [API documentation](https://docs.rs/polars-sql/latest/src/polars_sql/keywords.rs.html). The example below demonstrates how to use a function in a query.
+
+{{code_block('user-guide/sql/sql_select','functions',['SQLquery'])}}
+
+```python exec="on" result="text" session="user-guide/sql/select"
+--8<-- "python/user-guide/sql/sql_select.py:functions"
+```
+
+### Table Functions
+
+In the examples earlier we first generated a DataFrame which we registered in the `SQLContext`. Polars also supports reading directly from CSV, Parquet, JSON and IPC in your SQL query using the table functions `read_xxx`.
+ +{{code_block('user-guide/sql/sql_select','tablefunctions',['SQLexecute'])}} + +```python exec="on" result="text" session="user-guide/sql/select" +--8<-- "python/user-guide/sql/sql_select.py:tablefunctions" +``` diff --git a/docs/user-guide/sql/show.md b/docs/user-guide/sql/show.md new file mode 100644 index 000000000000..70453ebcb6dd --- /dev/null +++ b/docs/user-guide/sql/show.md @@ -0,0 +1,22 @@ +# SHOW TABLES + +In Polars, the `SHOW TABLES` statement is used to list all the tables that have been registered in the current `SQLContext`. When you register a DataFrame with the `SQLContext`, you give it a name that can be used to refer to the DataFrame in subsequent SQL statements. The `SHOW TABLES` statement allows you to see a list of all the registered tables, along with their names. + +The syntax for the `SHOW TABLES` statement in Polars is as follows: + +``` +SHOW TABLES +``` + +Here's an example of how to use the `SHOW TABLES` statement in Polars: + +{{code_block('user-guide/sql/show','show',['SQLregister','SQLexecute'])}} + +```python exec="on" result="text" session="user-guide/sql/show" +--8<-- "python/user-guide/sql/show.py:setup" +--8<-- "python/user-guide/sql/show.py:show" +``` + +In this example, we create two DataFrames and register them with the `SQLContext` using different names. We then execute a `SHOW TABLES` statement using the `execute()` method of the `SQLContext` object, which returns a DataFrame containing a list of all the registered tables and their names. The resulting DataFrame is then printed using the `print()` function. + +Note that the `SHOW TABLES` statement only lists tables that have been registered with the current `SQLContext`. If you register a DataFrame with a different `SQLContext` or in a different Python session, it will not appear in the list of tables returned by `SHOW TABLES`. diff --git a/docs/user-guide/transformations/concatenation.md b/docs/user-guide/transformations/concatenation.md new file mode 100644 index 000000000000..8deff923acee --- /dev/null +++ b/docs/user-guide/transformations/concatenation.md @@ -0,0 +1,51 @@ +# Concatenation + +There are a number of ways to concatenate data from separate DataFrames: + +- two dataframes with **the same columns** can be **vertically** concatenated to make a **longer** dataframe +- two dataframes with the **same number of rows** and **non-overlapping columns** can be **horizontally** concatenated to make a **wider** dataframe +- two dataframes with **different numbers of rows and columns** can be **diagonally** concatenated to make a dataframe which might be longer and/ or wider. Where column names overlap values will be vertically concatenated. Where column names do not overlap new rows and columns will be added. Missing values will be set as `null` + +## Vertical concatenation - getting longer + +In a vertical concatenation you combine all of the rows from a list of `DataFrames` into a single longer `DataFrame`. + +{{code_block('user-guide/transformations/concatenation','vertical',['concat'])}} + +```python exec="on" result="text" session="user-guide/transformations/concatenation" +--8<-- "python/user-guide/transformations/concatenation.py:setup" +--8<-- "python/user-guide/transformations/concatenation.py:vertical" +``` + +Vertical concatenation fails when the dataframes do not have the same column names. + +## Horizontal concatenation - getting wider + +In a horizontal concatenation you combine all of the columns from a list of `DataFrames` into a single wider `DataFrame`. 
+
+{{code_block('user-guide/transformations/concatenation','horizontal',['concat'])}}
+
+```python exec="on" result="text" session="user-guide/transformations/concatenation"
+--8<-- "python/user-guide/transformations/concatenation.py:horizontal"
+```
+
+Horizontal concatenation fails when dataframes have overlapping columns or a different number of rows.
+
+## Diagonal concatenation - getting longer, wider and `null`ier
+
+In a diagonal concatenation you combine all of the rows and columns from a list of `DataFrames` into a single longer and/or wider `DataFrame`.
+
+{{code_block('user-guide/transformations/concatenation','cross',['concat'])}}
+
+```python exec="on" result="text" session="user-guide/transformations/concatenation"
+--8<-- "python/user-guide/transformations/concatenation.py:cross"
+```
+
+Diagonal concatenation generates nulls when the column names do not overlap.
+
+When the dataframe shapes do not match and we have an overlapping semantic key, then [we can join the dataframes](joins.md) instead of concatenating them.
+
+## Rechunking
+
+Before a concatenation we have two dataframes `df1` and `df2`. Each column in `df1` and `df2` is in one or more chunks in memory. By default, during concatenation the chunks in each column are copied to a single new chunk - this is known as **rechunking**. Rechunking is an expensive operation, but is often worth it because future operations will be faster.
+If you do not want Polars to rechunk the concatenated `DataFrame`, you can specify `rechunk = False` when doing the concatenation.
diff --git a/docs/user-guide/transformations/joins.md b/docs/user-guide/transformations/joins.md
new file mode 100644
index 000000000000..ad233cf060fb
--- /dev/null
+++ b/docs/user-guide/transformations/joins.md
@@ -0,0 +1,183 @@
+# Joins
+
+## Join strategies
+
+`Polars` supports the following join strategies by specifying the `strategy` argument:
+
+| Strategy | Description |
+| -------- | ----------- |
+| `inner` | Returns rows with matching keys in _both_ frames. Non-matching rows in either the left or right frame are discarded. |
+| `left` | Returns all rows in the left dataframe, whether or not a match in the right frame is found. Non-matching rows have their right columns null-filled. |
+| `outer` | Returns all rows from both the left and right dataframe. If no match is found in one frame, columns from the other frame are null-filled. |
+| `cross` | Returns the Cartesian product of all rows from the left frame with all rows from the right frame. Duplicate rows are retained; the table length of `A` cross-joined with `B` is always `len(A) × len(B)`. |
+| `asof` | A left-join in which the match is performed on the _nearest_ key rather than on equal keys. |
+| `semi` | Returns all rows from the left frame in which the join key is also present in the right frame. |
+| `anti` | Returns all rows from the left frame in which the join key is _not_ present in the right frame. |
+
+### Inner join
+
+An `inner` join produces a `DataFrame` that contains only the rows where the join key exists in both `DataFrames`.
Let's take for example the following two `DataFrames`: + +{{code_block('user-guide/transformations/joins','innerdf',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:setup" +--8<-- "python/user-guide/transformations/joins.py:innerdf" +``` + +

+ +{{code_block('user-guide/transformations/joins','innerdf2',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:innerdf2" +``` + +To get a `DataFrame` with the orders and their associated customer we can do an `inner` join on the `customer_id` column: + +{{code_block('user-guide/transformations/joins','inner',['join'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:inner" +``` + +### Left join + +The `left` join produces a `DataFrame` that contains all the rows from the left `DataFrame` and only the rows from the right `DataFrame` where the join key exists in the left `DataFrame`. If we now take the example from above and want to have a `DataFrame` with all the customers and their associated orders (regardless of whether they have placed an order or not) we can do a `left` join: + +{{code_block('user-guide/transformations/joins','left',['join'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:left" +``` + +Notice, that the fields for the customer with the `customer_id` of `3` are null, as there are no orders for this customer. + +### Outer join + +The `outer` join produces a `DataFrame` that contains all the rows from both `DataFrames`. Columns are null, if the join key does not exist in the source `DataFrame`. Doing an `outer` join on the two `DataFrames` from above produces a similar `DataFrame` to the `left` join: + +{{code_block('user-guide/transformations/joins','outer',['join'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:outer" +``` + +### Cross join + +A `cross` join is a cartesian product of the two `DataFrames`. This means that every row in the left `DataFrame` is joined with every row in the right `DataFrame`. The `cross` join is useful for creating a `DataFrame` with all possible combinations of the columns in two `DataFrames`. Let's take for example the following two `DataFrames`. + +{{code_block('user-guide/transformations/joins','df3',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df3" +``` + +

+ +{{code_block('user-guide/transformations/joins','df4',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df4" +``` + +We can now create a `DataFrame` containing all possible combinations of the colors and sizes with a `cross` join: + +{{code_block('user-guide/transformations/joins','cross',['join'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:cross" +``` + +
+ +The `inner`, `left`, `outer` and `cross` join strategies are standard amongst dataframe libraries. We provide more details on the less familiar `semi`, `anti` and `asof` join strategies below. + +### Semi join + +The `semi` join returns all rows from the left frame in which the join key is also present in the right frame. Consider the following scenario: a car rental company has a `DataFrame` showing the cars that it owns with each car having a unique `id`. + +{{code_block('user-guide/transformations/joins','df5',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df5" +``` + +The company has another `DataFrame` showing each repair job carried out on a vehicle. + +{{code_block('user-guide/transformations/joins','df6',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df6" +``` + +You want to answer this question: which of the cars have had repairs carried out? + +An inner join does not answer this question directly as it produces a `DataFrame` with multiple rows for each car that has had multiple repair jobs: + +{{code_block('user-guide/transformations/joins','inner2',['join'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:inner2" +``` + +However, a semi join produces a single row for each car that has had a repair job carried out. + +{{code_block('user-guide/transformations/joins','semi',['join'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:semi" +``` + +### Anti join + +Continuing this example, an alternative question might be: which of the cars have **not** had a repair job carried out? An anti join produces a `DataFrame` showing all the cars from `df_cars` where the `id` is not present in the `df_repairs` `DataFrame`. + +{{code_block('user-guide/transformations/joins','anti',['join'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:anti" +``` + +### Asof join + +An `asof` join is like a left join except that we match on nearest key rather than equal keys. +In `Polars` we can do an asof join with the `join` method and specifying `strategy="asof"`. However, for more flexibility we can use the `join_asof` method. + +Consider the following scenario: a stock market broker has a `DataFrame` called `df_trades` showing transactions it has made for different stocks. + +{{code_block('user-guide/transformations/joins','df7',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df7" +``` + +The broker has another `DataFrame` called `df_quotes` showing prices it has quoted for these stocks. + +{{code_block('user-guide/transformations/joins','df8',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/joins" +--8<-- "python/user-guide/transformations/joins.py:df8" +``` + +You want to produce a `DataFrame` showing for each trade the most recent quote provided _before_ the trade. You do this with `join_asof` (using the default `strategy = "backward"`). +To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the stock column with `by="stock"`. 
+
+{{code_block('user-guide/transformations/joins','asof',['join_asof'])}}
+
+```python exec="on" result="text" session="user-guide/transformations/joins"
+--8<-- "python/user-guide/transformations/joins.py:asofpre"
+--8<-- "python/user-guide/transformations/joins.py:asof"
+```
+
+If you want to make sure that only quotes within a certain time range are joined to the trades you can specify the `tolerance` argument. In this case we want to make sure that the last preceding quote is within 1 minute of the trade so we set `tolerance = "1m"`.
+
+=== ":fontawesome-brands-python: Python"
+
+```python
+--8<-- "python/user-guide/transformations/joins.py:asof2"
+```
+
+```python exec="on" result="text" session="user-guide/transformations/joins"
+--8<-- "python/user-guide/transformations/joins.py:asof2"
+```
diff --git a/docs/user-guide/transformations/melt.md b/docs/user-guide/transformations/melt.md
new file mode 100644
index 000000000000..3e6efe35723e
--- /dev/null
+++ b/docs/user-guide/transformations/melt.md
@@ -0,0 +1,21 @@
+# Melts
+
+Melt operations unpivot a DataFrame from wide format to long format.
+
+## Dataset
+
+{{code_block('user-guide/transformations/melt','df',['DataFrame'])}}
+
+```python exec="on" result="text" session="user-guide/transformations/melt"
+--8<-- "python/user-guide/transformations/melt.py:df"
+```
+
+## Eager + lazy
+
+`Eager` and `lazy` have the same API.
+
+{{code_block('user-guide/transformations/melt','melt',['melt'])}}
+
+```python exec="on" result="text" session="user-guide/transformations/melt"
+--8<-- "python/user-guide/transformations/melt.py:melt"
+```
diff --git a/docs/user-guide/transformations/pivot.md b/docs/user-guide/transformations/pivot.md
new file mode 100644
index 000000000000..9850dbed0330
--- /dev/null
+++ b/docs/user-guide/transformations/pivot.md
@@ -0,0 +1,46 @@
+# Pivots
+
+Pivot a column in a `DataFrame` and perform one of the following aggregations:
+
+- first
+- sum
+- min
+- max
+- mean
+- median
+
+The pivot operation consists of a group by on one or multiple columns (these will be the
+new y-axis), the column that will be pivoted (this will be the new x-axis) and an
+aggregation.
+
+## Dataset
+
+{{code_block('user-guide/transformations/pivot','df',['DataFrame'])}}
+
+```python exec="on" result="text" session="user-guide/transformations/pivot"
+--8<-- "python/user-guide/transformations/pivot.py:setup"
+--8<-- "python/user-guide/transformations/pivot.py:df"
+```
+
+## Eager
+
+{{code_block('user-guide/transformations/pivot','eager',['pivot'])}}
+
+```python exec="on" result="text" session="user-guide/transformations/pivot"
+--8<-- "python/user-guide/transformations/pivot.py:eager"
+```
+
+## Lazy
+
+A Polars `LazyFrame` always needs to know the schema of a computation statically (before collecting the query).
+A pivot's output schema depends on the data, and it is therefore impossible to determine the schema without
+running the query.
+
+Polars could have abstracted this fact for you just like Spark does, but we don't want you to shoot yourself in the foot
+with a shotgun. The cost should be clear upfront.
+ +{{code_block('user-guide/transformations/pivot','lazy',['pivot'])}} + +```python exec="on" result="text" session="user-guide/transformations/pivot" +--8<-- "python/user-guide/transformations/pivot.py:lazy" +``` diff --git a/docs/user-guide/transformations/time-series/filter.md b/docs/user-guide/transformations/time-series/filter.md new file mode 100644 index 000000000000..326969c34e11 --- /dev/null +++ b/docs/user-guide/transformations/time-series/filter.md @@ -0,0 +1,48 @@ +# Filtering + +Filtering date columns works in the same way as with other types of columns using the `.filter` method. + +Polars uses Python's native `datetime`, `date` and `timedelta` for equality comparisons between the datatypes `pl.Datetime`, `pl.Date` and `pl.Duration`. + +In the following example we use a time series of Apple stock prices. + +{{code_block('user-guide/transformations/time-series/filter','df',['read_csv'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/filter" +--8<-- "python/user-guide/transformations/time-series/filter.py:df" +``` + +## Filtering by single dates + +We can filter by a single date by casting the desired date string to a `Date` object +in a filter expression: + +{{code_block('user-guide/transformations/time-series/filter','filter',['filter'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/filter" +--8<-- "python/user-guide/transformations/time-series/filter.py:filter" +``` + +Note we are using the lowercase `datetime` method rather than the uppercase `Datetime` data type. + +## Filtering by a date range + +We can filter by a range of dates using the `is_between` method in a filter expression with the start and end dates: + +{{code_block('user-guide/transformations/time-series/filter','range',['filter','is_between'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/filter" +--8<-- "python/user-guide/transformations/time-series/filter.py:range" +``` + +## Filtering with negative dates + +Say you are working with an archeologist and are dealing in negative dates. +Polars can parse and store them just fine, but the Python `datetime` library +does not. So for filtering, you should use attributes in the `.dt` namespace: + +{{code_block('user-guide/transformations/time-series/filter','negative',['strptime'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/filter" +--8<-- "python/user-guide/transformations/time-series/filter.py:negative" +``` diff --git a/docs/user-guide/transformations/time-series/parsing.md b/docs/user-guide/transformations/time-series/parsing.md new file mode 100644 index 000000000000..a31095d07434 --- /dev/null +++ b/docs/user-guide/transformations/time-series/parsing.md @@ -0,0 +1,58 @@ +# Parsing + +Polars has native support for parsing time series data and doing more sophisticated operations such as temporal grouping and resampling. + +## Datatypes + +`Polars` has the following datetime datatypes: + +- `Date`: Date representation e.g. 2014-07-08. It is internally represented as days since UNIX epoch encoded by a 32-bit signed integer. +- `Datetime`: Datetime representation e.g. 2014-07-08 07:00:00. It is internally represented as a 64 bit integer since the Unix epoch and can have different units such as ns, us, ms. +- `Duration`: A time delta type that is created when subtracting `Date/Datetime`. Similar to `timedelta` in python. +- `Time`: Time representation, internally represented as nanoseconds since midnight. 
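+
+As a quick illustration (a minimal sketch, assuming only an installed `polars` and the standard library `datetime` module), each of these datatypes can be constructed directly from Python objects:
+
+```python
+from datetime import date, datetime, time, timedelta
+
+import polars as pl
+
+df = pl.DataFrame(
+    {
+        "date": [date(2014, 7, 8)],
+        "datetime": [datetime(2014, 7, 8, 7, 0)],
+        "duration": [timedelta(days=1, hours=2)],
+        "time": [time(7, 0)],
+    }
+)
+# The schema maps each column to Date, Datetime, Duration and Time respectively.
+print(df.schema)
+```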
+ +## Parsing dates from a file + +When loading from a CSV file `Polars` attempts to parse dates and times if the `try_parse_dates` flag is set to `True`: + +{{code_block('user-guide/transformations/time-series/parsing','df',['read_csv'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/parsing" +--8<-- "python/user-guide/transformations/time-series/parsing.py:setup" +--8<-- "python/user-guide/transformations/time-series/parsing.py:df" +``` + +On the other hand binary formats such as parquet have a schema that is respected by `Polars`. + +## Casting strings to dates + +You can also cast a column of datetimes encoded as strings to a datetime type. You do this by calling the string `str.strptime` method and passing the format of the date string: + +{{code_block('user-guide/transformations/time-series/parsing','cast',['read_csv','strptime'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/parsing" +--8<-- "python/user-guide/transformations/time-series/parsing.py:cast" +``` + +[The strptime date formats can be found here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html). + +## Extracting date features from a date column + +You can extract data features such as the year or day from a date column using the `.dt` namespace on a date column: + +{{code_block('user-guide/transformations/time-series/parsing','extract',['year'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/parsing" +--8<-- "python/user-guide/transformations/time-series/parsing.py:extract" +``` + +## Mixed offsets + +If you have mixed offsets (say, due to crossing daylight saving time), +then you can use `utc=True` and then convert to your time zone: + +{{code_block('user-guide/transformations/time-series/parsing','mixed',['strptime','convert_time_zone'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/parsing" +--8<-- "python/user-guide/transformations/time-series/parsing.py:mixed" +``` diff --git a/docs/user-guide/transformations/time-series/resampling.md b/docs/user-guide/transformations/time-series/resampling.md new file mode 100644 index 000000000000..63ad583a9bec --- /dev/null +++ b/docs/user-guide/transformations/time-series/resampling.md @@ -0,0 +1,42 @@ +# Resampling + +We can resample by either: + +- upsampling (moving data to a higher frequency) +- downsampling (moving data to a lower frequency) +- combinations of these e.g. first upsample and then downsample + +## Downsampling to a lower frequency + +`Polars` views downsampling as a special case of the **group_by** operation and you can do this with `group_by_dynamic` and `group_by_rolling` - [see the temporal group by page for examples](rolling.md). + +## Upsampling to a higher frequency + +Let's go through an example where we generate data at 30 minute intervals: + +{{code_block('user-guide/transformations/time-series/resampling','df',['DataFrame','date_range'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/resampling" +--8<-- "python/user-guide/transformations/time-series/resampling.py:setup" +--8<-- "python/user-guide/transformations/time-series/resampling.py:df" +``` + +Upsampling can be done by defining the new sampling interval. By upsampling we are adding in extra rows where we do not have data. As such upsampling by itself gives a DataFrame with nulls. These nulls can then be filled with a fill strategy or interpolation. 
+ +### Upsampling strategies + +In this example we upsample from the original 30 minutes to 15 minutes and then use a `forward` strategy to replace the nulls with the previous non-null value: + +{{code_block('user-guide/transformations/time-series/resampling','upsample',['upsample'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/resampling" +--8<-- "python/user-guide/transformations/time-series/resampling.py:upsample" +``` + +In this example we instead fill the nulls by linear interpolation: + +{{code_block('user-guide/transformations/time-series/resampling','upsample2',['upsample','interpolate','fill_null'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/resampling" +--8<-- "python/user-guide/transformations/time-series/resampling.py:upsample2" +``` diff --git a/docs/user-guide/transformations/time-series/rolling.md b/docs/user-guide/transformations/time-series/rolling.md new file mode 100644 index 000000000000..a88373caada2 --- /dev/null +++ b/docs/user-guide/transformations/time-series/rolling.md @@ -0,0 +1,148 @@ +# Grouping + +## Grouping by fixed windows + +We can calculate temporal statistics using `group_by_dynamic` to group rows into days/months/years etc. + +### Annual average example + +In following simple example we calculate the annual average closing price of Apple stock prices. We first load the data from CSV: + +{{code_block('user-guide/transformations/time-series/rolling','df',['upsample'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/rolling" +--8<-- "python/user-guide/transformations/time-series/rolling.py:setup" +--8<-- "python/user-guide/transformations/time-series/rolling.py:df" +``` + +!!! info + + The dates are sorted in ascending order - if they are not sorted in this way the `group_by_dynamic` output will not be correct! + +To get the annual average closing price we tell `group_by_dynamic` that we want to: + +- group by the `Date` column on an annual (`1y`) basis +- take the mean values of the `Close` column for each year: + +{{code_block('user-guide/transformations/time-series/rolling','group_by',['group_by_dynamic'])}} + +The annual average closing price is then: + +```python exec="on" result="text" session="user-guide/transformations/ts/rolling" +--8<-- "python/user-guide/transformations/time-series/rolling.py:group_by" +``` + +### Parameters for `group_by_dynamic` + +A dynamic window is defined by a: + +- **every**: indicates the interval of the window +- **period**: indicates the duration of the window +- **offset**: can be used to offset the start of the windows + +The value for `every` sets how often the groups start. The time period values are flexible - for example we could take: + +- the average over 2 year intervals by replacing `1y` with `2y` +- the average over 18 month periods by replacing `1y` with `1y6mo` + +We can also use the `period` parameter to set how long the time period for each group is. For example, if we set the `every` parameter to be `1y` and the `period` parameter to be `2y` then we would get groups at one year intervals where each groups spanned two years. + +If the `period` parameter is not specified then it is set equal to the `every` parameter so that if the `every` parameter is set to be `1y` then each group spans `1y` as well. + +Because _**every**_ does not have to be equal to _**period**_, we can create many groups in a very flexible way. They may overlap +or leave boundaries between them. 
+
+Let's see how the windows for some parameter combinations would look. Let's start out boring. 🥱
+
+- every: 1 day -> `"1d"`
+- period: 1 day -> `"1d"`
+
+```text
+this creates adjacent windows of the same size
+|--|
+   |--|
+      |--|
+```
+
+- every: 1 day -> `"1d"`
+- period: 2 days -> `"2d"`
+
+```text
+these windows have an overlap of 1 day
+|----|
+   |----|
+      |----|
+```
+
+- every: 2 days -> `"2d"`
+- period: 1 day -> `"1d"`
+
+```text
+this would leave gaps between the windows
+data points that fall in these gaps will not be a member of any group
+|--|
+      |--|
+            |--|
+```
+
+#### `truncate`
+
+The `truncate` parameter is a Boolean variable that determines what datetime value is associated with each group in the output. In the example above the first data point is on 23rd February 1981. If `truncate = True` (the default) then the date for the first year in the annual average is 1st January 1981. However, if `truncate = False` then the date for the first year in the annual average is the date of the first data point, 23rd February 1981. Note that `truncate` only affects what's shown in the
+`Date` column and does not affect the window boundaries.
+
+### Using expressions in `group_by_dynamic`
+
+We aren't restricted to using simple aggregations like `mean` in a group by operation - we can use the full range of expressions available in Polars.
+
+In the snippet below we create a date range with every **day** (`"1d"`) in 2021 and turn this into a `DataFrame`.
+
+Then in the `group_by_dynamic` we create dynamic windows that start every **month** (`"1mo"`) and have a window length of `1` month. The values that match these dynamic windows are then assigned to that group and can be aggregated with the powerful expression API.
+
+Below we show an example where we use **group_by_dynamic** to compute:
+
+- the number of days until the end of the month
+- the number of days in a month
+
+{{code_block('user-guide/transformations/time-series/rolling','group_by_dyn',['group_by_dynamic','explode','date_range'])}}
+
+```python exec="on" result="text" session="user-guide/transformations/ts/rolling"
+--8<-- "python/user-guide/transformations/time-series/rolling.py:group_by_dyn"
+```
+
+## Grouping by rolling windows
+
+The rolling group by, `group_by_rolling`, is another entrance to the `group_by` context. But unlike `group_by_dynamic`, the windows are
+not fixed by the `every` and `period` parameters. In a rolling group by, the windows are not fixed at all! They are determined
+by the values in the `index_column`.
+
+So imagine having a time column with the values `{2021-01-06, 2021-01-10}` and `period="5d"`; this would create the following
+windows:
+
+```text
+2021-01-01 2021-01-06
+    |----------|
+
+        2021-01-05 2021-01-10
+             |----------|
+```
+
+Because the windows of a rolling group by are always determined by the values in the `DataFrame` column, the number of
+groups is always equal to the number of rows in the original `DataFrame`.
+
+## Combining group by operations
+
+Rolling and dynamic group by operations can be combined with normal group by operations.
+
+Below is an example with a dynamic group by.
+ +{{code_block('user-guide/transformations/time-series/rolling','group_by_roll',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/rolling" +--8<-- "python/user-guide/transformations/time-series/rolling.py:group_by_roll" +``` + +{{code_block('user-guide/transformations/time-series/rolling','group_by_dyn2',['group_by_dynamic'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/rolling" +--8<-- "python/user-guide/transformations/time-series/rolling.py:group_by_dyn2" +``` diff --git a/docs/user-guide/transformations/time-series/timezones.md b/docs/user-guide/transformations/time-series/timezones.md new file mode 100644 index 000000000000..48f6870e8b20 --- /dev/null +++ b/docs/user-guide/transformations/time-series/timezones.md @@ -0,0 +1,46 @@ +--- +hide: + - toc +--- + +# Time zones + +!!! quote "Tom Scott" + + You really should never, ever deal with time zones if you can help it. + +The `Datetime` datatype can have a time zone associated with it. +Examples of valid time zones are: + +- `None`: no time zone, also known as "time zone naive"; +- `UTC`: Coordinated Universal Time; +- `Asia/Kathmandu`: time zone in "area/location" format. + See the [list of tz database time zones](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) + to see what's available; +- `+01:00`: fixed offsets. May be useful when parsing, but you almost certainly want the "Area/Location" + format above instead as it will deal with irregularities such as DST (Daylight Saving Time) for you. + +Note that, because a `Datetime` can only have a single time zone, it is +impossible to have a column with multiple time zones. If you are parsing data +with multiple offsets, you may want to pass `utc=True` to convert +them all to a common time zone (`UTC`), see [parsing dates and times](parsing.md). 
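+
+As a rough sketch of that workflow (this is not one of the guide's executed snippets; the sample strings are made up, and the `utc=True` flag and method names follow the mixed-offsets example on the parsing page linked above):
+
+```python
+import polars as pl
+
+# Strings with mixed UTC offsets, e.g. recorded around a DST transition
+mixed = pl.Series(["2021-03-27T00:00:00+0100", "2021-03-28T00:00:00+0200"])
+
+parsed = (
+    mixed
+    # parse everything into a single UTC-aware Datetime column
+    .str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%z", utc=True)
+    # then convert to the time zone you actually want to work in
+    .dt.convert_time_zone("Europe/Brussels")
+)
+```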
+ +The main methods for setting and converting between time zones are: + +- `dt.convert_time_zone`: convert from one time zone to another; +- `dt.replace_time_zone`: set/unset/change time zone; + +Let's look at some examples of common operations: + +{{code_block('user-guide/transformations/time-series/timezones','example',['strptime','replace_time_zone'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/timezones" +--8<-- "python/user-guide/transformations/time-series/timezones.py:setup" +--8<-- "python/user-guide/transformations/time-series/timezones.py:example" +``` + +{{code_block('user-guide/transformations/time-series/timezones','example2',['convert_time_zone','replace_time_zone'])}} + +```python exec="on" result="text" session="user-guide/transformations/ts/timezones" +--8<-- "python/user-guide/transformations/time-series/timezones.py:example2" +``` diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000000..65e961b13225 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,163 @@ +# https://www.mkdocs.org/user-guide/configuration/ + +# Project information +site_name: Polars documentation +site_url: https://pola-rs.github.io/polars +repo_url: https://github.com/pola-rs/polars +repo_name: pola-rs/polars + +# Documentation layout +nav: + - Home: index.md + - Getting started: + - getting-started/intro.md + - getting-started/installation.md + - getting-started/series-dataframes.md + - getting-started/reading-writing.md + - getting-started/expressions.md + - getting-started/joins.md + - User guide: + - user-guide/index.md + - user-guide/installation.md + - Concepts: + - user-guide/concepts/data-types.md + - user-guide/concepts/data-structures.md + - user-guide/concepts/contexts.md + - user-guide/concepts/expressions.md + - user-guide/concepts/lazy-vs-eager.md + - user-guide/concepts/streaming.md + - Expressions: + - user-guide/expressions/operators.md + - user-guide/expressions/column-selections.md + - user-guide/expressions/functions.md + - user-guide/expressions/casting.md + - user-guide/expressions/strings.md + - user-guide/expressions/aggregation.md + - user-guide/expressions/null.md + - user-guide/expressions/window.md + - user-guide/expressions/folds.md + - user-guide/expressions/lists.md + - user-guide/expressions/user-defined-functions.md + - user-guide/expressions/structs.md + - user-guide/expressions/numpy.md + - Transformations: + - user-guide/transformations/joins.md + - user-guide/transformations/concatenation.md + - user-guide/transformations/pivot.md + - user-guide/transformations/melt.md + - Time series: + - user-guide/transformations/time-series/parsing.md + - user-guide/transformations/time-series/filter.md + - user-guide/transformations/time-series/rolling.md + - user-guide/transformations/time-series/resampling.md + - user-guide/transformations/time-series/timezones.md + - Lazy API: + - user-guide/lazy/using.md + - user-guide/lazy/optimizations.md + - user-guide/lazy/schemas.md + - user-guide/lazy/query_plan.md + - user-guide/lazy/execution.md + - user-guide/lazy/streaming.md + - IO: + - user-guide/io/csv.md + - user-guide/io/parquet.md + - user-guide/io/json_file.md + - user-guide/io/multiple.md + - user-guide/io/database.md + - user-guide/io/aws.md + - user-guide/io/bigquery.md + - SQL: + - user-guide/sql/intro.md + - user-guide/sql/show.md + - user-guide/sql/select.md + - user-guide/sql/create.md + - user-guide/sql/cte.md + - Migrating: + - user-guide/migration/pandas.md + - user-guide/migration/spark.md + - Misc: + - 
user-guide/misc/multiprocessing.md + - user-guide/misc/alternatives.md + - user-guide/misc/reference-guides.md + - user-guide/misc/contributing.md +not_in_nav: | + /_build/ + people.md +validation: + links: + # Allow an absolute link to the features page for our code snippets + absolute_links: ignore + +# Build directories +theme: + name: material + locale: en + custom_dir: docs/_build/overrides + palette: + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark mode + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to light mode + logo: _build/assets/logo.png + features: + - navigation.tracking + - navigation.instant + - navigation.tabs + - navigation.tabs.sticky + - navigation.footer + - content.tabs.link + icon: + repo: fontawesome/brands/github + +extra_css: + - _build/css/extra.css +extra: + consent: + title: Cookie consent + description: >- + We use cookies to recognize your repeated visits and preferences, as well + as to measure the effectiveness of our documentation and whether users + find what they're searching for. With your consent, you're helping us to + make our documentation better. + analytics: + provider: google + property: G-LKNVFWD3T5 + +# Preview controls +# TODO: Fix warnings and turn on strict mode +strict: false + +# Formatting options +markdown_extensions: + - admonition + - pymdownx.details + - attr_list + - pymdownx.emoji: + emoji_index: !!python/name:materialx.emoji.twemoji + emoji_generator: !!python/name:materialx.emoji.to_svg + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + - pymdownx.snippets: + base_path: ['.','docs/src/'] + check_paths: true + dedent_subsections: true + - footnotes + +hooks: + - docs/_build/scripts/people.py + +plugins: + - search: + lang: en + - markdown-exec + - macros: + module_name: docs/_build/scripts/macro diff --git a/py-polars/Makefile b/py-polars/Makefile index f5fd7c4404cd..74336116b0d0 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -76,6 +76,7 @@ test: .venv build ## Run fast unittests .PHONY: doctest doctest: .venv build ## Run doctests $(VENV_BIN)/python tests/docs/run_doctest.py + $(VENV_BIN)/pytest tests/docs/test_user_guide.py -m docs .PHONY: test-all test-all: .venv build ## Run all tests diff --git a/py-polars/docs/source/conf.py b/py-polars/docs/source/conf.py index 2e3f1b34290a..e4f3a71c516d 100644 --- a/py-polars/docs/source/conf.py +++ b/py-polars/docs/source/conf.py @@ -106,7 +106,7 @@ "external_links": [ { "name": "User Guide", - "url": f"{web_root}/polars-book/user-guide/index.html", + "url": f"{web_root}/polars/user-guide/index.html", }, { "name": "Powered by Xomnia", diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 916b963497ce..5524132bc6d2 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3694,7 +3694,7 @@ def map_batches( represented by an expression using a third-party library. Read more in `the book - `_. + `_. 
         Parameters
         ----------
diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml
index f0048981a912..9ee24aeebd23 100644
--- a/py-polars/pyproject.toml
+++ b/py-polars/pyproject.toml
@@ -191,6 +191,7 @@ markers = [
     "write_disk: Tests that write to disk",
     "slow: Tests with a longer than average runtime.",
     "benchmark: Tests that should be run on a Polars release build.",
+    "docs: Documentation code snippets",
 ]
 filterwarnings = [
     # Fail on warnings...
@@ -198,6 +199,7 @@ filterwarnings = [
     # ...except where it prevents test debugging in an IPython console
     "ignore:.*unrecognized arguments.*PyDevIPCompleter:DeprecationWarning",
     "ignore:.*is_sparse is deprecated.*:FutureWarning",
+    "ignore:Matplotlib is currently using agg:UserWarning",
 ]
 xfail_strict = true
diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt
index 692095f66151..1b675ab8ceb6 100644
--- a/py-polars/requirements-dev.txt
+++ b/py-polars/requirements-dev.txt
@@ -36,6 +36,8 @@ XlsxWriter
 deltalake == 0.10.1
 # Dataframe interchange protocol
 dataframe-api-compat >= 0.1.6
+# Other
+matplotlib
 
 # -------
 # TOOLING
diff --git a/py-polars/tests/docs/test_user_guide.py b/py-polars/tests/docs/test_user_guide.py
new file mode 100644
index 000000000000..032961dd936a
--- /dev/null
+++ b/py-polars/tests/docs/test_user_guide.py
@@ -0,0 +1,32 @@
+"""Run all Python code snippets."""
+import os
+import runpy
+from pathlib import Path
+from typing import Iterator
+
+import matplotlib
+import pytest
+
+# Do not show plots
+matplotlib.use("Agg")
+
+# Get paths to Python code snippets
+repo_root = Path(__file__).parent.parent.parent.parent
+python_snippets_dir = repo_root / "docs" / "src" / "python"
+snippet_paths = list(python_snippets_dir.rglob("*.py"))
+
+
+@pytest.fixture(scope="module")
+def _change_test_dir() -> Iterator[None]:
+    """Change path to repo root to accommodate data paths in code snippets."""
+    current_path = Path.cwd()  # remember the original working directory so it can be restored
+    os.chdir(repo_root)
+    yield
+    os.chdir(current_path)
+
+
+@pytest.mark.docs()
+@pytest.mark.parametrize("path", snippet_paths)
+@pytest.mark.usefixtures("_change_test_dir")
+def test_run_python_snippets(path: Path) -> None:
+    runpy.run_path(str(path))