From 18dd7d2aca489f1ebef2ce128067beba9daf8dec Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Sun, 3 Mar 2024 11:47:38 -0700
Subject: [PATCH] ruff format

---
 .github/workflows/test.yaml                   |  7 ------
 .pre-commit-config.yaml                       | 21 ++++++----------
 README.md                                     | 21 +++++++++++++---
 benchmarks/db-benchmark/groupby-datafusion.py | 16 ++++---------
 benchmarks/db-benchmark/join-datafusion.py    | 24 ++++---------------
 benchmarks/tpch/tpch.py                       |  4 +---
 ci/scripts/python_lint.sh                     | 22 +++++++++++++++++
 datafusion/tests/test_dataframe.py            |  4 ++--
 dev/release/check-rat-report.py               |  4 +---
 dev/release/generate-changelog.py             |  8 ++-----
 examples/sql-on-polars.py                     |  4 +---
 examples/sql-using-python-udaf.py             |  8 ++-----
 examples/substrait.py                         |  8 ++-----
 13 files changed, 66 insertions(+), 85 deletions(-)
 create mode 100755 ci/scripts/python_lint.sh

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 371f0797..291120bf 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -103,13 +103,6 @@ jobs:
           source venv/bin/activate
           pip install -r requirements-311.txt
 
-      - name: Run Python Linters
-        if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }}
-        run: |
-          source venv/bin/activate
-          flake8 --exclude venv,benchmarks/db-benchmark --ignore=E501,W503
-          black --line-length 79 --diff --check .
-
       - name: Run tests
         env:
           RUST_BACKTRACE: 1
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 39049bf4..8509fae2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,21 +20,14 @@ repos:
     rev: v1.6.23
     hooks:
       - id: actionlint-docker
-  - repo: https://github.com/psf/black
-    rev: 22.3.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.3.0
     hooks:
-      - id: black
-        files: datafusion/.*
-        # Explicitly specify the pyproject.toml at the repo root, not per-project.
-        args: ["--config", "pyproject.toml"]
-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        files: datafusion/.*$
-        types: [file]
-        types_or: [python]
-        additional_dependencies: ["flake8-force"]
+      # Run the linter.
+      - id: ruff
+      # Run the formatter.
+      - id: ruff-format
   - repo: local
     hooks:
      - id: rust-fmt
diff --git a/README.md b/README.md
index a682f73d..a2e3efd4 100644
--- a/README.md
+++ b/README.md
@@ -202,7 +202,7 @@ source venv/bin/activate
 # update pip itself if necessary
 python -m pip install -U pip
 # install dependencies (for Python 3.8+)
-python -m pip install -r requirements-310.txt
+python -m pip install -r requirements.in
 ```
 
 The tests rely on test data in git submodules.
@@ -222,12 +222,27 @@ python -m pytest
 
 ### Running & Installing pre-commit hooks
 
-arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keeping PRs clean and concise.
+arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce
+the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the
+developer but certainly helpful for keeping PRs clean and concise.
 
-Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete the commit if an offending lint is found allowing you to make changes locally before pushing.
+Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in
+your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete
+the commit if an offending lint is found, allowing you to make changes locally before pushing.
 
 The pre-commit hooks can also be run adhoc without installing them by simply running `pre-commit run --all-files`
 
+## Running linters without using pre-commit
+
+There are scripts in `ci/scripts` for running Rust and Python linters.
+
+```shell
+./ci/scripts/python_lint.sh
+./ci/scripts/rust_clippy.sh
+./ci/scripts/rust_fmt.sh
+./ci/scripts/rust_toml_fmt.sh
+```
+
 ## How to update dependencies
 
 To change test dependencies, change the `requirements.in` and run
diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py
index 2c35259e..3a4399f7 100644
--- a/benchmarks/db-benchmark/groupby-datafusion.py
+++ b/benchmarks/db-benchmark/groupby-datafusion.py
@@ -79,17 +79,13 @@ def execute(df):
 
 data = pacsv.read_csv(
     src_grp,
-    convert_options=pacsv.ConvertOptions(
-        auto_dict_encode=True, column_types=schema
-    ),
+    convert_options=pacsv.ConvertOptions(auto_dict_encode=True, column_types=schema),
 )
 print("dataset loaded")
 
 # create a session context with explicit runtime and config settings
 runtime = (
-    RuntimeConfig()
-    .with_disk_manager_os()
-    .with_fair_spill_pool(64 * 1024 * 1024 * 1024)
+    RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(64 * 1024 * 1024 * 1024)
 )
 config = (
     SessionConfig()
@@ -116,9 +112,7 @@ def execute(df):
 if sql:
     df = ctx.sql("SELECT id1, SUM(v1) AS v1 FROM x GROUP BY id1")
 else:
-    df = ctx.table("x").aggregate(
-        [f.col("id1")], [f.sum(f.col("v1")).alias("v1")]
-    )
+    df = ctx.table("x").aggregate([f.col("id1")], [f.sum(f.col("v1")).alias("v1")])
 
 ans = execute(df)
 shape = ans_shape(ans)
@@ -197,9 +191,7 @@ def execute(df):
 gc.collect()
 t_start = timeit.default_timer()
 if sql:
-    df = ctx.sql(
-        "SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3"
-    )
+    df = ctx.sql("SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3")
 else:
     df = ctx.table("x").aggregate(
         [f.col("id3")],
diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py
index 602cee69..4d59c7dc 100755
--- a/benchmarks/db-benchmark/join-datafusion.py
+++ b/benchmarks/db-benchmark/join-datafusion.py
@@ -152,11 +152,7 @@ def ans_shape(batches):
 print(f"q2: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -193,11 +189,7 @@ def ans_shape(batches):
 print(f"q3: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -234,11 +226,7 @@ def ans_shape(batches):
 print(f"q4: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -275,11 +263,7 @@ def ans_shape(batches):
 print(f"q5: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py
index ea830a1f..7f104a4c 100644
--- a/benchmarks/tpch/tpch.py
+++ b/benchmarks/tpch/tpch.py
@@ -83,9 +83,7 @@ def bench(data_path, query_path):
                 time_millis = (end - start) * 1000
                 total_time_millis += time_millis
                 print("q{},{}".format(query, round(time_millis, 1)))
-                results.write(
-                    "q{},{}\n".format(query, round(time_millis, 1))
-                )
+                results.write("q{},{}\n".format(query, round(time_millis, 1)))
                 results.flush()
             except Exception as e:
                 print("query", query, "failed", e)
diff --git a/ci/scripts/python_lint.sh b/ci/scripts/python_lint.sh
new file mode 100755
index 00000000..3f7310ba
--- /dev/null
+++ b/ci/scripts/python_lint.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+ruff format datafusion
+ruff check datafusion
\ No newline at end of file
diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py
index a39cb39d..c80653cc 100644
--- a/datafusion/tests/test_dataframe.py
+++ b/datafusion/tests/test_dataframe.py
@@ -409,8 +409,8 @@ def test_execution_plan(aggregate_df):
     plan = aggregate_df.execution_plan()
 
     expected = (
-        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[SUM(test.c2)]\n"
-    )  # noqa: E501
+        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[SUM(test.c2)]\n"  # noqa: E501
+    )
 
     assert expected == plan.display()
diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py
index 30a01116..d3dd7c5d 100644
--- a/dev/release/check-rat-report.py
+++ b/dev/release/check-rat-report.py
@@ -23,9 +23,7 @@
 import xml.etree.ElementTree as ET
 
 if len(sys.argv) != 3:
-    sys.stderr.write(
-        "Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0]
-    )
+    sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0])
     sys.exit(1)
 
 exclude_globs_filename = sys.argv[1]
diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py
index e97f0030..01d64066 100755
--- a/dev/release/generate-changelog.py
+++ b/dev/release/generate-changelog.py
@@ -27,9 +27,7 @@ def print_pulls(repo_name, title, pulls):
         print("**{}:**".format(title))
         print()
         for pull, commit in pulls:
-            url = "https://github.com/{}/pull/{}".format(
-                repo_name, pull.number
-            )
+            url = "https://github.com/{}/pull/{}".format(repo_name, pull.number)
             print(
                 "- {} [#{}]({}) ({})".format(
                     pull.title, pull.number, url, commit.author.login
@@ -40,9 +38,7 @@ def print_pulls(repo_name, title, pulls):
 
 def generate_changelog(repo, repo_name, tag1, tag2):
     # get a list of commits between two tags
-    print(
-        f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr
-    )
+    print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr)
     comparison = repo.compare(tag1, tag2)
 
     # get the pull requests for these commits
diff --git a/examples/sql-on-polars.py b/examples/sql-on-polars.py
index dd7a9e02..ffcb12b7 100644
--- a/examples/sql-on-polars.py
+++ b/examples/sql-on-polars.py
@@ -20,7 +20,5 @@
 
 ctx = SessionContext()
 ctx.register_table("taxi", "yellow_tripdata_2021-01.parquet")
-df = ctx.sql(
-    "select passenger_count, count(*) from taxi group by passenger_count"
-)
+df = ctx.sql("select passenger_count, count(*) from taxi group by passenger_count")
 print(df)
diff --git a/examples/sql-using-python-udaf.py b/examples/sql-using-python-udaf.py
index 3326c4a1..7ccf5d3c 100644
--- a/examples/sql-using-python-udaf.py
+++ b/examples/sql-using-python-udaf.py
@@ -30,15 +30,11 @@ def __init__(self):
 
     def update(self, values: pa.Array) -> None:
         # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
-        self._sum = pa.scalar(
-            self._sum.as_py() + pa.compute.sum(values).as_py()
-        )
+        self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(values).as_py())
 
     def merge(self, states: pa.Array) -> None:
         # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
-        self._sum = pa.scalar(
-            self._sum.as_py() + pa.compute.sum(states).as_py()
-        )
+        self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(states).as_py())
 
     def state(self) -> pa.Array:
         return pa.array([self._sum.as_py()])
diff --git a/examples/substrait.py b/examples/substrait.py
index c579751d..23cd7464 100644
--- a/examples/substrait.py
+++ b/examples/substrait.py
@@ -23,9 +23,7 @@
 ctx = SessionContext()
 
 # Register table with context
-ctx.register_csv(
-    "aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv"
-)
+ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv")
 
 substrait_plan = ss.substrait.serde.serialize_to_plan(
     "SELECT * FROM aggregate_test_data", ctx
@@ -49,9 +47,7 @@
 substrait_plan = ss.substrait.serde.deserialize_bytes(substrait_bytes)
 
 # type(df_logical_plan) ->
-df_logical_plan = ss.substrait.consumer.from_substrait_plan(
-    ctx, substrait_plan
-)
+df_logical_plan = ss.substrait.consumer.from_substrait_plan(ctx, substrait_plan)
 
 # Back to Substrait Plan just for demonstration purposes
 # type(substrait_plan) ->
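
A note on line width, since the removed and added tools default differently: the deleted CI step pinned black to `--line-length 79` and told flake8 to skip `E501`/`W503`, while `ruff format` defaults to 88 characters, which is why previously wrapped calls such as the `chk = df.aggregate(...)` chains now fit on a single line. If the project ever wanted the old 79-character width back, ruff (v0.3.0, as pinned in `.pre-commit-config.yaml` above) would read it from `pyproject.toml`. A minimal sketch of those settings follows; it is hypothetical and not something this patch adds:

    # pyproject.toml (hypothetical sketch -- not added by this patch)
    # Both `ruff format` and `ruff check` discover these settings automatically,
    # so pre-commit, ci/scripts/python_lint.sh, and ad-hoc local runs stay in sync.
    [tool.ruff]
    line-length = 79        # match the width the old `black --line-length 79` step enforced
    target-version = "py38" # assumption, based on the README's "for Python 3.8+" note

Because every entry point here runs bare `ruff format` / `ruff check`, centralizing options in `pyproject.toml` avoids repeating flags the way the deleted black/flake8 CI step did.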