
Commit

ruff format
andygrove committed Mar 3, 2024
1 parent dac3148 commit 18dd7d2
Showing 13 changed files with 66 additions and 85 deletions.
7 changes: 0 additions & 7 deletions .github/workflows/test.yaml
@@ -103,13 +103,6 @@ jobs:
           source venv/bin/activate
           pip install -r requirements-311.txt
-      - name: Run Python Linters
-        if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }}
-        run: |
-          source venv/bin/activate
-          flake8 --exclude venv,benchmarks/db-benchmark --ignore=E501,W503
-          black --line-length 79 --diff --check .
       - name: Run tests
         env:
           RUST_BACKTRACE: 1
21 changes: 7 additions & 14 deletions .pre-commit-config.yaml
@@ -20,21 +20,14 @@ repos:
     rev: v1.6.23
     hooks:
       - id: actionlint-docker
-  - repo: https://github.com/psf/black
-    rev: 22.3.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.3.0
     hooks:
-      - id: black
-        files: datafusion/.*
-        # Explicitly specify the pyproject.toml at the repo root, not per-project.
-        args: ["--config", "pyproject.toml", "--line-length", "79", "--diff", "--check", "."]
-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        files: datafusion/.*$
-        types: [file]
-        types_or: [python]
-        additional_dependencies: ["flake8-force"]
+      # Run the linter.
+      - id: ruff
+      # Run the formatter.
+      - id: ruff-format
   - repo: local
     hooks:
       - id: rust-fmt
21 changes: 18 additions & 3 deletions README.md
@@ -202,7 +202,7 @@ source venv/bin/activate
 # update pip itself if necessary
 python -m pip install -U pip
 # install dependencies (for Python 3.8+)
-python -m pip install -r requirements-310.txt
+python -m pip install -r requirements.in
 ```

 The tests rely on test data in git submodules.
@@ -222,12 +222,27 @@ python -m pytest

 ### Running & Installing pre-commit hooks

-arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keeping PRs clean and concise.
+arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce
+the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the
+developer but certainly helpful for keeping PRs clean and concise.

-Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete the commit if an offending lint is found allowing you to make changes locally before pushing.
+Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in
+your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete
+the commit if an offending lint is found allowing you to make changes locally before pushing.

 The pre-commit hooks can also be run adhoc without installing them by simply running `pre-commit run --all-files`

+## Running linters without using pre-commit
+
+There are scripts in `ci/scripts` for running Rust and Python linters.
+
+```shell
+./ci/scripts/python_lint.sh
+./ci/scripts/rust_clippy.sh
+./ci/scripts/rust_fmt.sh
+./ci/scripts/rust_toml_fmt.sh
+```
+
 ## How to update dependencies

 To change test dependencies, change the `requirements.in` and run
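For reference, the pre-commit workflow described in the README changes above comes down to two commands. This is a sketch rather than part of the commit, and it assumes `pre-commit` itself is already installed (e.g. via `pip install pre-commit`):

```shell
# one-time setup: writes the hook configuration into .git/hooks
pre-commit install

# run every configured hook (ruff, ruff-format, actionlint, rust-fmt) against the full tree
pre-commit run --all-files
```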
16 changes: 4 additions & 12 deletions benchmarks/db-benchmark/groupby-datafusion.py
@@ -79,17 +79,13 @@ def execute(df):

 data = pacsv.read_csv(
     src_grp,
-    convert_options=pacsv.ConvertOptions(
-        auto_dict_encode=True, column_types=schema
-    ),
+    convert_options=pacsv.ConvertOptions(auto_dict_encode=True, column_types=schema),
 )
 print("dataset loaded")

 # create a session context with explicit runtime and config settings
 runtime = (
-    RuntimeConfig()
-    .with_disk_manager_os()
-    .with_fair_spill_pool(64 * 1024 * 1024 * 1024)
+    RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(64 * 1024 * 1024 * 1024)
 )
 config = (
     SessionConfig()
@@ -116,9 +112,7 @@ def execute(df):
 if sql:
     df = ctx.sql("SELECT id1, SUM(v1) AS v1 FROM x GROUP BY id1")
 else:
-    df = ctx.table("x").aggregate(
-        [f.col("id1")], [f.sum(f.col("v1")).alias("v1")]
-    )
+    df = ctx.table("x").aggregate([f.col("id1")], [f.sum(f.col("v1")).alias("v1")])
 ans = execute(df)

 shape = ans_shape(ans)
@@ -197,9 +191,7 @@ def execute(df):
 gc.collect()
 t_start = timeit.default_timer()
 if sql:
-    df = ctx.sql(
-        "SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3"
-    )
+    df = ctx.sql("SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3")
 else:
     df = ctx.table("x").aggregate(
         [f.col("id3")],
24 changes: 4 additions & 20 deletions benchmarks/db-benchmark/join-datafusion.py
@@ -152,11 +152,7 @@ def ans_shape(batches):
 print(f"q2: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -193,11 +189,7 @@ def ans_shape(batches):
 print(f"q3: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -234,11 +226,7 @@ def ans_shape(batches):
 print(f"q4: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -275,11 +263,7 @@ def ans_shape(batches):
 print(f"q5: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
4 changes: 1 addition & 3 deletions benchmarks/tpch/tpch.py
@@ -83,9 +83,7 @@ def bench(data_path, query_path):
                 time_millis = (end - start) * 1000
                 total_time_millis += time_millis
                 print("q{},{}".format(query, round(time_millis, 1)))
-                results.write(
-                    "q{},{}\n".format(query, round(time_millis, 1))
-                )
+                results.write("q{},{}\n".format(query, round(time_millis, 1)))
                 results.flush()
             except Exception as e:
                 print("query", query, "failed", e)
22 changes: 22 additions & 0 deletions ci/scripts/python_lint.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+ruff format datafusion
+ruff check datafusion
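The new script assumes `ruff` is on `PATH`. A minimal sketch of running it locally, pinning the same ruff release as the `rev: v0.3.0` pre-commit hook above (the pip pin is an assumption, not part of this commit):

```shell
# assumption: install the ruff release matching the pre-commit pin
pip install ruff==0.3.0

# format, then lint, the datafusion package (the same commands CI runs)
./ci/scripts/python_lint.sh
```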
4 changes: 2 additions & 2 deletions datafusion/tests/test_dataframe.py
@@ -409,8 +409,8 @@ def test_execution_plan(aggregate_df):
     plan = aggregate_df.execution_plan()

     expected = (
-        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[SUM(test.c2)]\n"
-    )  # noqa: E501
+        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[SUM(test.c2)]\n"  # noqa: E501
+    )

     assert expected == plan.display()
4 changes: 1 addition & 3 deletions dev/release/check-rat-report.py
@@ -23,9 +23,7 @@
 import xml.etree.ElementTree as ET

 if len(sys.argv) != 3:
-    sys.stderr.write(
-        "Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0]
-    )
+    sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0])
     sys.exit(1)

 exclude_globs_filename = sys.argv[1]
8 changes: 2 additions & 6 deletions dev/release/generate-changelog.py
@@ -27,9 +27,7 @@ def print_pulls(repo_name, title, pulls):
     print("**{}:**".format(title))
     print()
     for pull, commit in pulls:
-        url = "https://github.com/{}/pull/{}".format(
-            repo_name, pull.number
-        )
+        url = "https://github.com/{}/pull/{}".format(repo_name, pull.number)
         print(
             "- {} [#{}]({}) ({})".format(
                 pull.title, pull.number, url, commit.author.login
@@ -40,9 +38,7 @@ def print_pulls(repo_name, title, pulls):

 def generate_changelog(repo, repo_name, tag1, tag2):
     # get a list of commits between two tags
-    print(
-        f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr
-    )
+    print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr)
     comparison = repo.compare(tag1, tag2)

     # get the pull requests for these commits
4 changes: 1 addition & 3 deletions examples/sql-on-polars.py
@@ -20,7 +20,5 @@

 ctx = SessionContext()
 ctx.register_table("taxi", "yellow_tripdata_2021-01.parquet")
-df = ctx.sql(
-    "select passenger_count, count(*) from taxi group by passenger_count"
-)
+df = ctx.sql("select passenger_count, count(*) from taxi group by passenger_count")
 print(df)
8 changes: 2 additions & 6 deletions examples/sql-using-python-udaf.py
@@ -30,15 +30,11 @@ def __init__(self):

     def update(self, values: pa.Array) -> None:
         # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
-        self._sum = pa.scalar(
-            self._sum.as_py() + pa.compute.sum(values).as_py()
-        )
+        self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(values).as_py())

     def merge(self, states: pa.Array) -> None:
         # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
-        self._sum = pa.scalar(
-            self._sum.as_py() + pa.compute.sum(states).as_py()
-        )
+        self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(states).as_py())

     def state(self) -> pa.Array:
         return pa.array([self._sum.as_py()])
8 changes: 2 additions & 6 deletions examples/substrait.py
@@ -23,9 +23,7 @@
 ctx = SessionContext()

 # Register table with context
-ctx.register_csv(
-    "aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv"
-)
+ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv")

 substrait_plan = ss.substrait.serde.serialize_to_plan(
     "SELECT * FROM aggregate_test_data", ctx
@@ -49,9 +47,7 @@
 substrait_plan = ss.substrait.serde.deserialize_bytes(substrait_bytes)

 # type(df_logical_plan) -> <class 'substrait.LogicalPlan'>
-df_logical_plan = ss.substrait.consumer.from_substrait_plan(
-    ctx, substrait_plan
-)
+df_logical_plan = ss.substrait.consumer.from_substrait_plan(ctx, substrait_plan)

 # Back to Substrait Plan just for demonstration purposes
 # type(substrait_plan) -> <class 'datafusion.substrait.plan'>
