
Commit

ruff format
andygrove committed Mar 3, 2024
1 parent dac3148 commit 18dd7d2
Showing 13 changed files with 66 additions and 85 deletions.
7 changes: 0 additions & 7 deletions .github/workflows/test.yaml
@@ -103,13 +103,6 @@ jobs:
           source venv/bin/activate
           pip install -r requirements-311.txt
-      - name: Run Python Linters
-        if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }}
-        run: |
-          source venv/bin/activate
-          flake8 --exclude venv,benchmarks/db-benchmark --ignore=E501,W503
-          black --line-length 79 --diff --check .
       - name: Run tests
         env:
           RUST_BACKTRACE: 1
21 changes: 7 additions & 14 deletions .pre-commit-config.yaml
@@ -20,21 +20,14 @@ repos:
     rev: v1.6.23
     hooks:
       - id: actionlint-docker
-  - repo: https://github.com/psf/black
-    rev: 22.3.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.3.0
     hooks:
-      - id: black
-        files: datafusion/.*
-        # Explicitly specify the pyproject.toml at the repo root, not per-project.
-        args: ["--config", "pyproject.toml", "--line-length", "79", "--diff", "--check", "."]
-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        files: datafusion/.*$
-        types: [file]
-        types_or: [python]
-        additional_dependencies: ["flake8-force"]
+      # Run the linter.
+      - id: ruff
+      # Run the formatter.
+      - id: ruff-format
   - repo: local
     hooks:
       - id: rust-fmt
21 changes: 18 additions & 3 deletions README.md
@@ -202,7 +202,7 @@ source venv/bin/activate
 # update pip itself if necessary
 python -m pip install -U pip
 # install dependencies (for Python 3.8+)
-python -m pip install -r requirements-310.txt
+python -m pip install -r requirements.in
 ```

 The tests rely on test data in git submodules.
@@ -222,12 +222,27 @@ python -m pytest

 ### Running & Installing pre-commit hooks

-arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keeping PRs clean and concise.
+arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce
+the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the
+developer but certainly helpful for keeping PRs clean and concise.

-Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete the commit if an offending lint is found allowing you to make changes locally before pushing.
+Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in
+your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete
+the commit if an offending lint is found allowing you to make changes locally before pushing.

 The pre-commit hooks can also be run adhoc without installing them by simply running `pre-commit run --all-files`

+## Running linters without using pre-commit
+
+There are scripts in `ci/scripts` for running Rust and Python linters.
+
+```shell
+./ci/scripts/python_lint.sh
+./ci/scripts/rust_clippy.sh
+./ci/scripts/rust_fmt.sh
+./ci/scripts/rust_toml_fmt.sh
+```
+
 ## How to update dependencies

 To change test dependencies, change the `requirements.in` and run
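For reference, the pre-commit workflow described in the README changes above comes down to two commands. This is a sketch rather than part of the commit, and it assumes `pre-commit` itself is already installed (e.g. via `pip install pre-commit`):

```shell
# one-time setup: writes the hook configuration into .git/hooks
pre-commit install

# run every configured hook (ruff, ruff-format, actionlint, rust-fmt) against the full tree
pre-commit run --all-files
```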
16 changes: 4 additions & 12 deletions benchmarks/db-benchmark/groupby-datafusion.py
@@ -79,17 +79,13 @@ def execute(df):

 data = pacsv.read_csv(
     src_grp,
-    convert_options=pacsv.ConvertOptions(
-        auto_dict_encode=True, column_types=schema
-    ),
+    convert_options=pacsv.ConvertOptions(auto_dict_encode=True, column_types=schema),
 )
 print("dataset loaded")

 # create a session context with explicit runtime and config settings
 runtime = (
-    RuntimeConfig()
-    .with_disk_manager_os()
-    .with_fair_spill_pool(64 * 1024 * 1024 * 1024)
+    RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(64 * 1024 * 1024 * 1024)
 )
 config = (
     SessionConfig()
@@ -116,9 +112,7 @@ def execute(df):
 if sql:
     df = ctx.sql("SELECT id1, SUM(v1) AS v1 FROM x GROUP BY id1")
 else:
-    df = ctx.table("x").aggregate(
-        [f.col("id1")], [f.sum(f.col("v1")).alias("v1")]
-    )
+    df = ctx.table("x").aggregate([f.col("id1")], [f.sum(f.col("v1")).alias("v1")])
 ans = execute(df)

 shape = ans_shape(ans)
@@ -197,9 +191,7 @@ def execute(df):
 gc.collect()
 t_start = timeit.default_timer()
 if sql:
-    df = ctx.sql(
-        "SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3"
-    )
+    df = ctx.sql("SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3")
 else:
     df = ctx.table("x").aggregate(
         [f.col("id3")],
24 changes: 4 additions & 20 deletions benchmarks/db-benchmark/join-datafusion.py
@@ -152,11 +152,7 @@ def ans_shape(batches):
 print(f"q2: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -193,11 +189,7 @@ def ans_shape(batches):
 print(f"q3: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -234,11 +226,7 @@ def ans_shape(batches):
 print(f"q4: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
@@ -275,11 +263,7 @@ def ans_shape(batches):
 print(f"q5: {t}")
 t_start = timeit.default_timer()
 df = ctx.create_dataframe([ans])
-chk = (
-    df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))])
-    .collect()[0]
-    .column(0)[0]
-)
+chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0]
 chkt = timeit.default_timer() - t_start
 m = memory_usage()
 write_log(
4 changes: 1 addition & 3 deletions benchmarks/tpch/tpch.py
@@ -83,9 +83,7 @@ def bench(data_path, query_path):
                 time_millis = (end - start) * 1000
                 total_time_millis += time_millis
                 print("q{},{}".format(query, round(time_millis, 1)))
-                results.write(
-                    "q{},{}\n".format(query, round(time_millis, 1))
-                )
+                results.write("q{},{}\n".format(query, round(time_millis, 1)))
                 results.flush()
             except Exception as e:
                 print("query", query, "failed", e)
22 changes: 22 additions & 0 deletions ci/scripts/python_lint.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+ruff format datafusion
+ruff check datafusion
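The new script assumes `ruff` is on `PATH`. A minimal sketch of running it locally, pinning the same ruff release as the `rev: v0.3.0` pre-commit hook above (the pip pin is an assumption, not part of this commit):

```shell
# assumption: install the ruff release matching the pre-commit pin
pip install ruff==0.3.0

# format, then lint, the datafusion package (the same commands CI runs)
./ci/scripts/python_lint.sh
```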
4 changes: 2 additions & 2 deletions datafusion/tests/test_dataframe.py
@@ -409,8 +409,8 @@ def test_execution_plan(aggregate_df):
     plan = aggregate_df.execution_plan()

     expected = (
-        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[SUM(test.c2)]\n"
-    )  # noqa: E501
+        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[SUM(test.c2)]\n"  # noqa: E501
+    )

     assert expected == plan.display()
4 changes: 1 addition & 3 deletions dev/release/check-rat-report.py
@@ -23,9 +23,7 @@
 import xml.etree.ElementTree as ET

 if len(sys.argv) != 3:
-    sys.stderr.write(
-        "Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0]
-    )
+    sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0])
     sys.exit(1)

 exclude_globs_filename = sys.argv[1]
8 changes: 2 additions & 6 deletions dev/release/generate-changelog.py
@@ -27,9 +27,7 @@ def print_pulls(repo_name, title, pulls):
     print("**{}:**".format(title))
     print()
     for pull, commit in pulls:
-        url = "https://github.com/{}/pull/{}".format(
-            repo_name, pull.number
-        )
+        url = "https://github.com/{}/pull/{}".format(repo_name, pull.number)
         print(
             "- {} [#{}]({}) ({})".format(
                 pull.title, pull.number, url, commit.author.login
@@ -40,9 +38,7 @@ def print_pulls(repo_name, title, pulls):

 def generate_changelog(repo, repo_name, tag1, tag2):
     # get a list of commits between two tags
-    print(
-        f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr
-    )
+    print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr)
     comparison = repo.compare(tag1, tag2)

     # get the pull requests for these commits
4 changes: 1 addition & 3 deletions examples/sql-on-polars.py
@@ -20,7 +20,5 @@

 ctx = SessionContext()
 ctx.register_table("taxi", "yellow_tripdata_2021-01.parquet")
-df = ctx.sql(
-    "select passenger_count, count(*) from taxi group by passenger_count"
-)
+df = ctx.sql("select passenger_count, count(*) from taxi group by passenger_count")
 print(df)
8 changes: 2 additions & 6 deletions examples/sql-using-python-udaf.py
@@ -30,15 +30,11 @@ def __init__(self):

     def update(self, values: pa.Array) -> None:
         # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
-        self._sum = pa.scalar(
-            self._sum.as_py() + pa.compute.sum(values).as_py()
-        )
+        self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(values).as_py())

     def merge(self, states: pa.Array) -> None:
         # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
-        self._sum = pa.scalar(
-            self._sum.as_py() + pa.compute.sum(states).as_py()
-        )
+        self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(states).as_py())

     def state(self) -> pa.Array:
         return pa.array([self._sum.as_py()])
8 changes: 2 additions & 6 deletions examples/substrait.py
@@ -23,9 +23,7 @@
 ctx = SessionContext()

 # Register table with context
-ctx.register_csv(
-    "aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv"
-)
+ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv")

 substrait_plan = ss.substrait.serde.serialize_to_plan(
     "SELECT * FROM aggregate_test_data", ctx
@@ -49,9 +47,7 @@
 substrait_plan = ss.substrait.serde.deserialize_bytes(substrait_bytes)

 # type(df_logical_plan) -> <class 'substrait.LogicalPlan'>
-df_logical_plan = ss.substrait.consumer.from_substrait_plan(
-    ctx, substrait_plan
-)
+df_logical_plan = ss.substrait.consumer.from_substrait_plan(ctx, substrait_plan)

 # Back to Substrait Plan just for demonstration purposes
 # type(substrait_plan) -> <class 'datafusion.substrait.plan'>
