diff --git a/README.md b/README.md index 5d7e3c3bf203..a106db5cc3ca 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Refer to the [Polars CLI repository](https://github.com/pola-rs/polars-cli) for ### Blazingly fast -Polars is very fast. In fact, it is one of the best performing solutions available. See the [TPC-H benchmarks](https://www.pola.rs/benchmarks.html) results. +Polars is very fast. In fact, it is one of the best performing solutions available. See the [PDS-H benchmarks](https://www.pola.rs/benchmarks.html) results. ### Lightweight diff --git a/crates/polars-lazy/src/tests/mod.rs b/crates/polars-lazy/src/tests/mod.rs index 8b1a51212d18..f4ba3e876a65 100644 --- a/crates/polars-lazy/src/tests/mod.rs +++ b/crates/polars-lazy/src/tests/mod.rs @@ -6,14 +6,14 @@ mod cse; mod io; mod logical; mod optimization_checks; +#[cfg(all(feature = "strings", feature = "cse"))] +mod pdsh; mod predicate_queries; mod projection_queries; mod queries; mod schema; #[cfg(feature = "streaming")] mod streaming; -#[cfg(all(feature = "strings", feature = "cse"))] -mod tpch; fn get_arenas() -> (Arena, Arena) { let expr_arena = Arena::with_capacity(16); diff --git a/crates/polars-lazy/src/tests/tpch.rs b/crates/polars-lazy/src/tests/pdsh.rs similarity index 96% rename from crates/polars-lazy/src/tests/tpch.rs rename to crates/polars-lazy/src/tests/pdsh.rs index a525e52091aa..426b19506684 100644 --- a/crates/polars-lazy/src/tests/tpch.rs +++ b/crates/polars-lazy/src/tests/pdsh.rs @@ -1,10 +1,10 @@ -//! The tpch files only got ten rows, so after all the joins filters there is not data +//! The PDSH files only have ten rows, so after all the joins and filters there is no data //! Still we can use this to test the schema, operation correctness on empty data, and optimizations //! taken. 
use super::*; const fn base_path() -> &'static str { - "../../examples/datasets/tpc_heads" + "../../examples/datasets/pds_heads" } fn region() -> LazyFrame { diff --git a/examples/datasets/tpc_heads/customer.feather b/examples/datasets/pds_heads/customer.feather similarity index 100% rename from examples/datasets/tpc_heads/customer.feather rename to examples/datasets/pds_heads/customer.feather diff --git a/examples/datasets/tpc_heads/lineitem.feather b/examples/datasets/pds_heads/lineitem.feather similarity index 100% rename from examples/datasets/tpc_heads/lineitem.feather rename to examples/datasets/pds_heads/lineitem.feather diff --git a/examples/datasets/tpc_heads/nation.feather b/examples/datasets/pds_heads/nation.feather similarity index 100% rename from examples/datasets/tpc_heads/nation.feather rename to examples/datasets/pds_heads/nation.feather diff --git a/examples/datasets/tpc_heads/orders.feather b/examples/datasets/pds_heads/orders.feather similarity index 100% rename from examples/datasets/tpc_heads/orders.feather rename to examples/datasets/pds_heads/orders.feather diff --git a/examples/datasets/tpc_heads/part.feather b/examples/datasets/pds_heads/part.feather similarity index 100% rename from examples/datasets/tpc_heads/part.feather rename to examples/datasets/pds_heads/part.feather diff --git a/examples/datasets/tpc_heads/partsupp.feather b/examples/datasets/pds_heads/partsupp.feather similarity index 100% rename from examples/datasets/tpc_heads/partsupp.feather rename to examples/datasets/pds_heads/partsupp.feather diff --git a/examples/datasets/tpc_heads/region.feather b/examples/datasets/pds_heads/region.feather similarity index 100% rename from examples/datasets/tpc_heads/region.feather rename to examples/datasets/pds_heads/region.feather diff --git a/examples/datasets/tpc_heads/supplier.feather b/examples/datasets/pds_heads/supplier.feather similarity index 100% rename from examples/datasets/tpc_heads/supplier.feather rename to 
examples/datasets/pds_heads/supplier.feather diff --git a/examples/read_csv/src/main.rs b/examples/read_csv/src/main.rs index aa9188f19409..877fc6483635 100644 --- a/examples/read_csv/src/main.rs +++ b/examples/read_csv/src/main.rs @@ -2,7 +2,7 @@ use polars::io::mmap::MmapBytesReader; use polars::prelude::*; fn main() -> PolarsResult<()> { - let file = std::fs::File::open("/home/ritchie46/Downloads/tpch/tables_scale_100/lineitem.tbl") + let file = std::fs::File::open("/home/ritchie46/Downloads/pdsh/tables_scale_100/lineitem.tbl") .unwrap(); let file = Box::new(file) as Box; let _df = CsvReader::new(file) diff --git a/py-polars/Makefile b/py-polars/Makefile index 7e273b14914c..3c98adab08cb 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -113,7 +113,7 @@ clean: ## Clean up caches and build artifacts @rm -rf .mypy_cache/ @rm -rf .pytest_cache/ @$(VENV_BIN)/ruff clean - @rm -rf tests/data/tpch/sf* + @rm -rf tests/data/pdsh/sf* @rm -f .coverage @rm -f coverage.xml @rm -f polars/polars.abi3.so diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 57cca5d366d3..f13efb7aa3b3 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -121,7 +121,7 @@ def next_batches(self, n: int) -> list[DataFrame] | None: Examples -------- >>> reader = pl.read_csv_batched( - ... "./tpch/tables_scale_100/lineitem.tbl", + ... "./pdsh/tables_scale_100/lineitem.tbl", ... separator="|", ... try_parse_dates=True, ... ) # doctest: +SKIP diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 7b3d3a91dbf3..3a27911d716e 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -827,7 +827,7 @@ def read_csv_batched( Examples -------- >>> reader = pl.read_csv_batched( - ... "./tpch/tables_scale_100/lineitem.tbl", + ... "./pdsh/tables_scale_100/lineitem.tbl", ... separator="|", ... try_parse_dates=True, ... 
) # doctest: +SKIP diff --git a/py-polars/tests/benchmark/data/__init__.py b/py-polars/tests/benchmark/data/__init__.py index b7f246f37abc..255752458b72 100644 --- a/py-polars/tests/benchmark/data/__init__.py +++ b/py-polars/tests/benchmark/data/__init__.py @@ -1,6 +1,6 @@ """Data generation functionality for use in the benchmarking suite.""" from tests.benchmark.data.h2oai import generate_group_by_data -from tests.benchmark.data.tpch import load_tpch_table +from tests.benchmark.data.pdsh import load_pdsh_table -__all__ = ["load_tpch_table", "generate_group_by_data"] +__all__ = ["load_pdsh_table", "generate_group_by_data"] diff --git a/py-polars/tests/benchmark/data/pdsh/__init__.py b/py-polars/tests/benchmark/data/pdsh/__init__.py new file mode 100644 index 000000000000..ef007f5ed8d9 --- /dev/null +++ b/py-polars/tests/benchmark/data/pdsh/__init__.py @@ -0,0 +1,5 @@ +"""Generate data for the PDS-H benchmark tests.""" + +from tests.benchmark.data.pdsh.generate_data import load_pdsh_table + +__all__ = ["load_pdsh_table"] diff --git a/py-polars/tests/benchmark/data/tpch/dbgen/dbgen b/py-polars/tests/benchmark/data/pdsh/dbgen/dbgen similarity index 100% rename from py-polars/tests/benchmark/data/tpch/dbgen/dbgen rename to py-polars/tests/benchmark/data/pdsh/dbgen/dbgen diff --git a/py-polars/tests/benchmark/data/tpch/dbgen/dists.dss b/py-polars/tests/benchmark/data/pdsh/dbgen/dists.dss similarity index 100% rename from py-polars/tests/benchmark/data/tpch/dbgen/dists.dss rename to py-polars/tests/benchmark/data/pdsh/dbgen/dists.dss diff --git a/py-polars/tests/benchmark/data/tpch/generate_data.py b/py-polars/tests/benchmark/data/pdsh/generate_data.py similarity index 73% rename from py-polars/tests/benchmark/data/tpch/generate_data.py rename to py-polars/tests/benchmark/data/pdsh/generate_data.py index 3b4d81be51ff..0f8866311ada 100644 --- a/py-polars/tests/benchmark/data/tpch/generate_data.py +++ b/py-polars/tests/benchmark/data/pdsh/generate_data.py @@ -1,8 +1,19 @@ 
""" -Script to generate data for running the TPC-H benchmark. - -Data generation logic was adapted from the TPC-H benchmark tools: -https://www.tpc.org/tpch/ +Disclaimer. + +Certain portions of the contents of this file are derived from TPC-H version 3.0.1 +(retrieved from +http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). +Such portions are subject to copyrights held by Transaction Processing +Performance Council (“TPC”) and licensed under the TPC EULA is available at +http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) +(the “TPC EULA”). + +You may not use this file except in compliance with the TPC EULA. +DISCLAIMER: Portions of this file is derived from the TPC-H benchmark and as +such any result obtained using this file are not comparable to published TPC-H +Benchmark results, as the results obtained from using this file do not comply with +the TPC-H Benchmark. """ from __future__ import annotations @@ -19,12 +30,12 @@ CURRENT_DIR = Path(__file__).parent DBGEN_DIR = CURRENT_DIR / "dbgen" -__all__ = ["load_tpch_table"] +__all__ = ["load_pdsh_table"] -def load_tpch_table(table_name: str, scale_factor: float = 0.01) -> pl.DataFrame: +def load_pdsh_table(table_name: str, scale_factor: float = 0.01) -> pl.DataFrame: """ - Load a TPC-H table from disk. + Load a PDS-H table from disk. If the file does not exist, it is generated along with all other tables. 
""" @@ -32,16 +43,16 @@ def load_tpch_table(table_name: str, scale_factor: float = 0.01) -> pl.DataFrame file_path = folder / f"{table_name}.parquet" if not file_path.exists(): - _generate_tpch_data(scale_factor) + _generate_pdsh_data(scale_factor) return pl.read_parquet(file_path) -def _generate_tpch_data(scale_factor: float = 0.01) -> None: - """Generate all TPC-H datasets with the given scale factor.""" +def _generate_pdsh_data(scale_factor: float = 0.01) -> None: + """Generate all PDS-H datasets with the given scale factor.""" # TODO: Can we make this work under Windows? if sys.platform == "win32": - msg = "cannot generate TPC-H data under Windows" + msg = "cannot generate PDS-H data under Windows" raise RuntimeError(msg) subprocess.run(["./dbgen", "-f", "-v", "-s", str(scale_factor)], cwd=DBGEN_DIR) diff --git a/py-polars/tests/benchmark/data/tpch/__init__.py b/py-polars/tests/benchmark/data/tpch/__init__.py deleted file mode 100644 index 2973049f3fbd..000000000000 --- a/py-polars/tests/benchmark/data/tpch/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Generate data for the TPC-H benchmark tests.""" - -from tests.benchmark.data.tpch.generate_data import load_tpch_table - -__all__ = ["load_tpch_table"] diff --git a/py-polars/tests/benchmark/test_tpch.py b/py-polars/tests/benchmark/test_pdsh.py similarity index 91% rename from py-polars/tests/benchmark/test_tpch.py rename to py-polars/tests/benchmark/test_pdsh.py index bd09d92161c9..2ee601b5b895 100644 --- a/py-polars/tests/benchmark/test_tpch.py +++ b/py-polars/tests/benchmark/test_pdsh.py @@ -1,58 +1,76 @@ +""" +Disclaimer. + +Certain portions of the contents of this file are derived from TPC-H version 3.0.1 +(retrieved from +http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 
+Such portions are subject to copyrights held by Transaction Processing +Performance Council (“TPC”) and licensed under the TPC EULA (available at +http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) +(the “TPC EULA”). + +You may not use this file except in compliance with the TPC EULA. +DISCLAIMER: Portions of this file are derived from the TPC-H benchmark and as +such any results obtained using this file are not comparable to published TPC-H +Benchmark results, as the results obtained from using this file do not comply with +the TPC-H Benchmark. +""" + import sys from datetime import date import pytest import polars as pl -from tests.benchmark.data import load_tpch_table +from tests.benchmark.data import load_pdsh_table if sys.platform == "win32": - pytest.skip("TPC-H data cannot be generated under Windows", allow_module_level=True) + pytest.skip("PDS-H data cannot be generated under Windows", allow_module_level=True) pytestmark = pytest.mark.benchmark() @pytest.fixture(scope="module") def customer() -> pl.LazyFrame: - return load_tpch_table("customer").lazy() + return load_pdsh_table("customer").lazy() @pytest.fixture(scope="module") def lineitem() -> pl.LazyFrame: - return load_tpch_table("lineitem").lazy() + return load_pdsh_table("lineitem").lazy() @pytest.fixture(scope="module") def nation() -> pl.LazyFrame: - return load_tpch_table("nation").lazy() + return load_pdsh_table("nation").lazy() @pytest.fixture(scope="module") def orders() -> pl.LazyFrame: - return load_tpch_table("orders").lazy() + return load_pdsh_table("orders").lazy() @pytest.fixture(scope="module") def part() -> pl.LazyFrame: - return load_tpch_table("part").lazy() + return load_pdsh_table("part").lazy() @pytest.fixture(scope="module") def partsupp() -> pl.LazyFrame: - return load_tpch_table("partsupp").lazy() + return load_pdsh_table("partsupp").lazy() @pytest.fixture(scope="module") def region() -> pl.LazyFrame: - return load_tpch_table("region").lazy() + return 
load_pdsh_table("region").lazy() @pytest.fixture(scope="module") def supplier() -> pl.LazyFrame: - return load_tpch_table("supplier").lazy() + return load_pdsh_table("supplier").lazy() -def test_tpch_q1(lineitem: pl.LazyFrame) -> None: +def test_pdsh_q1(lineitem: pl.LazyFrame) -> None: var1 = date(1998, 9, 2) q_final = ( @@ -81,7 +99,7 @@ def test_tpch_q1(lineitem: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q2( +def test_pdsh_q2( nation: pl.LazyFrame, part: pl.LazyFrame, partsupp: pl.LazyFrame, @@ -125,7 +143,7 @@ def test_tpch_q2( q_final.collect() -def test_tpch_q3( +def test_pdsh_q3( customer: pl.LazyFrame, lineitem: pl.LazyFrame, orders: pl.LazyFrame ) -> None: var1 = "BUILDING" @@ -154,7 +172,7 @@ def test_tpch_q3( q_final.collect() -def test_tpch_q4(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: +def test_pdsh_q4(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: var1 = date(1993, 7, 1) var2 = date(1993, 10, 1) @@ -170,7 +188,7 @@ def test_tpch_q4(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q5( +def test_pdsh_q5( customer: pl.LazyFrame, lineitem: pl.LazyFrame, nation: pl.LazyFrame, @@ -205,7 +223,7 @@ def test_tpch_q5( q_final.collect() -def test_tpch_q6(lineitem: pl.LazyFrame) -> None: +def test_pdsh_q6(lineitem: pl.LazyFrame) -> None: var1 = date(1994, 1, 1) var2 = date(1995, 1, 1) var3 = 0.05 @@ -225,7 +243,7 @@ def test_tpch_q6(lineitem: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q7( +def test_pdsh_q7( customer: pl.LazyFrame, lineitem: pl.LazyFrame, nation: pl.LazyFrame, @@ -274,7 +292,7 @@ def test_tpch_q7( q_final.collect() -def test_tpch_q8( +def test_pdsh_q8( customer: pl.LazyFrame, lineitem: pl.LazyFrame, nation: pl.LazyFrame, @@ -322,7 +340,7 @@ def test_tpch_q8( q_final.collect() -def test_tpch_q9( +def test_pdsh_q9( lineitem: pl.LazyFrame, nation: pl.LazyFrame, orders: pl.LazyFrame, @@ -357,7 +375,7 @@ def test_tpch_q9( q_final.collect() -def test_tpch_q10( +def 
test_pdsh_q10( customer: pl.LazyFrame, lineitem: pl.LazyFrame, nation: pl.LazyFrame, @@ -404,7 +422,7 @@ def test_tpch_q10( q_final.collect() -def test_tpch_q11( +def test_pdsh_q11( nation: pl.LazyFrame, partsupp: pl.LazyFrame, supplier: pl.LazyFrame ) -> None: var1 = "GERMANY" @@ -438,7 +456,7 @@ def test_tpch_q11( q_final.collect() -def test_tpch_q12(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: +def test_pdsh_q12(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: var1 = "MAIL" var2 = "SHIP" var3 = date(1994, 1, 1) @@ -467,7 +485,7 @@ def test_tpch_q12(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q13(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: +def test_pdsh_q13(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: var1 = "special" var2 = "requests" @@ -484,7 +502,7 @@ def test_tpch_q13(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q14(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: +def test_pdsh_q14(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: var1 = date(1995, 9, 1) var2 = date(1995, 10, 1) @@ -507,7 +525,7 @@ def test_tpch_q14(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q15(lineitem: pl.LazyFrame, supplier: pl.LazyFrame) -> None: +def test_pdsh_q15(lineitem: pl.LazyFrame, supplier: pl.LazyFrame) -> None: var1 = date(1996, 1, 1) var2 = date(1996, 4, 1) @@ -532,7 +550,7 @@ def test_tpch_q15(lineitem: pl.LazyFrame, supplier: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q16( +def test_pdsh_q16( part: pl.LazyFrame, partsupp: pl.LazyFrame, supplier: pl.LazyFrame ) -> None: var1 = "Brand#45" @@ -558,7 +576,7 @@ def test_tpch_q16( q_final.collect() -def test_tpch_q17(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: +def test_pdsh_q17(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: var1 = "Brand#23" var2 = "MED BOX" @@ -579,7 +597,7 @@ def test_tpch_q17(lineitem: pl.LazyFrame, part: 
pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q18( +def test_pdsh_q18( customer: pl.LazyFrame, lineitem: pl.LazyFrame, orders: pl.LazyFrame ) -> None: var1 = 300 @@ -608,7 +626,7 @@ def test_tpch_q18( q_final.collect() -def test_tpch_q19(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: +def test_pdsh_q19(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: q_final = ( part.join(lineitem, left_on="p_partkey", right_on="l_partkey") .filter(pl.col("l_shipmode").is_in(["AIR", "AIR REG"])) @@ -649,7 +667,7 @@ def test_tpch_q19(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q20( +def test_pdsh_q20( lineitem: pl.LazyFrame, nation: pl.LazyFrame, part: pl.LazyFrame, @@ -687,7 +705,7 @@ def test_tpch_q20( q_final.collect() -def test_tpch_q21( +def test_pdsh_q21( lineitem: pl.LazyFrame, nation: pl.LazyFrame, orders: pl.LazyFrame, @@ -723,7 +741,7 @@ def test_tpch_q21( q_final.collect() -def test_tpch_q22(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: +def test_pdsh_q22(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: q1 = ( customer.with_columns(pl.col("c_phone").str.slice(0, 2).alias("cntrycode")) .filter(pl.col("cntrycode").str.contains("13|31|23|29|30|18|17"))