refactor: Change naming to new benchmark setup #18473

Merged 2 commits on Aug 30, 2024

2 changes: 1 addition & 1 deletion README.md
@@ -157,7 +157,7 @@ Refer to the [Polars CLI repository](https://github.com/pola-rs/polars-cli) for

### Blazingly fast

-Polars is very fast. In fact, it is one of the best performing solutions available. See the [TPC-H benchmarks](https://www.pola.rs/benchmarks.html) results.
+Polars is very fast. In fact, it is one of the best performing solutions available. See the [PDS-H benchmarks](https://www.pola.rs/benchmarks.html) results.

### Lightweight

4 changes: 2 additions & 2 deletions crates/polars-lazy/src/tests/mod.rs
@@ -6,14 +6,14 @@ mod cse;
mod io;
mod logical;
mod optimization_checks;
+#[cfg(all(feature = "strings", feature = "cse"))]
+mod pdsh;
mod predicate_queries;
mod projection_queries;
mod queries;
mod schema;
#[cfg(feature = "streaming")]
mod streaming;
-#[cfg(all(feature = "strings", feature = "cse"))]
-mod tpch;

fn get_arenas() -> (Arena<AExpr>, Arena<IR>) {
let expr_arena = Arena::with_capacity(16);
@@ -1,10 +1,10 @@
-//! The tpch files only got ten rows, so after all the joins filters there is not data
+//! The PDSH files only got ten rows, so after all the joins filters there is not data
//! Still we can use this to test the schema, operation correctness on empty data, and optimizations
//! taken.
use super::*;

const fn base_path() -> &'static str {
-"../../examples/datasets/tpc_heads"
+"../../examples/datasets/pds_heads"
}

fn region() -> LazyFrame {
2 changes: 1 addition & 1 deletion examples/read_csv/src/main.rs
@@ -2,7 +2,7 @@ use polars::io::mmap::MmapBytesReader;
use polars::prelude::*;

fn main() -> PolarsResult<()> {
-let file = std::fs::File::open("/home/ritchie46/Downloads/tpch/tables_scale_100/lineitem.tbl")
+let file = std::fs::File::open("/home/ritchie46/Downloads/pdsh/tables_scale_100/lineitem.tbl")
.unwrap();
let file = Box::new(file) as Box<dyn MmapBytesReader>;
let _df = CsvReader::new(file)
2 changes: 1 addition & 1 deletion py-polars/Makefile
@@ -113,7 +113,7 @@ clean: ## Clean up caches and build artifacts
@rm -rf .mypy_cache/
@rm -rf .pytest_cache/
@$(VENV_BIN)/ruff clean
-@rm -rf tests/data/tpch/sf*
+@rm -rf tests/data/pdsh/sf*
@rm -f .coverage
@rm -f coverage.xml
@rm -f polars/polars.abi3.so
2 changes: 1 addition & 1 deletion py-polars/polars/io/csv/batched_reader.py
@@ -121,7 +121,7 @@ def next_batches(self, n: int) -> list[DataFrame] | None:
Examples
--------
>>> reader = pl.read_csv_batched(
-... "./tpch/tables_scale_100/lineitem.tbl",
+... "./pdsh/tables_scale_100/lineitem.tbl",
... separator="|",
... try_parse_dates=True,
... ) # doctest: +SKIP
2 changes: 1 addition & 1 deletion py-polars/polars/io/csv/functions.py
@@ -827,7 +827,7 @@ def read_csv_batched(
Examples
--------
>>> reader = pl.read_csv_batched(
-... "./tpch/tables_scale_100/lineitem.tbl",
+... "./pdsh/tables_scale_100/lineitem.tbl",
... separator="|",
... try_parse_dates=True,
... ) # doctest: +SKIP
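As a usage sketch of the API touched by the two docstring edits above, the batched reader is typically drained via next_batches; the pdsh path below is the docstring's placeholder, not a file that ships with the repository:

```python
import polars as pl

# Placeholder path taken from the docstring example; point it at a real
# pipe-separated lineitem table generated by dbgen.
reader = pl.read_csv_batched(
    "./pdsh/tables_scale_100/lineitem.tbl",
    separator="|",
    try_parse_dates=True,
)

# Drain the reader in chunks of up to 5 DataFrames; next_batches returns
# None once the file is exhausted.
while (batches := reader.next_batches(5)) is not None:
    chunk = pl.concat(batches)
    print(chunk.height)
```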
4 changes: 2 additions & 2 deletions py-polars/tests/benchmark/data/__init__.py
@@ -1,6 +1,6 @@
"""Data generation functionality for use in the benchmarking suite."""

from tests.benchmark.data.h2oai import generate_group_by_data
-from tests.benchmark.data.tpch import load_tpch_table
+from tests.benchmark.data.pdsh import load_pdsh_table

-__all__ = ["load_tpch_table", "generate_group_by_data"]
+__all__ = ["load_pdsh_table", "generate_group_by_data"]
5 changes: 5 additions & 0 deletions py-polars/tests/benchmark/data/pdsh/__init__.py
@@ -0,0 +1,5 @@
+"""Generate data for the PDS-H benchmark tests."""
+
+from tests.benchmark.data.pdsh.generate_data import load_pdsh_table
+
+__all__ = ["load_pdsh_table"]
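The new subpackage keeps the same re-export shape as the removed tpch one, so the loader is importable from either level. A minimal sketch, assuming the test package is importable (e.g. when running pytest from py-polars):

```python
# Both import paths resolve to the same function after the rename:
# tests/benchmark/data/__init__.py simply re-exports it from the subpackage.
from tests.benchmark.data import load_pdsh_table as from_package
from tests.benchmark.data.pdsh import load_pdsh_table as from_subpackage

assert from_package is from_subpackage
```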
@@ -1,8 +1,19 @@
"""
-Script to generate data for running the TPC-H benchmark.
-
-Data generation logic was adapted from the TPC-H benchmark tools:
-https://www.tpc.org/tpch/
+Disclaimer.
+
+Certain portions of the contents of this file are derived from TPC-H version 3.0.1
+(retrieved from
+http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp).
+Such portions are subject to copyrights held by Transaction Processing
+Performance Council (“TPC”) and licensed under the TPC EULA is available at
+http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp)
+(the “TPC EULA”).
+
+You may not use this file except in compliance with the TPC EULA.
+DISCLAIMER: Portions of this file is derived from the TPC-H benchmark and as
+such any result obtained using this file are not comparable to published TPC-H
+Benchmark results, as the results obtained from using this file do not comply with
+the TPC-H Benchmark.
"""

from __future__ import annotations
@@ -19,29 +30,29 @@
CURRENT_DIR = Path(__file__).parent
DBGEN_DIR = CURRENT_DIR / "dbgen"

-__all__ = ["load_tpch_table"]
+__all__ = ["load_pdsh_table"]


-def load_tpch_table(table_name: str, scale_factor: float = 0.01) -> pl.DataFrame:
+def load_pdsh_table(table_name: str, scale_factor: float = 0.01) -> pl.DataFrame:
"""
-Load a TPC-H table from disk.
+Load a PDS-H table from disk.

If the file does not exist, it is generated along with all other tables.
"""
folder = CURRENT_DIR / f"sf-{scale_factor:g}"
file_path = folder / f"{table_name}.parquet"

if not file_path.exists():
-_generate_tpch_data(scale_factor)
+_generate_pdsh_data(scale_factor)

return pl.read_parquet(file_path)


-def _generate_tpch_data(scale_factor: float = 0.01) -> None:
-"""Generate all TPC-H datasets with the given scale factor."""
+def _generate_pdsh_data(scale_factor: float = 0.01) -> None:
+"""Generate all PDS-H datasets with the given scale factor."""
# TODO: Can we make this work under Windows?
if sys.platform == "win32":
-msg = "cannot generate TPC-H data under Windows"
+msg = "cannot generate PDS-H data under Windows"
raise RuntimeError(msg)

subprocess.run(["./dbgen", "-f", "-v", "-s", str(scale_factor)], cwd=DBGEN_DIR)
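A short usage sketch of the renamed helpers shown in this hunk; the table names and scale factor are illustrative, and generation is skipped on Windows as the guard above shows:

```python
from tests.benchmark.data.pdsh import load_pdsh_table

# The first call for a given scale factor shells out to dbgen, builds all
# tables, and then reads the requested one from the sf-0.01 folder.
customer = load_pdsh_table("customer", scale_factor=0.01)

# Later calls for the same scale factor find the cached Parquet file and
# skip generation entirely.
orders = load_pdsh_table("orders", scale_factor=0.01)
print(customer.height, orders.height)
```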
5 changes: 0 additions & 5 deletions py-polars/tests/benchmark/data/tpch/__init__.py

This file was deleted.
