-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create arrow1/arrow2 read benchmarks (#82)
- Loading branch information
1 parent
834927c
commit 2e36920
Showing
6 changed files
with
535 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import b from "benny"; | ||
import * as arrow1 from "../pkg/node/arrow1"; | ||
import * as arrow2 from "../pkg/node/arrow2"; | ||
import { readFileSync } from "fs"; | ||
|
||
const dataDir = `${__dirname}/data`; | ||
|
||
// https://stackoverflow.com/a/43053803 | ||
/**
 * Cartesian product of any number of lists.
 *
 * The lists are folded left to right: every combination accumulated so far is
 * paired with every element of the next list and flattened one level, so the
 * result for `k` lists is an array of `k`-element arrays. With a single list
 * the list itself is returned, because `reduce` is called without a seed.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- result is assigned to caller-chosen tuple types
const cartesian = (...lists: any[]): any =>
  lists.reduce((combos, nextList) => {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- elements are heterogeneous
    const extended: any[] = [];
    for (const combo of combos) {
      for (const item of nextList) {
        extended.push([combo, item].flat());
      }
    }
    return extended;
  });
|
||
const apis = ["arrow1", "arrow2"]; | ||
const partitions = [1, 5, 20]; | ||
const compressions = ["brotli", "gzip", "none", "snappy"]; | ||
|
||
const testCases: [number, string, "arrow1" | "arrow2"][] = cartesian( | ||
partitions, | ||
compressions, | ||
apis | ||
); | ||
|
||
/**
 * Builds one benny benchmark case per entry of `testCases`.
 *
 * Each case is named `<api> <partitions>-partition-<compression>`. The callback
 * handed to `b.add` loads the Parquet file and returns a second function that
 * performs the read with the selected API; the file bytes are therefore loaded
 * once per case, outside the returned function.
 * NOTE(review): this relies on benny treating a returned function as the code
 * to measure and the outer body as setup — confirm against the benny docs.
 */
const createReadTests = () => {
  return testCases.map((testCase) => {
    const [partitionCount, codec, apiName] = testCase;
    const fileStem = `${partitionCount}-partition-${codec}`;
    const label = `${apiName} ${fileStem}`;

    return b.add(label, () => {
      const bytes = loadFile(fileStem);
      return apiName === "arrow1"
        ? () => arrow1.readParquet(bytes)
        : () => arrow2.readParquet2(bytes);
    });
  });
};
|
||
/**
 * Reads `<dataDir>/<name>.parquet` synchronously and returns its contents as a
 * new `Uint8Array`.
 *
 * @param name - File name without directory or `.parquet` extension.
 * @returns The file's bytes.
 */
function loadFile(name: string): Uint8Array {
  const contents = readFileSync(`${dataDir}/${name}.parquet`);
  return new Uint8Array(contents);
}
|
||
b.suite( | ||
"Read Parquet", | ||
|
||
...createReadTests(), | ||
|
||
b.cycle(), | ||
b.configure({ minDisplayPrecision: 2 }), | ||
b.complete(), | ||
b.save({ | ||
file: "bench", | ||
folder: "bench/results/", | ||
version: "0.3.0", | ||
details: true, | ||
format: "chart.html", | ||
}) | ||
); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import pyarrow as pa | ||
import pyarrow.parquet as pq | ||
|
||
compressions = ["SNAPPY", "GZIP", "BROTLI", "ZSTD", "NONE"] | ||
|
||
|
||
def create_table(n_rows=1_000_000):
    """Build a table of random data to be written out as benchmark input.

    The table has one column per unsigned integer type ("uint8", "uint16",
    "uint32") plus a "bool" column, each with ``n_rows`` random values.

    Args:
        n_rows: Number of rows to generate for every column.

    Returns:
        A ``pyarrow.Table`` with the columns described above.
    """
    data = {}

    for dtype in ["uint8", "uint16", "uint32"]:
        # Pass ``dtype`` explicitly: without it ``randint`` generates numpy's
        # default signed integer type, so the column would not have the
        # unsigned width its name advertises (and the uint32 bound can exceed
        # the default type's range on platforms where that type is 32-bit).
        # The upper bound is exclusive, matching the original value range.
        data[dtype] = pa.array(
            np.random.randint(0, np.iinfo(dtype).max, size=n_rows, dtype=dtype)
        )

    data["bool"] = pa.array(np.random.randint(0, 2, size=n_rows), type=pa.bool_())

    # TODO: add a column with string data?
    # https://stackoverflow.com/a/2257449

    return pa.table(data)
|
||
|
||
def write_table(table):
    """Write ``table`` to ``data/`` once per partition count and compression.

    For every partition count in ``[1, 5, 20]`` and every codec in the
    module-level ``compressions`` list, a file named
    ``data/<n_partitions>-partition-<compression lowercased>.parquet`` is
    written. The ``data`` directory is relative to the current working
    directory and is created if it does not exist.

    Args:
        table: The table to write; ``len(table)`` is used as its row count.
    """
    # Create data directory
    Path("data").mkdir(exist_ok=True)

    data_len = len(table)
    for n_partitions in [1, 5, 20]:
        # ``row_group_size`` is a row count, so keep it an integer (true
        # division would yield a float). Ceiling division keeps the number of
        # row groups at ``n_partitions`` even when the row count does not
        # divide evenly; clamp to 1 so an empty table never requests 0 rows.
        # It does not depend on the compression, so compute it once per count.
        row_group_size = max(1, -(-data_len // n_partitions))

        for compression in compressions:
            compression_text = str(compression).lower()
            fname = f"data/{n_partitions}-partition-{compression_text}.parquet"
            pq.write_table(
                table, fname, row_group_size=row_group_size, compression=compression
            )
|
||
|
||
def main():
    """Generate the random table and write all benchmark Parquet files."""
    write_table(create_table())
|
||
|
||
if __name__ == "__main__": | ||
main() |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
[tool.poetry] | ||
name = "parquet-wasm-bench" | ||
version = "0.1.0" | ||
description = "Create data for parquet-wasm benchmarks" | ||
authors = ["Kyle Barron <[email protected]>"] | ||
license = "MIT" | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.8" | ||
numpy = "^1.22.3" | ||
pyarrow = "^7.0.0" | ||
pandas = "^1.4.2" | ||
|
||
[tool.poetry.dev-dependencies] | ||
|
||
[build-system] | ||
requires = ["poetry-core>=1.0.0"] | ||
build-backend = "poetry.core.masonry.api" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.