Skip to content

Commit

Permalink
Create arrow1/arrow2 read benchmarks (#82)
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebarron authored Apr 26, 2022
1 parent 834927c commit 2e36920
Show file tree
Hide file tree
Showing 6 changed files with 535 additions and 1 deletion.
56 changes: 56 additions & 0 deletions bench/bench.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import b from "benny";
import * as arrow1 from "../pkg/node/arrow1";
import * as arrow2 from "../pkg/node/arrow2";
import { readFileSync } from "fs";

// Directory holding the .parquet benchmark fixtures (produced by make_data.py,
// which writes files named "{partitions}-partition-{compression}.parquet").
const dataDir = `${__dirname}/data`;

// https://stackoverflow.com/a/43053803
/**
 * Cartesian product of the given arrays, e.g.
 * cartesian([1, 2], ["x"]) -> [[1, "x"], [2, "x"]].
 *
 * Fix: the original callback parameters were named `a` and `b`, shadowing
 * both the outer rest parameter `a` and the `b` (benny) module import.
 * Also annotates the previously implicit-`any` signature.
 */
const cartesian = (...arrays: any[][]): any[][] =>
  arrays.reduce((acc, curr) =>
    acc.flatMap((prefix) => curr.map((item) => [prefix, item].flat()))
  );

// Benchmark matrix dimensions. NOTE(review): the Python generator also
// writes "zstd" fixtures, but only these four codecs are benchmarked here.
const compressions = ["brotli", "gzip", "none", "snappy"];
const partitions = [1, 5, 20];
const apis = ["arrow1", "arrow2"];

// Every [partitions, compression, api] combination under test.
const testCases: [number, string, "arrow1" | "arrow2"][] =
  cartesian(partitions, compressions, apis);

/**
 * Build one benny benchmark per [partitions, compression, api] test case.
 *
 * The outer function handed to b.add runs once as setup (loading the
 * fixture bytes into memory); the closure it returns is what benny
 * actually times, so file I/O is excluded from the measurement.
 */
const createReadTests = () =>
  testCases.map(([nPartitions, codec, api]) => {
    const fixture = `${nPartitions}-partition-${codec}`;
    return b.add(`${api} ${fixture}`, () => {
      const bytes = loadFile(fixture);
      return api === "arrow1"
        ? () => arrow1.readParquet(bytes)
        : () => arrow2.readParquet2(bytes);
    });
  });

/**
 * Read the named parquet fixture from the data directory into memory.
 *
 * @param name fixture basename, without the ".parquet" extension
 * @returns the file's contents as a Uint8Array
 */
function loadFile(name: string): Uint8Array {
  const buffer = readFileSync(`${dataDir}/${name}.parquet`);
  return new Uint8Array(buffer);
}

// Run all read benchmarks as one benny suite, print per-case results
// (b.cycle/b.complete), and persist them as an HTML chart under
// bench/results/.
b.suite(
  "Read Parquet",

  ...createReadTests(),

  b.cycle(),
  b.configure({ minDisplayPrecision: 2 }),
  b.complete(),
  b.save({
    file: "bench",
    folder: "bench/results/",
    version: "0.3.0",
    details: true,
    format: "chart.html",
  })
);
46 changes: 46 additions & 0 deletions bench/make_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Parquet compression codecs each fixture is written with.
compressions = ["SNAPPY", "GZIP", "BROTLI", "ZSTD", "NONE"]


def create_table(n_rows=1_000_000):
    """Build a pyarrow Table of random benchmark data.

    Columns: one per unsigned-int dtype (uint8/uint16/uint32) holding
    uniform random integers, plus a random boolean column.
    """
    int_dtypes = ["uint8", "uint16", "uint32"]
    data = {
        dtype: pa.array(np.random.randint(0, np.iinfo(dtype).max, size=n_rows))
        for dtype in int_dtypes
    }
    data["bool"] = pa.array(np.random.randint(0, 2, size=n_rows), type=pa.bool_())

    # TODO: column with string data?
    # https://stackoverflow.com/a/2257449
    return pa.table(data)


def write_table(table):
    """Write `table` to data/ once per (partition count, codec) combination.

    Each file is named "{n_partitions}-partition-{codec}.parquet" and is
    split into n_partitions row groups so the read benchmarks can vary
    row-group granularity.
    """
    # Create data directory
    Path("data").mkdir(exist_ok=True)

    data_len = len(table)
    for n_partitions in [1, 5, 20]:
        for compression in compressions:
            # Fix: the original used true division ("/"), yielding a float
            # (e.g. 50000.0); row_group_size is a row count and should be
            # an integer, so use floor division.
            row_group_size = data_len // n_partitions
            compression_text = str(compression).lower()
            fname = f"data/{n_partitions}-partition-{compression_text}.parquet"
            pq.write_table(
                table, fname, row_group_size=row_group_size, compression=compression
            )


def main():
    """Generate every benchmark fixture file under data/."""
    write_table(create_table())


if __name__ == "__main__":
    main()
162 changes: 162 additions & 0 deletions bench/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions bench/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Poetry project used only to generate the parquet-wasm benchmark
# fixtures (see make_data.py); it is not part of the published package.
[tool.poetry]
name = "parquet-wasm-bench"
version = "0.1.0"
description = "Create data for parquet-wasm benchmarks"
authors = ["Kyle Barron <[email protected]>"]
license = "MIT"

# Runtime dependencies for make_data.py (numpy for random data,
# pyarrow for Table/parquet writing; pandas is imported there too).
[tool.poetry.dependencies]
python = "^3.8"
numpy = "^1.22.3"
pyarrow = "^7.0.0"
pandas = "^1.4.2"

[tool.poetry.dev-dependencies]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"@types/node": "^17.0.21",
"@types/tape": "^4.13.2",
"apache-arrow": "^7.0.0",
"benny": "^3.7.1",
"gh-pages": "^3.2.3",
"tape": "^5.5.2",
"ts-node": "^10.7.0",
Expand Down
Loading

0 comments on commit 2e36920

Please sign in to comment.