Skip to content

Commit

Permalink
Create arrow1/arrow2 read benchmarks (#82)
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebarron authored Apr 26, 2022
1 parent 834927c commit 2e36920
Show file tree
Hide file tree
Showing 6 changed files with 535 additions and 1 deletion.
56 changes: 56 additions & 0 deletions bench/bench.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import b from "benny";
import * as arrow1 from "../pkg/node/arrow1";
import * as arrow2 from "../pkg/node/arrow2";
import { readFileSync } from "fs";

// Directory holding the .parquet benchmark fixtures (produced by make_data.py,
// which writes files named "{partitions}-partition-{compression}.parquet").
const dataDir = `${__dirname}/data`;

// https://stackoverflow.com/a/43053803
/**
 * Cartesian product of the given arrays, e.g.
 * cartesian([1, 2], ["x"]) -> [[1, "x"], [2, "x"]].
 *
 * Fix: the original callback parameters were named `a` and `b`, shadowing
 * both the outer rest parameter `a` and the `b` (benny) module import.
 * Also annotates the previously implicit-`any` signature.
 */
const cartesian = (...arrays: any[][]): any[][] =>
  arrays.reduce((acc, curr) =>
    acc.flatMap((prefix) => curr.map((item) => [prefix, item].flat()))
  );

// Benchmark matrix dimensions. NOTE(review): the Python generator also
// writes "zstd" fixtures, but only these four codecs are benchmarked here.
const compressions = ["brotli", "gzip", "none", "snappy"];
const partitions = [1, 5, 20];
const apis = ["arrow1", "arrow2"];

// Every [partitions, compression, api] combination under test.
const testCases: [number, string, "arrow1" | "arrow2"][] =
  cartesian(partitions, compressions, apis);

/**
 * Build one benny benchmark per [partitions, compression, api] test case.
 *
 * The outer function handed to b.add runs once as setup (loading the
 * fixture bytes into memory); the closure it returns is what benny
 * actually times, so file I/O is excluded from the measurement.
 */
const createReadTests = () =>
  testCases.map(([nPartitions, codec, api]) => {
    const fixture = `${nPartitions}-partition-${codec}`;
    return b.add(`${api} ${fixture}`, () => {
      const bytes = loadFile(fixture);
      return api === "arrow1"
        ? () => arrow1.readParquet(bytes)
        : () => arrow2.readParquet2(bytes);
    });
  });

/**
 * Read the named parquet fixture from the data directory into memory.
 *
 * @param name fixture basename, without the ".parquet" extension
 * @returns the file's contents as a Uint8Array
 */
function loadFile(name: string): Uint8Array {
  const buffer = readFileSync(`${dataDir}/${name}.parquet`);
  return new Uint8Array(buffer);
}

// Run all read benchmarks as one benny suite, print per-case results
// (b.cycle/b.complete), and persist them as an HTML chart under
// bench/results/.
b.suite(
  "Read Parquet",

  ...createReadTests(),

  b.cycle(),
  b.configure({ minDisplayPrecision: 2 }),
  b.complete(),
  b.save({
    file: "bench",
    folder: "bench/results/",
    version: "0.3.0",
    details: true,
    format: "chart.html",
  })
);
46 changes: 46 additions & 0 deletions bench/make_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Parquet compression codecs each fixture is written with.
compressions = ["SNAPPY", "GZIP", "BROTLI", "ZSTD", "NONE"]


def create_table(n_rows=1_000_000):
    """Build a pyarrow Table of random benchmark data.

    Columns: one per unsigned-int dtype (uint8/uint16/uint32) holding
    uniform random integers, plus a random boolean column.
    """
    int_dtypes = ["uint8", "uint16", "uint32"]
    data = {
        dtype: pa.array(np.random.randint(0, np.iinfo(dtype).max, size=n_rows))
        for dtype in int_dtypes
    }
    data["bool"] = pa.array(np.random.randint(0, 2, size=n_rows), type=pa.bool_())

    # TODO: column with string data?
    # https://stackoverflow.com/a/2257449
    return pa.table(data)


def write_table(table):
    """Write `table` to data/ once per (partition count, codec) combination.

    Each file is named "{n_partitions}-partition-{codec}.parquet" and is
    split into n_partitions row groups so the read benchmarks can vary
    row-group granularity.
    """
    # Create data directory
    Path("data").mkdir(exist_ok=True)

    data_len = len(table)
    for n_partitions in [1, 5, 20]:
        for compression in compressions:
            # Fix: the original used true division ("/"), yielding a float
            # (e.g. 50000.0); row_group_size is a row count and should be
            # an integer, so use floor division.
            row_group_size = data_len // n_partitions
            compression_text = str(compression).lower()
            fname = f"data/{n_partitions}-partition-{compression_text}.parquet"
            pq.write_table(
                table, fname, row_group_size=row_group_size, compression=compression
            )


def main():
    """Generate every benchmark fixture file under data/."""
    write_table(create_table())


if __name__ == "__main__":
    main()
162 changes: 162 additions & 0 deletions bench/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions bench/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Poetry project used only to generate the parquet-wasm benchmark
# fixtures (see make_data.py); it is not part of the published package.
[tool.poetry]
name = "parquet-wasm-bench"
version = "0.1.0"
description = "Create data for parquet-wasm benchmarks"
authors = ["Kyle Barron <[email protected]>"]
license = "MIT"

# Runtime dependencies for make_data.py (numpy for random data,
# pyarrow for Table/parquet writing; pandas is imported there too).
[tool.poetry.dependencies]
python = "^3.8"
numpy = "^1.22.3"
pyarrow = "^7.0.0"
pandas = "^1.4.2"

[tool.poetry.dev-dependencies]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"@types/node": "^17.0.21",
"@types/tape": "^4.13.2",
"apache-arrow": "^7.0.0",
"benny": "^3.7.1",
"gh-pages": "^3.2.3",
"tape": "^5.5.2",
"ts-node": "^10.7.0",
Expand Down
Loading

0 comments on commit 2e36920

Please sign in to comment.