forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fast table queries with interpolation search (huggingface#2122)
* add interpolation search * update dataset and formatting * update test_formatting * test interpolation search * docstrings * add benchmark * update benchmarks * add indexed table test
- Loading branch information
Showing
10 changed files
with
327 additions
and
106 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import json | ||
import os | ||
from dataclasses import dataclass | ||
|
||
import numpy as np | ||
import pyarrow as pa | ||
from utils import get_duration | ||
|
||
import datasets | ||
|
||
|
||
# Target dataset size: 100 billion rows, assembled from repeated fixed-size chunks.
SPEED_TEST_N_EXAMPLES = 100_000_000_000
# Row count of the single chunk that gets repeated to reach the target size.
SPEED_TEST_CHUNK_SIZE = 10_000

# Timings are written to results/<this script's name>.json next to this file.
RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))
|
||
|
||
def generate_100B_dataset(num_examples: int, chunk_size: int) -> datasets.Dataset:
    """Build a huge dataset by repeating one zero-filled chunk.

    A single Arrow table of ``chunk_size`` rows is concatenated
    ``num_examples // chunk_size`` times, so actual memory use stays
    proportional to the chunk, not to the full row count.
    """
    chunk = pa.Table.from_pydict({"col": [0] * chunk_size})
    n_chunks = num_examples // chunk_size
    full_table = pa.concat_tables([chunk] * n_chunks)
    return datasets.Dataset(full_table, fingerprint="table_100B")
|
||
|
||
@dataclass
class RandIter:
    """Deterministic iterable of ``size`` random integers drawn from [low, high).

    All values are sampled once at construction time from a seeded generator,
    so repeated iteration yields the same sequence and adds no RNG overhead.
    """

    low: int
    high: int
    size: int
    seed: int

    def __post_init__(self):
        # Materialize the whole sample up front for reproducibility.
        generator = np.random.default_rng(self.seed)
        samples = generator.integers(low=self.low, high=self.high, size=self.size)
        self._sampled_values = samples.tolist()

    def __iter__(self):
        yield from self._sampled_values

    def __len__(self):
        return self.size
|
||
|
||
@get_duration
def get_first_row(dataset: datasets.Dataset):
    """Time random access to the very first row."""
    first_row = dataset[0]  # noqa: F841 — fetch only, result discarded
|
||
|
||
@get_duration
def get_last_row(dataset: datasets.Dataset):
    """Time random access to the very last row."""
    last_row = dataset[-1]  # noqa: F841 — fetch only, result discarded
|
||
|
||
@get_duration
def get_batch_of_1024_rows(dataset: datasets.Dataset):
    """Time fetching a contiguous 1024-row slice from the middle of the dataset."""
    start = len(dataset) // 2
    batch = dataset[range(start, start + 1024)]  # noqa: F841 — fetch only
|
||
|
||
@get_duration
def get_batch_of_1024_random_rows(dataset: datasets.Dataset):
    """Time fetching 1024 rows at seeded random positions (scattered access)."""
    random_positions = RandIter(0, len(dataset), 1024, seed=42)
    batch = dataset[random_positions]  # noqa: F841 — fetch only
|
||
|
||
def benchmark_table_100B():
    """Run every indexing benchmark against a 100B-row dataset and save timings.

    Generates the dataset once, times each access pattern via its
    ``@get_duration``-wrapped function, and writes all timings as JSON to
    ``RESULTS_FILE_PATH`` (results/<script name>.json next to this file).
    """
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = (get_first_row, get_last_row, get_batch_of_1024_rows, get_batch_of_1024_random_rows)
    print("generating dataset")
    dataset = generate_100B_dataset(num_examples=SPEED_TEST_N_EXAMPLES, chunk_size=SPEED_TEST_CHUNK_SIZE)
    print("Functions")
    for func in functions:
        print(func.__name__)
        times[func.__name__] = func(dataset)

    # Create the results directory if missing; the bare open() used to fail otherwise.
    os.makedirs(os.path.dirname(RESULTS_FILE_PATH), exist_ok=True)
    # Write JSON in text mode instead of manually encoding bytes into a "wb" handle.
    with open(RESULTS_FILE_PATH, "w", encoding="utf-8") as f:
        json.dump(times, f)
|
||
|
||
# Script entry point: run all benchmarks when executed directly.
if __name__ == "__main__":  # useful to run the profiler
    benchmark_table_100B()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"num examples": 100000000000, "get_first_row": 0.00019991099999927542, "get_last_row": 5.4411000000698095e-05, "get_batch_of_1024_rows": 0.0004897069999998394, "get_batch_of_1024_random_rows": 0.01800621099999944} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.