Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 36fd574

Browse files
committed
benchmark: add suite
1 parent d7ae027 commit 36fd574

File tree

5 files changed

+199
-46
lines changed

5 files changed

+199
-46
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ ml-25m*
133133
ratings*.csv
134134
drive
135135
mysqltuner.pl
136+
benchmark_*.jsonl
136137

137138
# Mac
138139
.DS_Store

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,13 @@ $ poetry run preql -f dev/prepare_db.pql bigquery:///<project>
457457
poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
458458
```
459459

460+
**6. Run benchmarks (optional)**
461+
462+
```shell-session
463+
$ dev/benchmark.sh
464+
```
465+
466+
460467
# License
461468

462469
[MIT License](https://github.com/datafold/data-diff/blob/master/LICENSE)

data_diff/diff_tables.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from abc import ABC, abstractmethod
55
import time
6+
import os
67
from operator import attrgetter, methodcaller
78
from collections import defaultdict
89
from typing import List, Tuple, Iterator, Optional, Type
@@ -28,7 +29,7 @@
2829
logger = logging.getLogger("diff_tables")
2930

3031
RECOMMENDED_CHECKSUM_DURATION = 10
31-
32+
# Benchmark mode is opt-in through the BENCHMARK environment variable. The raw
# value is parsed into a real bool because any non-empty string is truthy in
# Python, so BENCHMARK=0 or BENCHMARK=false would otherwise enable it.
BENCHMARK = os.environ.get("BENCHMARK", "").strip().lower() not in ("", "0", "false", "no", "off")
3233
DEFAULT_BISECTION_THRESHOLD = 1024 * 16
3334
DEFAULT_BISECTION_FACTOR = 32
3435

@@ -409,6 +410,16 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
409410
f"size: {table2.max_key-table1.min_key}"
410411
)
411412

413+
# When benchmarking, we want the ability to skip checksumming. This
414+
# lets us download all rows directly so we can compare performance. By
415+
# default, data-diff will checksum the section first (when it's below
416+
# the threshold) and _then_ download it.
417+
if BENCHMARK:
418+
max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
419+
if max_rows_from_keys < self.bisection_threshold:
420+
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
421+
return
422+
412423
(count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
413424

414425
if count1 == 0 and count2 == 0:

tests/common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from data_diff import databases as db
55
import logging
6+
import subprocess
67

78
TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
89
TEST_POSTGRESQL_CONN_STRING: str = None
@@ -14,6 +15,12 @@
1415

1516
DEFAULT_N_SAMPLES = 50
1617
N_SAMPLES = int(os.environ.get("N_SAMPLES", DEFAULT_N_SAMPLES))
18+
# Benchmark mode is opt-in through the BENCHMARK environment variable. The raw
# value is parsed into a real bool because any non-empty string is truthy in
# Python, so BENCHMARK=0 or BENCHMARK=false would otherwise enable it.
BENCHMARK = os.environ.get("BENCHMARK", "").strip().lower() not in ("", "0", "false", "no", "off")
19+
20+
def get_git_revision_short_hash() -> str:
    """Return the abbreviated hash of the current git HEAD commit.

    Runs ``git rev-parse --short HEAD`` in the current working directory and
    returns its output with surrounding whitespace removed.

    Returns ``"unknown"`` instead of raising when the revision cannot be
    determined (git is not installed, or the working directory is not inside
    a git repository with at least one commit), so that merely importing this
    module does not fail when revision metadata is unavailable.
    """
    try:
        output = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            stderr=subprocess.DEVNULL,  # keep git's "fatal: ..." noise out of test output
        )
    except (OSError, subprocess.CalledProcessError):
        # OSError: the git executable could not be started.
        # CalledProcessError: git ran but exited non-zero (e.g. not a repository).
        return "unknown"
    return output.decode("ascii").strip()
22+
23+
GIT_REVISION=get_git_revision_short_hash()
1724

1825
level = logging.ERROR
1926
if os.environ.get("LOG_LEVEL", False):

0 commit comments

Comments
 (0)