Skip to content
This repository has been archived by the owner on May 17, 2024. It is now read-only.

Commit

Permalink
benchmark: add suite
Browse files Browse the repository at this point in the history
  • Loading branch information
sirupsen committed Jun 29, 2022
1 parent d7ae027 commit 36fd574
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 46 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ ml-25m*
ratings*.csv
drive
mysqltuner.pl
benchmark_*.jsonl

# Mac
.DS_Store
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,13 @@ $ poetry run preql -f dev/prepare_db.pql bigquery:///<project>
poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
```

**6. Run benchmarks (optional)**

```shell-session
$ dev/benchmark.sh
```


# License

[MIT License](https://github.com/datafold/data-diff/blob/master/LICENSE)
Expand Down
13 changes: 12 additions & 1 deletion data_diff/diff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from abc import ABC, abstractmethod
import time
import os
from operator import attrgetter, methodcaller
from collections import defaultdict
from typing import List, Tuple, Iterator, Optional, Type
Expand All @@ -28,7 +29,7 @@
logger = logging.getLogger("diff_tables")

RECOMMENDED_CHECKSUM_DURATION = 10

# Benchmark mode is opt-in through the BENCHMARK environment variable. The raw
# value is parsed into a bool because any non-empty string is truthy in Python,
# so BENCHMARK=0 or BENCHMARK=false would otherwise switch benchmark mode on.
BENCHMARK = os.environ.get("BENCHMARK", "").strip().lower() not in ("", "0", "false", "no", "off")
DEFAULT_BISECTION_THRESHOLD = 1024 * 16
DEFAULT_BISECTION_FACTOR = 32

Expand Down Expand Up @@ -409,6 +410,16 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
f"size: {table2.max_key-table1.min_key}"
)

# When benchmarking, we want the ability to skip checksumming. This
# allows us to download all rows for comparison in performance. By
# default, data-diff will checksum the section first (when it's below
# the threshold) and _then_ download it.
if BENCHMARK:
max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
if max_rows_from_keys < self.bisection_threshold:
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
return

(count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])

if count1 == 0 and count2 == 0:
Expand Down
7 changes: 7 additions & 0 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from data_diff import databases as db
import logging
import subprocess

TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
TEST_POSTGRESQL_CONN_STRING: str = None
Expand All @@ -14,6 +15,12 @@

DEFAULT_N_SAMPLES = 50
# N_SAMPLES can be overridden from the environment; int() accepts both the
# string read from the environment and the integer default.
N_SAMPLES = int(os.environ.get("N_SAMPLES", DEFAULT_N_SAMPLES))
# Parse the flag into a real bool: any non-empty string is truthy in Python,
# so BENCHMARK=0 or BENCHMARK=false would otherwise enable benchmark mode.
BENCHMARK = os.environ.get("BENCHMARK", "").strip().lower() not in ("", "0", "false", "no", "off")

def get_git_revision_short_hash() -> str:
    """Return the abbreviated hash of the current git ``HEAD`` commit.

    Runs ``git rev-parse --short HEAD`` in the current working directory.

    Returns:
        The short commit hash, or ``"unknown"`` when it cannot be determined
        (for example git is not installed, or the command exits with an
        error because the directory is not a git checkout).
    """
    try:
        output = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            # Keep git's "fatal: not a git repository" noise out of test output.
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing git executable (FileNotFoundError);
        # CalledProcessError covers a non-zero exit status.
        return "unknown"
    return output.decode("ascii").strip() or "unknown"


# Evaluated at import time, so the lookup above must never raise: a failure
# here would make this whole module unimportable.
GIT_REVISION = get_git_revision_short_hash()

level = logging.ERROR
if os.environ.get("LOG_LEVEL", False):
Expand Down
Loading

0 comments on commit 36fd574

Please sign in to comment.