Skip to content
This repository has been archived by the owner on May 17, 2024. It is now read-only.

benchmark: add suite #125

Merged
merged 1 commit into from
Jun 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ ml-25m*
ratings*.csv
drive
mysqltuner.pl
benchmark_*.jsonl

# Mac
.DS_Store
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,13 @@ $ poetry run preql -f dev/prepare_db.pql bigquery:///<project>
poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
```

**6. Run benchmarks (optional)**

```shell-session
$ dev/benchmark.sh
```


# License

[MIT License](https://github.com/datafold/data-diff/blob/master/LICENSE)
Expand Down
13 changes: 12 additions & 1 deletion data_diff/diff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from abc import ABC, abstractmethod
import time
import os
from operator import attrgetter, methodcaller
from collections import defaultdict
from typing import List, Tuple, Iterator, Optional, Type
Expand All @@ -28,7 +29,7 @@
logger = logging.getLogger("diff_tables")

# NOTE(review): the unit is not visible in this excerpt -- presumably seconds; confirm.
RECOMMENDED_CHECKSUM_DURATION = 10

# Benchmark-mode switch, read once at import time from the BENCHMARK
# environment variable. It is parsed into a real boolean: an unset or empty
# variable and the usual "off" spellings (0 / false / no / off) disable it,
# anything else enables it. A plain os.environ.get() would return the raw
# string, and a non-empty string such as "0" is truthy.
BENCHMARK = os.environ.get("BENCHMARK", "").strip().lower() not in ("", "0", "false", "no", "off")
DEFAULT_BISECTION_THRESHOLD = 1024 * 16
DEFAULT_BISECTION_FACTOR = 32

Expand Down Expand Up @@ -409,6 +410,16 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
f"size: {table2.max_key-table1.min_key}"
)

# When benchmarking, we want the ability to skip checksumming. This lets
# us download all rows directly, so that the performance of that approach
# can be compared against the default one. By default, data-diff will
# checksum the section first (when it's below the threshold) and only
# _then_ download it.
if BENCHMARK:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW I don't think it would be crazy to make this the default...

I ran the benchmark with the default bisection threshold, and checksumming / downloading with the default threshold and ~10k rows is extremely competitive. And the current approach is twice as slow when there is a difference.

max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
if max_rows_from_keys < self.bisection_threshold:
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
return

(count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])

if count1 == 0 and count2 == 0:
Expand Down
7 changes: 7 additions & 0 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from data_diff import databases as db
import logging
import subprocess

TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
TEST_POSTGRESQL_CONN_STRING: str = None
Expand All @@ -14,6 +15,12 @@

DEFAULT_N_SAMPLES = 50
# N_SAMPLES can be overridden through the environment; the value must parse
# as an integer.
N_SAMPLES = int(os.environ.get("N_SAMPLES", DEFAULT_N_SAMPLES))
# Benchmark-mode switch, parsed into a real boolean: an unset or empty
# variable and the usual "off" spellings (0 / false / no / off) disable it,
# anything else enables it. A plain os.environ.get() would return the raw
# string, and a non-empty string such as "0" is truthy.
BENCHMARK = os.environ.get("BENCHMARK", "").strip().lower() not in ("", "0", "false", "no", "off")

def get_git_revision_short_hash() -> str:
    """Return the abbreviated hash of the current git ``HEAD`` commit.

    Runs ``git rev-parse --short HEAD`` and returns its output decoded as
    ASCII with surrounding whitespace stripped.

    Returns:
        The short commit hash, or ``"unknown"`` when the command cannot be
        started or exits with an error (for example ``git`` is not
        installed, or the working directory is not inside a git checkout).
    """
    command = ["git", "rev-parse", "--short", "HEAD"]
    try:
        # Discard git's stderr so a failure does not clutter the output.
        raw = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except (OSError, subprocess.CalledProcessError):
        # This function is called at import time (GIT_REVISION below), so an
        # unavailable git must not make the whole module unimportable.
        return "unknown"
    return raw.decode("ascii").strip()


# Resolved once, when this module is imported.
GIT_REVISION = get_git_revision_short_hash()

level = logging.ERROR
if os.environ.get("LOG_LEVEL", False):
Expand Down
Loading