benchmark: add suite

sirupsen · sirupsen · commit 36fd574289f1 · 2022-06-29T13:07:55.000Z
diff --git a/.gitignore b/.gitignore
@@ -133,6 +133,7 @@ ml-25m*
 ratings*.csv
 drive
 mysqltuner.pl
+benchmark_*.jsonl
 
 # Mac
 .DS_Store
diff --git a/README.md b/README.md
@@ -457,6 +457,13 @@ $ poetry run preql -f dev/prepare_db.pql bigquery:///<project>
 poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
 ```
 
+**6. Run benchmarks (optional)**
+
+```shell-session
+$ dev/benchmark.sh
+```
+
+
 # License
 
 [MIT License](https://github.com/datafold/data-diff/blob/master/LICENSE)
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -3,6 +3,7 @@
 
 from abc import ABC, abstractmethod
 import time
+import os
 from operator import attrgetter, methodcaller
 from collections import defaultdict
 from typing import List, Tuple, Iterator, Optional, Type
@@ -28,7 +29,7 @@
 logger = logging.getLogger("diff_tables")
 
 RECOMMENDED_CHECKSUM_DURATION = 10
-
+BENCHMARK = os.environ.get("BENCHMARK", False)  # raw env string when set, so any non-empty value (even "0") is truthy
 DEFAULT_BISECTION_THRESHOLD = 1024 * 16
 DEFAULT_BISECTION_FACTOR = 32
 
@@ -409,6 +410,16 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
             f"size: {table2.max_key-table1.min_key}"
         )
 
+        # When benchmarking, we want the ability to skip checksumming, so that
+        # we can measure the performance of downloading all rows and comparing
+        # them directly. By default, data-diff checksums a segment first (when
+        # it's below the threshold) and only _then_ downloads it.
+        if BENCHMARK:
+            max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
+            if max_rows_from_keys < self.bisection_threshold:
+                yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
+                return
+
         (count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
 
         if count1 == 0 and count2 == 0:
diff --git a/tests/common.py b/tests/common.py
@@ -3,6 +3,7 @@
 
 from data_diff import databases as db
 import logging
+import subprocess
 
 TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
 TEST_POSTGRESQL_CONN_STRING: str = None
@@ -14,6 +15,12 @@
 
 DEFAULT_N_SAMPLES = 50
 N_SAMPLES = int(os.environ.get("N_SAMPLES", DEFAULT_N_SAMPLES))
+BENCHMARK = os.environ.get("BENCHMARK", False)  # raw env string when set (not parsed to bool), else False
+
+def get_git_revision_short_hash() -> str:  # output of `git rev-parse --short HEAD`, decoded as ASCII and stripped
+    return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip()
+
+GIT_REVISION=get_git_revision_short_hash()  # runs git at import time; check_output raises if git exits non-zero
 
 level = logging.ERROR
 if os.environ.get("LOG_LEVEL", False):
diff --git a/tests/test_database_types.py b/tests/test_database_types.py