From d2e2822f0ab651449b864343a553f0937445da78 Mon Sep 17 00:00:00 2001 From: Nick Carchedi Date: Fri, 17 May 2024 05:46:31 -0600 Subject: [PATCH 1/2] Update README.md --- README.md | 229 +----------------------------------------------------- 1 file changed, 3 insertions(+), 226 deletions(-) diff --git a/README.md b/README.md index 5da51054..17b472ef 100644 --- a/README.md +++ b/README.md @@ -1,238 +1,15 @@ -

- Datafold -

+### ⚠️ As of May 17, 2024, Datafold is no longer actively supporting or developing open source data-diff. We’re grateful to everyone who made contributions along the way. Please see [our blog post](https://www.datafold.com/blog/sunsetting-open-source-data-diff) for additional context on this decision. -

-data-diff: Compare datasets fast, within or across SQL databases +--- -![data-diff-logo](docs/data-diff-logo.png) -

-
- -> [Join our live virtual lab series to learn how to set it up!](https://www.datafold.com/virtual-hands-on-lab) - -# What's a Data Diff? -A data diff is the value-level comparison between two tables—used to identify critical changes to your data and guarantee data quality. - -There is a lot you can do with data-diff: you can test SQL code by comparing development or staging environment data to production, or compare source and target data to identify discrepancies when moving data between databases. - -# data-diff OSS & Datafold Cloud -data-diff is an open source utility for running stateless diffs as a great single player experience. - - - -Scale up with [Datafold Cloud](https://www.datafold.com/) to make data diffing a company-wide experience to both supercharge your data diffing CLI experience (ex: data-diff --dbt --cloud) and run diffs manually in your CI process and within the Datafold UI. This includes [column-level lineage](https://www.datafold.com/column-level-lineage) with BI tool integrations, [CI testing](https://docs.datafold.com/deployment_testing/how_it_works/), faster cross-database diffing, and diff history. - -# Use Cases - -### Data Development Testing -When developing SQL code, data-diff helps you validate and preview changes by comparing data between development/staging environments and production. Here's how it works: -1. Make a change to your SQL code -2. Run the SQL code to create a new dataset -3. Compare this dataset with its production version or other iterations - -### Data Migration & Replication Testing -data-diff is a powerful tool for comparing data when you're moving it between systems. Use it to ensure data accuracy and identify discrepancies during tasks like: -- **Migrating** to a new data warehouse (e.g., Oracle -> Snowflake) -- **Validating SQL transformations** from legacy solutions (e.g., stored procedures) to new transformation frameworks (e.g., dbt) -- Continuously **replicating data** from an OLTP database to OLAP data warehouse (e.g., MySQL -> Redshift) - -# dbt Integration -

- dbt -

- -data-diff integrates with [dbt Core](https://github.com/dbt-labs/dbt-core) to seamlessly compare local development to production datasets. - -Learn more about how data-diff works with dbt: -* Read our docs to get started with [data-diff & dbt](https://docs.datafold.com/development_testing/cli) or :eyes: **watch the [4-min demo video](https://www.loom.com/share/ad3df969ba6b4298939efb2fbcc14cde)** -* dbt Cloud users should check out [Datafold's out-of-the-box deployment testing integration](https://www.datafold.com/data-deployment-testing) -* Get support from the dbt Community Slack in [#tools-datafold](https://getdbt.slack.com/archives/C03D25A92UU) - - -# Getting Started - -### ⚡ Validating dbt model changes between dev and prod -Looking to use data-diff in dbt development? - -Development testing with Datafold enables you to see the impact of dbt code changes on data as you write the code, whether in your IDE or CLI. - - Head over to [our `data-diff` + `dbt` documentation](https://docs.datafold.com/development_testing/cli) to get started with a development testing workflow! - -### 🔀 Compare data tables between databases -1. Install `data-diff` with adapters - -To compare data between databases, install `data-diff` with specific database adapters. For example, install it for PostgreSQL and Snowflake like this: - -``` -pip install data-diff 'data-diff[postgresql,snowflake]' -U -``` - -Additionally, you can install all open source supported database adapters as follows. -``` -pip install data-diff 'data-diff[all-dbs]' -U -``` - -2. Run `data-diff` with connection URIs - -Then, we compare tables between PostgreSQL and Snowflake using the hashdiff algorithm: - -```bash -data-diff \ - postgresql://:''@localhost:5432/ \ - \ - "snowflake://:@//?warehouse=&role=" \ -
\ - -k \ - -c \ - -w -``` -3. Set up your configuration - -You can use a `toml` configuration file to run your `data-diff` job. In this example, we compare tables between MotherDuck (hosted DuckDB) and Snowflake using the hashdiff algorithm: - -```toml -## DATABASE CONNECTION ## -[database.duckdb_connection] - driver = "duckdb" - # filepath = "datafold_demo.duckdb" # local duckdb file example - # filepath = "md:" # default motherduck connection example - filepath = "md:datafold_demo?motherduck_token=${motherduck_token}" # API token recommended for motherduck connection - -[database.snowflake_connection] - driver = "snowflake" - database = "DEV" - user = "sung" - password = "${SNOWFLAKE_PASSWORD}" # or "" - # the info below is only required for snowflake - account = "${ACCOUNT}" # by33919 - schema = "DEVELOPMENT" - warehouse = "DEMO" - role = "DEMO_ROLE" - -## RUN PARAMETERS ## -[run.default] - verbose = true - -## EXAMPLE DATA DIFF JOB ## -[run.demo_xdb_diff] - # Source 1 ("left") - 1.database = "duckdb_connection" - 1.table = "development.raw_orders" - - # Source 2 ("right") - 2.database = "snowflake_connection" - 2.table = "RAW_ORDERS" # note that snowflake table names are case-sensitive - - verbose = false -``` -4. Run your `data-diff` job - -Make sure to export relevant environment variables as needed. For example, we compare data based on the earlier configuration: - -```bash - -# export relevant environment variables, example below -export motherduck_token= - -# run the configured data-diff job -data-diff --conf datadiff.toml \ - --run demo_xdb_diff \ - -k "id" \ - -c status - -# output example -- 1, completed -+ 1, returned -``` - -5. Review the output - -After running your data-diff job, review the output to identify and analyze differences in your data. - -Check out [documentation](https://docs.datafold.com/reference/open_source/cli) for the full command reference. - -# Supported databases - -| Database | Status | Connection string | -|---------------|-------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| PostgreSQL >=10 | 🟢 | `postgresql://:@:5432/` | -| MySQL | 🟢 | `mysql://:@:5432/` | -| Snowflake | 🟢 | `"snowflake://[:]@//?warehouse=&role=[&authenticator=externalbrowser]"` | -| BigQuery | 🟢 | `bigquery:///` | -| Redshift | 🟢 | `redshift://:@:5439/` | -| DuckDB | 🟢 | `duckdb://` | -| MotherDuck | 🟢 | `duckdb://` | -| Microsoft SQL Server* | 🟢 | `mssql://:@//` | -| Oracle | 🟡 | `oracle://:@/servive_or_sid` | -| Presto | 🟡 | `presto://:@:8080/` | -| Databricks | 🟡 | `databricks://:@//` | -| Trino | 🟡 | `trino://:@:8080/` | -| Clickhouse | 🟡 | `clickhouse://:@:9000/` | -| Vertica | 🟡 | `vertica://:@:5433/` | - -*MS SQL Server support is limited, with known performance issues that are addressed in Datafold Cloud. - -* 🟢: Implemented and thoroughly tested. -* 🟡: Implemented, but not thoroughly tested yet. - -Your database not listed here? - -- Contribute a [new database adapter](https://github.com/datafold/data-diff/blob/master/docs/new-database-driver-guide.rst) – we accept pull requests! -- [Get in touch](https://www.datafold.com/demo) about enterprise support and adding new adapters and features - - -
- -# How it works - -`data-diff` efficiently compares data using two modes: - -**joindiff**: Ideal for comparing data within the same database, utilizing outer joins for efficient row comparisons. It relies on the database engine for computation and has consistent performance. - -**hashdiff**: Recommended for comparing datasets across different databases or large tables with minimal differences. It uses hashing and binary search, capable of diffing data across distinct database engines. - -
-Click here to learn more about joindiff and hashdiff - -### `joindiff` -* Recommended for comparing data within the same database -* Uses the outer join operation to diff the rows as efficiently as possible within the same database -* Fully relies on the underlying database engine for computation -* Requires both datasets to be queryable with a single SQL query -* Time complexity approximates JOIN operation and is largely independent of the number of differences in the dataset - -### `hashdiff`: -* Recommended for comparing datasets across different databases -* Can also be helpful in diffing very large tables with few expected differences within the same database -* Employs a divide-and-conquer algorithm based on hashing and binary search -* Can diff data across distinct database engines, e.g., PostgreSQL <> Snowflake -* Time complexity approximates COUNT(*) operation when there are few differences -* Performance degrades when datasets have a large number of differences - -
-
- -For detailed algorithm and performance insights, explore [here](https://github.com/datafold/data-diff/blob/master/docs/technical-explanation.md), or head to our docs to [learn more about how Datafold diffs data](https://docs.datafold.com/data_diff/how-datafold-diffs-data). +# data-diff: Compare datasets fast, within or across SQL databases ## Contributors -We thank everyone who contributed so far! - -We'd love to see your face here: [Contributing Instructions](CONTRIBUTING.md) - -
- -## Analytics - -* [Usage Analytics & Data Privacy](https://github.com/datafold/data-diff/blob/master/docs/usage_analytics.md) - -
- ## License This project is licensed under the terms of the [MIT License](https://github.com/datafold/data-diff/blob/master/LICENSE). From 94f79321a9371ac96303893b80ee03b5aaae8518 Mon Sep 17 00:00:00 2001 From: Nick Carchedi Date: Fri, 17 May 2024 05:47:56 -0600 Subject: [PATCH 2/2] increment version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 83ff5898..d377e5c5 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "data-diff" -version = "0.11.1" +version = "0.11.2" description = "Command-line tool and Python library to efficiently diff rows across two different databases." authors = ["Datafold "] license = "MIT"