From afa683fbd5ea9e2a1a1074f3da6329471d76cf0e Mon Sep 17 00:00:00 2001 From: Ally Franken <108901375+allyfranken@users.noreply.github.com> Date: Wed, 22 Jan 2025 14:45:41 -0500 Subject: [PATCH 1/3] Addition of Hydra's Helper Functions (#371) * dummy commit to tests permissions and torchy * add helper functions and unit test cases * remove unnecessary helper functions * moving to helper * update naming convetion, add docstrings for functions * update unit tests to pass, update naming to use 'base' and 'compare' * update lint fixes * update for linting * reformat with ruff format * updating compare_by_row signature * updating compare_by_row signature * bump version --------- Co-authored-by: Faisal Dosani Co-authored-by: Faisal Dosani --- datacompy/__init__.py | 2 +- datacompy/spark/helper.py | 245 ++++++++++++++++++++++++ tests/test_spark/test_helper.py | 298 +++++++++++++++++++++++++++++ tests/test_spark/test_sql_spark.py | 2 +- 4 files changed, 545 insertions(+), 2 deletions(-) create mode 100644 datacompy/spark/helper.py create mode 100644 tests/test_spark/test_helper.py diff --git a/datacompy/__init__.py b/datacompy/__init__.py index 8f5651c6..d62c2454 100644 --- a/datacompy/__init__.py +++ b/datacompy/__init__.py @@ -18,7 +18,7 @@ Then extended to carry that functionality over to Spark Dataframes. """ -__version__ = "0.16.0" +__version__ = "0.16.1" import platform from warnings import warn diff --git a/datacompy/spark/helper.py b/datacompy/spark/helper.py new file mode 100644 index 00000000..72fcb3a3 --- /dev/null +++ b/datacompy/spark/helper.py @@ -0,0 +1,245 @@ +# +# Copyright 2025 Capital One Services, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Helper function module contributed by Capital One's Hydra Team. + +Helper functions to assist in specific usecases where there is no columns to join +and use the row order of the datasets. +""" + +from datacompy.logger import INFO, get_logger +from datacompy.spark.sql import SparkSQLCompare + +LOG = get_logger(__name__, INFO) + +try: + import pyspark.sql + from pyspark.sql import Window + from pyspark.sql import types as T + from pyspark.sql.functions import col, format_number, row_number +except ImportError: + LOG.warning( + "Please note that you are missing the optional dependency: spark. " + "If you need to use this functionality it must be installed." + ) + + +def compare_by_row( + spark_session: "pyspark.sql.SparkSession", + base_dataframe: "pyspark.sql.DataFrame", + compare_dataframe: "pyspark.sql.DataFrame", + string2double_cols: list[str] | None, + abs_tol: float = 0, + rel_tol: float = 0, + df1_name: str = "df1", + df2_name: str = "df2", + ignore_spaces: bool = False, + ignore_case: bool = False, + cast_column_names_lower: bool = True, +) -> SparkSQLCompare: + """Run a detailed analysis on specific usecases where there is no columns to join and use the row order of the datasets. 
+ + If you know which columns to join on then please use ``SparkSQLCompare`` directly as this is meant to help + support very specific helper usecases using row order contributed by Capital One's Hydra Team. + + Parameters + ---------- + spark_session : pyspark.sql.SparkSession + A ``SparkSession`` to be used to execute Spark commands in the comparison. + base_dataframe: pyspark.sql.DataFrame + Dataset to be compared against + compare_dataframe: pyspark.sql.DataFrame + dataset to compare + string2double_cols: List[str], optional + The columns that contain numeric values but are stored as string types + abs_tol : float, optional + Absolute tolerance between two values. + rel_tol : float, optional + Relative tolerance between two values. + df1_name : str, optional + A string name for the first dataframe. This allows the reporting to + print out an actual name instead of "df1", and allows human users to + more easily track the dataframes. + df2_name : str, optional + A string name for the second dataframe + ignore_spaces : bool, optional + Flag to strip whitespace (including newlines) from string columns (including any join + columns) + ignore_case : bool, optional + Flag to ignore the case of string columns + cast_column_names_lower: bool, optional + Boolean indicator that controls of column names will be cast into lower case + + Returns + ------- + datacompy.spark.sql.SparkSQLCompare + """ + # Convert fields that contain numeric values stored as strings to numeric types for comparison + if len(string2double_cols) != 0: + base_dataframe = handle_numeric_strings(base_dataframe, string2double_cols) + compare_dataframe = handle_numeric_strings( + compare_dataframe, string2double_cols + ) + + sorted_base_df, sorted_compare_df = sort_rows(base_dataframe, compare_dataframe) + column_to_join = ["row"] + + LOG.info("Compared by column(s): ", column_to_join) + if string2double_cols: + LOG.info( + "String column(s) cast to doubles for numeric comparison: ", + string2double_cols, + ) + return SparkSQLCompare( + spark_session=spark_session, + df1=sorted_base_df, + df2=sorted_compare_df, + join_columns=column_to_join, + abs_tol=abs_tol, + rel_tol=rel_tol, + df1_name=df1_name, + df2_name=df2_name, + ignore_spaces=ignore_spaces, + ignore_case=ignore_case, + cast_column_names_lower=cast_column_names_lower, + ) + + +def handle_numeric_strings( + df: "pyspark.sql.DataFrame", field_list: list[str] +) -> "pyspark.sql.DataFrame": + """Convert columns in field_list from numeric strings to DoubleType. + + Parameters + ---------- + df: pyspark.sql.DataFrame + The DataFrame to be converted + field_list: List[str] + List of StringType columns to be converted to DoubleType + + Returns + ------- + pyspark.sql.DataFrame + """ + for this_col in field_list: + df = df.withColumn(this_col, col(this_col).cast(T.DoubleType())) + return df + + +def sort_rows( + base_df: "pyspark.sql.DataFrame", compare_df: "pyspark.sql.DataFrame" +) -> "pyspark.sql.DataFrame": + """Add new column to each DataFrame that numbers the rows, so they can be compared by row number. 
+ + Parameters + ---------- + base_df: pyspark.sql.DataFrame + The base DataFrame to be sorted + compare_df: pyspark.sql.DataFrame + The compare DataFrame to be sorted + + Returns + ------- + pyspark.sql.DataFrame, pyspark.sql.DataFrame + + + """ + base_cols = base_df.columns + compare_cols = compare_df.columns + + # Ensure both DataFrames have the same columns + for x in base_cols: + if x not in compare_cols: + raise Exception( + f"{x} is present in base_df but does not exist in compare_df" + ) + + if set(base_cols) != set(compare_cols): + LOG.warning( + "WARNING: There are columns present in Compare df that do not exist in Base df. " + "The Base df columns will be used for row-wise sorting and may produce unanticipated " + "report output if the extra fields are not null." + ) + + w = Window.orderBy(*base_cols) + sorted_base_df = base_df.select("*", row_number().over(w).alias("row")) + sorted_compare_df = compare_df.select("*", row_number().over(w).alias("row")) + return sorted_base_df, sorted_compare_df + + +def sort_columns( + base_df: "pyspark.sql.DataFrame", compare_df: "pyspark.sql.DataFrame" +) -> "pyspark.sql.DataFrame": + """Sort both DataFrames by their columns to ensure consistent order. + + Parameters + ---------- + base_df: pyspark.sql.DataFrame + The base DataFrame to be sorted + compare_df: pyspark.sql.DataFrame + The compare DataFrame to be sorted + + Returns + ------- + pyspark.sql.DataFrame, pyspark.sql.DataFrame + """ + # Ensure both DataFrames have the same columns + common_columns = set(base_df.columns) + for x in common_columns: + if x not in compare_df.columns: + raise Exception( + f"{x} is present in base_df but does not exist in compare_df" + ) + # Sort both DataFrames to ensure consistent order + base_df = base_df.orderBy(*common_columns) + compare_df = compare_df.orderBy(*common_columns) + return base_df, compare_df + + +def format_numeric_fields(df: "pyspark.sql.DataFrame") -> "pyspark.sql.DataFrame": + """Round and truncate numeric fields to 5 decimal places. + + Parameters + ---------- + df: pyspark.sql.DataFrame + The DataFrame to be formatted + + Returns + ------- + pyspark.sql.DataFrame + """ + fixed_cols = [] + numeric_types = [ + "tinyint", + "smallint", + "int", + "bigint", + "float", + "double", + "decimal", + ] + + for c in df.dtypes: + # do not change non-numeric fields + if c[1] not in numeric_types: + fixed_cols.append(col(c[0])) + # round & truncate numeric fields + else: + new_val = format_number(col(c[0]), 5).alias(c[0]) + fixed_cols.append(new_val) + + formatted_df = df.select(*fixed_cols) + return formatted_df diff --git a/tests/test_spark/test_helper.py b/tests/test_spark/test_helper.py new file mode 100644 index 00000000..6eed4daa --- /dev/null +++ b/tests/test_spark/test_helper.py @@ -0,0 +1,298 @@ +# +# Copyright 2025 Capital One Services, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Testing out the spark helper functionality +""" + +import logging +import sys + +import pytest + +pytest.importorskip("pyspark") +if sys.version_info >= (3, 12): + pytest.skip("unsupported python version", allow_module_level=True) + + +from datacompy.spark.helper import ( + compare_by_row, + format_numeric_fields, + handle_numeric_strings, + sort_columns, + sort_rows, +) +from datacompy.spark.sql import SparkSQLCompare +from pyspark.sql.types import ( + IntegerType, + StringType, + StructField, + StructType, +) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + + +def test_detailed_compare_with_string2columns(spark_session): + # create mock data + mock_base_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + mock_base_columns = ["name", "age", "pet"] + + mock_compare_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + mock_compare_columns = ["name", "age", "pet"] + + # Create DataFrames + mock_base_df = spark_session.createDataFrame(mock_base_data, mock_base_columns) + mock_compare_df = spark_session.createDataFrame( + mock_compare_data, mock_compare_columns + ) + + # call detailed_compare + result_compared_data = compare_by_row( + spark_session=spark_session, + base_dataframe=mock_base_df, + compare_dataframe=mock_compare_df, + string2double_cols=["age"], + ) + + # assert result + assert isinstance(result_compared_data, SparkSQLCompare) + assert result_compared_data.matches() + + +def test_detailed_compare_with_column_to_join(spark_session): + # create mock data + mock_base_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + mock_base_columns = ["name", "age", "pet"] + + mock_compare_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + mock_compare_columns = ["name", "age", "pet"] + + # Create DataFrames + mock_base_df = spark_session.createDataFrame(mock_base_data, mock_base_columns) + mock_compare_df = spark_session.createDataFrame( + mock_compare_data, mock_compare_columns + ) + + # call detailed_compare + result_compared_data = compare_by_row( + spark_session=spark_session, + base_dataframe=mock_base_df, + compare_dataframe=mock_compare_df, + string2double_cols=[], + ) + + # assert result + assert result_compared_data.matches() + assert isinstance(result_compared_data, SparkSQLCompare) + + +def test_handle_numeric_strings(spark_session): + # create mock_df + mock_data = [("bob", "22", "dog"), ("alice", "19", "cat"), ("john", "70", "bunny")] + mock_columns = ["name", "age", "pet"] + mock_df = spark_session.createDataFrame(mock_data, mock_columns) + + # create mock field_list + mock_field_list = ["age"] + + # call handle_numeric_strings + result_df = handle_numeric_strings(mock_df, mock_field_list) + + # create expected dataframe + expected_data = [ + ("bob", 22.0, "dog"), + ("alice", 19.0, "cat"), + ("john", 70.0, "bunny"), + ] + expected_columns = ["name", "age", "pet"] + expected_df = spark_session.createDataFrame(expected_data, expected_columns) + + # assert calls + assert result_df.collect() == expected_df.collect() + + +def test_format_numeric_fields(spark_session): + # create mock dataframe + mock_data = [("bob", 22, "dog"), ("alice", 19, "cat"), ("john", 70, "bunny")] + mock_columns = ["name", "age", "pet"] + mock_df = spark_session.createDataFrame(mock_data, mock_columns) + + # call format_numeric_fields + formatted_df = format_numeric_fields(mock_df) + + # create expected dataframe + expected_data = 
[ + ("bob", "22.00000", "dog"), + ("alice", "19.00000", "cat"), + ("john", "70.00000", "bunny"), + ] + expected_columns = ["name", "age", "pet"] + expected_df = spark_session.createDataFrame(expected_data, expected_columns) + + # assert calls + assert formatted_df.collect() == expected_df.collect() + + +def test_sort_rows_failure(spark_session): + # create mock dataframes + input_base_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + columns_base = ["name", "age", "pet"] + + input_compare_data = [("19", "cat"), ("70", "bunny"), ("22", "dog")] + columns_commpare = ["age", "pet"] + + # Create DataFrames + input_base_df = spark_session.createDataFrame(input_base_data, columns_base) + input_compare_df = spark_session.createDataFrame( + input_compare_data, columns_commpare + ) + + # call call_rows + with pytest.raises( + Exception, match="name is present in base_df but does not exist in compare_df" + ): + sort_rows(input_base_df, input_compare_df) + + +def test_sort_rows_success(caplog, spark_session): + caplog.set_level(logging.WARNING) + + # create mock data + input_base_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + columns_base = ["name", "age", "pet"] + + input_compare_data = [ + ("19", "cat", "alice", "red"), + ("70", "bunny", "john", "black"), + ("22", "dog", "bob", "white"), + ] + columns_compare = ["age", "pet", "name", "color"] + + # create dataFrames + input_base_df = spark_session.createDataFrame(input_base_data, columns_base) + input_compare_df = spark_session.createDataFrame( + input_compare_data, columns_compare + ) + + # call sort_rows + sorted_base_df, sorted_compare_df = sort_rows(input_base_df, input_compare_df) + + # create expected base_dataframe + expected_base_data = [ + ("alice", "19", "cat", 1), + ("bob", "22", "dog", 2), + ("john", "70", "bunny", 3), + ] + expected_base_schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", StringType(), True), + StructField("pet", StringType(), True), + StructField("row", IntegerType(), True), + ] + ) + expected_base_df = spark_session.createDataFrame( + expected_base_data, expected_base_schema + ) + + # create expected compare_dataframe + expected_compare_data = [ + ("19", "cat", "alice", "red", 1), + ("22", "dog", "bob", "white", 2), + ("70", "bunny", "john", "black", 3), + ] + expected_compare_schema = StructType( + [ + StructField("age", StringType(), True), + StructField("pet", StringType(), True), + StructField("name", StringType(), True), + StructField("color", StringType(), True), + StructField("row", IntegerType(), True), + ] + ) + expected_compare_df = spark_session.createDataFrame( + expected_compare_data, expected_compare_schema + ) + + # assertions + assert sorted_base_df.collect() == expected_base_df.collect() + assert sorted_compare_df.collect() == expected_compare_df.collect() + assert ( + "WARNING: There are columns present in Compare df that do not exist in Base df. 
The Base df columns will be used for row-wise sorting and may produce unanticipated report output if the extra fields are not null.\n" + in caplog.text + ) + + +def test_sort_columns_failure(spark_session): + # create mock dataframes + input_base_data = [ + ("row1", "col2", "col3"), + ("row2", "col2", "col3"), + ("row3", "col2", "col3"), + ] + columns_1 = ["col1", "col2", "col3"] + + input_compare_data = [("row1", "col2"), ("row2", "col2"), ("row3", "col2")] + columns_2 = ["col1", "col2"] + + # Create DataFrames + input_base_df = spark_session.createDataFrame(input_base_data, columns_1) + input_compare_df = spark_session.createDataFrame(input_compare_data, columns_2) + + # call sort_columns + with pytest.raises( + Exception, match="col3 is present in base_df but does not exist in compare_df" + ): + sort_columns(input_base_df, input_compare_df) + + +def test_sort_columns_success(spark_session): + # Create sample DataFrames + data1 = [(1, "a"), (2, "b"), (3, "c")] + data2 = [(1, "a"), (2, "b"), (3, "c")] + columns = ["id", "value"] + + df1 = spark_session.createDataFrame(data1, columns) + df2 = spark_session.createDataFrame(data2, columns) + + # Test with matching columns + sorted_df1, sorted_df2 = sort_columns(df1, df2) + assert sorted_df1.columns == sorted_df2.columns + assert sorted_df1.collect() == sorted_df2.collect() diff --git a/tests/test_spark/test_sql_spark.py b/tests/test_spark/test_sql_spark.py index e93fbafc..5b2686f6 100644 --- a/tests/test_spark/test_sql_spark.py +++ b/tests/test_spark/test_sql_spark.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
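For reviewers, a minimal sketch of how the helper added in this patch is intended to be called. This is illustrative only and not part of the patch: it assumes an existing SparkSession named `spark`, uses made-up column and row values, and relies on `matches()` and `report()` from the returned `SparkSQLCompare` object.

    from datacompy.spark.helper import compare_by_row

    base_df = spark.createDataFrame(
        [("bob", "22", "dog"), ("alice", "19", "cat")], ["name", "age", "pet"]
    )
    compare_df = spark.createDataFrame(
        [("alice", "19", "cat"), ("bob", "22", "dog")], ["name", "age", "pet"]
    )

    # There is no reliable join key, so rows are matched by sort order.
    # "age" holds numeric values stored as strings, so it is cast to double
    # before comparison via string2double_cols.
    comparison = compare_by_row(
        spark_session=spark,
        base_dataframe=base_df,
        compare_dataframe=compare_df,
        string2double_cols=["age"],
    )
    print(comparison.matches())
    print(comparison.report())
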
From 2ae7d431bcbee876dc8a4750d4431d0eaa527348 Mon Sep 17 00:00:00 2001 From: Faisal Date: Wed, 5 Feb 2025 13:05:19 -0500 Subject: [PATCH 2/3] adding check for pyarrow strings in columns_equal (#375) * adding check for pyarrow strings in columns_equal * Update datacompy/core.py Co-authored-by: Gladys Teh <97971054+gladysteh99@users.noreply.github.com> * Update datacompy/core.py Co-authored-by: Gladys Teh <97971054+gladysteh99@users.noreply.github.com> --------- Co-authored-by: Gladys Teh <97971054+gladysteh99@users.noreply.github.com> --- datacompy/core.py | 18 ++++++++++++++---- tests/test_core.py | 23 +++++++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/datacompy/core.py b/datacompy/core.py index b0d10d77..64c5ebf6 100644 --- a/datacompy/core.py +++ b/datacompy/core.py @@ -380,8 +380,12 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None: "match_column": col_match, "match_cnt": match_cnt, "unequal_cnt": row_cnt - match_cnt, - "dtype1": str(self.df1[column].dtype), - "dtype2": str(self.df2[column].dtype), + "dtype1": str(self.df1[column].dtype.__repr__()) + if str(self.df1[column].dtype) == "string" + else str(self.df1[column].dtype), + "dtype2": str(self.df2[column].dtype.__repr__()) + if str(self.df2[column].dtype) == "string" + else str(self.df2[column].dtype), "all_match": all( ( self.df1[column].dtype == self.df2[column].dtype, @@ -847,8 +851,14 @@ def columns_equal( | (col_1.isnull() & col_2.isnull()) ) except Exception: - # Blanket exception should just return all False - compare = pd.Series(False, index=col_1.index) + # Check for string[pyarrow] and string[python] + if col_1.dtype in ( + "string[python]", + "string[pyarrow]", + ) and col_2.dtype in ("string[python]", "string[pyarrow]"): + compare = pd.Series(col_1.astype(str) == col_2.astype(str)) + else: # Blanket exception should just return all False + compare = pd.Series(False, index=col_1.index) compare.index = col_1.index return compare diff --git a/tests/test_core.py b/tests/test_core.py index b0c3647f..c14918e9 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -62,6 +62,29 @@ def test_numeric_columns_equal_rel(): assert_series_equal(expect_out, actual_out, check_names=False) +def test_string_pyarrow_columns_equal(): + data = """a|b|expected +Hi|Hi|True +Yo|Yo|True +Hey|Hey |False +résumé|resume|False +résumé|résumé|True +💩|💩|True +💩|🤔|False + | |True + | |False +datacompy|DataComPy|False +something||False +|something|False +||True""" + df = pd.read_csv(io.StringIO(data), sep="|") + actual_out = datacompy.columns_equal( + df.a.astype("string[python]"), df.b.astype("string[pyarrow]"), rel_tol=0.2 + ) + expect_out = df["expected"] + assert (actual_out == expect_out).all() + + def test_string_columns_equal(): data = """a|b|expected Hi|Hi|True From e80fb12aafe0f93d0401fbf90c65db2daeef45a3 Mon Sep 17 00:00:00 2001 From: Faisal Date: Wed, 5 Feb 2025 14:17:23 -0500 Subject: [PATCH 3/3] refactor logging (#376) * refactor logging * Update CODEOWNERS --- CODEOWNERS | 2 +- datacompy/__init__.py | 2 +- datacompy/base.py | 7 ++- datacompy/core.py | 6 +-- datacompy/fugue.py | 6 +-- datacompy/logger.py | 61 -------------------------- datacompy/polars.py | 6 +-- datacompy/snowflake.py | 6 +-- datacompy/spark/helper.py | 5 ++- datacompy/spark/legacy.py | 2 +- datacompy/spark/pandas.py | 2 +- datacompy/spark/sql.py | 6 +-- tests/test_core.py | 2 +- tests/test_fugue/test_duckdb.py | 2 +- tests/test_fugue/test_fugue_helpers.py | 2 +- tests/test_fugue/test_fugue_pandas.py | 
2 +- tests/test_fugue/test_fugue_polars.py | 2 +- tests/test_fugue/test_fugue_spark.py | 2 +- tests/test_polars.py | 2 +- tests/test_snowflake.py | 2 +- tests/test_spark/test_legacy_spark.py | 2 +- tests/test_spark/test_pandas_spark.py | 2 +- 22 files changed, 35 insertions(+), 96 deletions(-) delete mode 100644 datacompy/logger.py diff --git a/CODEOWNERS b/CODEOWNERS index a61d6a58..373f7d06 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @fdosani @ak-gupta @jdawang @gladysteh99 +* @fdosani @ak-gupta @jdawang @gladysteh99 @rhaffar diff --git a/datacompy/__init__.py b/datacompy/__init__.py index d62c2454..ea04402b 100644 --- a/datacompy/__init__.py +++ b/datacompy/__init__.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/datacompy/base.py b/datacompy/base.py index 27425da6..bf0305bf 100644 --- a/datacompy/base.py +++ b/datacompy/base.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,14 +21,13 @@ two dataframes. """ +import logging from abc import ABC, abstractmethod from typing import Any from ordered_set import OrderedSet -from datacompy.logger import INFO, get_logger - -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) class BaseCompare(ABC): diff --git a/datacompy/core.py b/datacompy/core.py index 64c5ebf6..3313b236 100644 --- a/datacompy/core.py +++ b/datacompy/core.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ two dataframes. """ +import logging import os from typing import Any, Dict, List, cast @@ -29,9 +30,8 @@ from ordered_set import OrderedSet from datacompy.base import BaseCompare, temp_column_name -from datacompy.logger import INFO, get_logger -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) class Compare(BaseCompare): diff --git a/datacompy/fugue.py b/datacompy/fugue.py index 57d097a9..5cbcc200 100644 --- a/datacompy/fugue.py +++ b/datacompy/fugue.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ """Compare two DataFrames that are supported by Fugue.""" +import logging import pickle from collections import defaultdict from typing import Any, Callable, Dict, Iterable, List, Tuple, cast @@ -23,9 +24,8 @@ from ordered_set import OrderedSet from datacompy.core import Compare, render -from datacompy.logger import INFO, get_logger -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) HASH_COL = "__datacompy__hash__" diff --git a/datacompy/logger.py b/datacompy/logger.py deleted file mode 100644 index 2efcb47e..00000000 --- a/datacompy/logger.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-Copyright: Copyright (c) Capital One Services, LLC -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2024 Capital One Services, LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Logging Module. - -Module which sets up the basic logging infrustrcuture for the application. -""" - -import logging -import sys - -# logger formating -BRIEF_FORMAT = "%(levelname)s %(asctime)s - %(name)s: %(message)s" -VERBOSE_FORMAT = ( - "%(levelname)s|%(asctime)s|%(name)s|%(filename)s|" - "%(funcName)s|%(lineno)d: %(message)s" -) -FORMAT_TO_USE = VERBOSE_FORMAT - -# logger levels -DEBUG = logging.DEBUG -INFO = logging.INFO -WARN = logging.WARNING -ERROR = logging.ERROR -CRITICAL = logging.CRITICAL - - -def get_logger(name=None, log_level=logging.DEBUG): - """Set the basic logging features for the application. - - Parameters - ---------- - name : str, optional - The name of the logger. Defaults to ``None`` - log_level : int, optional - The logging level. Defaults to ``logging.INFO`` - - Returns - ------- - logging.Logger - Returns a Logger obejct which is set with the passed in paramters. - Please see the following for more details: - https://docs.python.org/2/library/logging.html - """ - logging.basicConfig(format=FORMAT_TO_USE, stream=sys.stdout, level=log_level) - logging.captureWarnings(True) - logger = logging.getLogger(name) - return logger diff --git a/datacompy/polars.py b/datacompy/polars.py index 1197ae20..405901eb 100644 --- a/datacompy/polars.py +++ b/datacompy/polars.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ two dataframes. 
""" +import logging import os from copy import deepcopy from typing import Any, Dict, List, cast @@ -31,9 +32,8 @@ from polars.exceptions import ComputeError, InvalidOperationError from datacompy.base import BaseCompare, temp_column_name -from datacompy.logger import INFO, get_logger -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) STRING_TYPE = ["String", "Utf8"] DATE_TYPE = ["Date", "Datetime"] diff --git a/datacompy/snowflake.py b/datacompy/snowflake.py index 1cd3247a..a12e1325 100644 --- a/datacompy/snowflake.py +++ b/datacompy/snowflake.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ two dataframes. """ +import logging import os from concurrent.futures import ThreadPoolExecutor, as_completed from copy import deepcopy @@ -30,10 +31,9 @@ from ordered_set import OrderedSet from datacompy.base import BaseCompare -from datacompy.logger import INFO, get_logger from datacompy.spark.sql import decimal_comparator -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) try: import snowflake.snowpark as sp diff --git a/datacompy/spark/helper.py b/datacompy/spark/helper.py index 72fcb3a3..eb090220 100644 --- a/datacompy/spark/helper.py +++ b/datacompy/spark/helper.py @@ -20,10 +20,11 @@ and use the row order of the datasets. """ -from datacompy.logger import INFO, get_logger +import logging + from datacompy.spark.sql import SparkSQLCompare -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) try: import pyspark.sql diff --git a/datacompy/spark/legacy.py b/datacompy/spark/legacy.py index c32db3b7..6bddc956 100644 --- a/datacompy/spark/legacy.py +++ b/datacompy/spark/legacy.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/datacompy/spark/pandas.py b/datacompy/spark/pandas.py index 0886dd18..be6ef08a 100644 --- a/datacompy/spark/pandas.py +++ b/datacompy/spark/pandas.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/datacompy/spark/sql.py b/datacompy/spark/sql.py index 63e8cb26..f0574a1a 100644 --- a/datacompy/spark/sql.py +++ b/datacompy/spark/sql.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ two dataframes. 
""" +import logging import os from copy import deepcopy from typing import List, Tuple @@ -29,9 +30,8 @@ from ordered_set import OrderedSet from datacompy.base import BaseCompare, temp_column_name -from datacompy.logger import INFO, get_logger -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) try: import pyspark.sql diff --git a/tests/test_core.py b/tests/test_core.py index c14918e9..2f164b41 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_duckdb.py b/tests/test_fugue/test_duckdb.py index 77b05883..8d9ee41a 100644 --- a/tests/test_fugue/test_duckdb.py +++ b/tests/test_fugue/test_duckdb.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_fugue_helpers.py b/tests/test_fugue/test_fugue_helpers.py index cf51690f..784cfd04 100644 --- a/tests/test_fugue/test_fugue_helpers.py +++ b/tests/test_fugue/test_fugue_helpers.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_fugue_pandas.py b/tests/test_fugue/test_fugue_pandas.py index 3c99a54e..db2ff305 100644 --- a/tests/test_fugue/test_fugue_pandas.py +++ b/tests/test_fugue/test_fugue_pandas.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_fugue_polars.py b/tests/test_fugue/test_fugue_polars.py index bf58bcdf..0e77781f 100644 --- a/tests/test_fugue/test_fugue_polars.py +++ b/tests/test_fugue/test_fugue_polars.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_fugue_spark.py b/tests/test_fugue/test_fugue_spark.py index 8c1046ac..008ef691 100644 --- a/tests/test_fugue/test_fugue_spark.py +++ b/tests/test_fugue/test_fugue_spark.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_polars.py b/tests/test_polars.py index ff89db00..9e0cf27c 100644 --- a/tests/test_polars.py +++ b/tests/test_polars.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/test_snowflake.py b/tests/test_snowflake.py index 026c8075..af6029ff 100644 --- a/tests/test_snowflake.py +++ b/tests/test_snowflake.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "LICENSE"); # you may not use this file except in compliance with the License. diff --git a/tests/test_spark/test_legacy_spark.py b/tests/test_spark/test_legacy_spark.py index cc71d90b..5c24087c 100644 --- a/tests/test_spark/test_legacy_spark.py +++ b/tests/test_spark/test_legacy_spark.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_spark/test_pandas_spark.py b/tests/test_spark/test_pandas_spark.py index 433c5762..068b1332 100644 --- a/tests/test_spark/test_pandas_spark.py +++ b/tests/test_spark/test_pandas_spark.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
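
With the logging refactor in PATCH 3/3, datacompy modules obtain loggers via `logging.getLogger(__name__)` and no longer configure handlers themselves (the removed `datacompy/logger.py` used to call `logging.basicConfig` on import). One possible setup, sketched here for applications that want to see datacompy log output after this change; the format string is the brief format from the removed module:

    import logging

    import datacompy

    # datacompy no longer calls logging.basicConfig(), so the application
    # configures the root logger (or its own handlers) explicitly.
    logging.basicConfig(
        format="%(levelname)s %(asctime)s - %(name)s: %(message)s",
        level=logging.INFO,
    )

    # Optionally raise verbosity for datacompy's loggers only.
    logging.getLogger("datacompy").setLevel(logging.DEBUG)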