From afa683fbd5ea9e2a1a1074f3da6329471d76cf0e Mon Sep 17 00:00:00 2001 From: Ally Franken <108901375+allyfranken@users.noreply.github.com> Date: Wed, 22 Jan 2025 14:45:41 -0500 Subject: [PATCH 1/3] Addition of Hydra's Helper Functions (#371) * dummy commit to tests permissions and torchy * add helper functions and unit test cases * remove unnecessary helper functions * moving to helper * update naming convetion, add docstrings for functions * update unit tests to pass, update naming to use 'base' and 'compare' * update lint fixes * update for linting * reformat with ruff format * updating compare_by_row signature * updating compare_by_row signature * bump version --------- Co-authored-by: Faisal Dosani Co-authored-by: Faisal Dosani --- datacompy/__init__.py | 2 +- datacompy/spark/helper.py | 245 ++++++++++++++++++++++++ tests/test_spark/test_helper.py | 298 +++++++++++++++++++++++++++++ tests/test_spark/test_sql_spark.py | 2 +- 4 files changed, 545 insertions(+), 2 deletions(-) create mode 100644 datacompy/spark/helper.py create mode 100644 tests/test_spark/test_helper.py diff --git a/datacompy/__init__.py b/datacompy/__init__.py index 8f5651c6..d62c2454 100644 --- a/datacompy/__init__.py +++ b/datacompy/__init__.py @@ -18,7 +18,7 @@ Then extended to carry that functionality over to Spark Dataframes. """ -__version__ = "0.16.0" +__version__ = "0.16.1" import platform from warnings import warn diff --git a/datacompy/spark/helper.py b/datacompy/spark/helper.py new file mode 100644 index 00000000..72fcb3a3 --- /dev/null +++ b/datacompy/spark/helper.py @@ -0,0 +1,245 @@ +# +# Copyright 2025 Capital One Services, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Helper function module contributed by Capital One's Hydra Team. + +Helper functions to assist in specific usecases where there is no columns to join +and use the row order of the datasets. +""" + +from datacompy.logger import INFO, get_logger +from datacompy.spark.sql import SparkSQLCompare + +LOG = get_logger(__name__, INFO) + +try: + import pyspark.sql + from pyspark.sql import Window + from pyspark.sql import types as T + from pyspark.sql.functions import col, format_number, row_number +except ImportError: + LOG.warning( + "Please note that you are missing the optional dependency: spark. " + "If you need to use this functionality it must be installed." + ) + + +def compare_by_row( + spark_session: "pyspark.sql.SparkSession", + base_dataframe: "pyspark.sql.DataFrame", + compare_dataframe: "pyspark.sql.DataFrame", + string2double_cols: list[str] | None, + abs_tol: float = 0, + rel_tol: float = 0, + df1_name: str = "df1", + df2_name: str = "df2", + ignore_spaces: bool = False, + ignore_case: bool = False, + cast_column_names_lower: bool = True, +) -> SparkSQLCompare: + """Run a detailed analysis on specific usecases where there is no columns to join and use the row order of the datasets. 
+ + If you know which columns to join on then please use ``SparkSQLCompare`` directly as this is meant to help + support very specific helper usecases using row order contributed by Capital One's Hydra Team. + + Parameters + ---------- + spark_session : pyspark.sql.SparkSession + A ``SparkSession`` to be used to execute Spark commands in the comparison. + base_dataframe: pyspark.sql.DataFrame + Dataset to be compared against + compare_dataframe: pyspark.sql.DataFrame + dataset to compare + string2double_cols: List[str], optional + The columns that contain numeric values but are stored as string types + abs_tol : float, optional + Absolute tolerance between two values. + rel_tol : float, optional + Relative tolerance between two values. + df1_name : str, optional + A string name for the first dataframe. This allows the reporting to + print out an actual name instead of "df1", and allows human users to + more easily track the dataframes. + df2_name : str, optional + A string name for the second dataframe + ignore_spaces : bool, optional + Flag to strip whitespace (including newlines) from string columns (including any join + columns) + ignore_case : bool, optional + Flag to ignore the case of string columns + cast_column_names_lower: bool, optional + Boolean indicator that controls of column names will be cast into lower case + + Returns + ------- + datacompy.spark.sql.SparkSQLCompare + """ + # Convert fields that contain numeric values stored as strings to numeric types for comparison + if len(string2double_cols) != 0: + base_dataframe = handle_numeric_strings(base_dataframe, string2double_cols) + compare_dataframe = handle_numeric_strings( + compare_dataframe, string2double_cols + ) + + sorted_base_df, sorted_compare_df = sort_rows(base_dataframe, compare_dataframe) + column_to_join = ["row"] + + LOG.info("Compared by column(s): ", column_to_join) + if string2double_cols: + LOG.info( + "String column(s) cast to doubles for numeric comparison: ", + string2double_cols, + ) + return SparkSQLCompare( + spark_session=spark_session, + df1=sorted_base_df, + df2=sorted_compare_df, + join_columns=column_to_join, + abs_tol=abs_tol, + rel_tol=rel_tol, + df1_name=df1_name, + df2_name=df2_name, + ignore_spaces=ignore_spaces, + ignore_case=ignore_case, + cast_column_names_lower=cast_column_names_lower, + ) + + +def handle_numeric_strings( + df: "pyspark.sql.DataFrame", field_list: list[str] +) -> "pyspark.sql.DataFrame": + """Convert columns in field_list from numeric strings to DoubleType. + + Parameters + ---------- + df: pyspark.sql.DataFrame + The DataFrame to be converted + field_list: List[str] + List of StringType columns to be converted to DoubleType + + Returns + ------- + pyspark.sql.DataFrame + """ + for this_col in field_list: + df = df.withColumn(this_col, col(this_col).cast(T.DoubleType())) + return df + + +def sort_rows( + base_df: "pyspark.sql.DataFrame", compare_df: "pyspark.sql.DataFrame" +) -> "pyspark.sql.DataFrame": + """Add new column to each DataFrame that numbers the rows, so they can be compared by row number. 
+ + Parameters + ---------- + base_df: pyspark.sql.DataFrame + The base DataFrame to be sorted + compare_df: pyspark.sql.DataFrame + The compare DataFrame to be sorted + + Returns + ------- + pyspark.sql.DataFrame, pyspark.sql.DataFrame + + + """ + base_cols = base_df.columns + compare_cols = compare_df.columns + + # Ensure both DataFrames have the same columns + for x in base_cols: + if x not in compare_cols: + raise Exception( + f"{x} is present in base_df but does not exist in compare_df" + ) + + if set(base_cols) != set(compare_cols): + LOG.warning( + "WARNING: There are columns present in Compare df that do not exist in Base df. " + "The Base df columns will be used for row-wise sorting and may produce unanticipated " + "report output if the extra fields are not null." + ) + + w = Window.orderBy(*base_cols) + sorted_base_df = base_df.select("*", row_number().over(w).alias("row")) + sorted_compare_df = compare_df.select("*", row_number().over(w).alias("row")) + return sorted_base_df, sorted_compare_df + + +def sort_columns( + base_df: "pyspark.sql.DataFrame", compare_df: "pyspark.sql.DataFrame" +) -> "pyspark.sql.DataFrame": + """Sort both DataFrames by their columns to ensure consistent order. + + Parameters + ---------- + base_df: pyspark.sql.DataFrame + The base DataFrame to be sorted + compare_df: pyspark.sql.DataFrame + The compare DataFrame to be sorted + + Returns + ------- + pyspark.sql.DataFrame, pyspark.sql.DataFrame + """ + # Ensure both DataFrames have the same columns + common_columns = set(base_df.columns) + for x in common_columns: + if x not in compare_df.columns: + raise Exception( + f"{x} is present in base_df but does not exist in compare_df" + ) + # Sort both DataFrames to ensure consistent order + base_df = base_df.orderBy(*common_columns) + compare_df = compare_df.orderBy(*common_columns) + return base_df, compare_df + + +def format_numeric_fields(df: "pyspark.sql.DataFrame") -> "pyspark.sql.DataFrame": + """Round and truncate numeric fields to 5 decimal places. + + Parameters + ---------- + df: pyspark.sql.DataFrame + The DataFrame to be formatted + + Returns + ------- + pyspark.sql.DataFrame + """ + fixed_cols = [] + numeric_types = [ + "tinyint", + "smallint", + "int", + "bigint", + "float", + "double", + "decimal", + ] + + for c in df.dtypes: + # do not change non-numeric fields + if c[1] not in numeric_types: + fixed_cols.append(col(c[0])) + # round & truncate numeric fields + else: + new_val = format_number(col(c[0]), 5).alias(c[0]) + fixed_cols.append(new_val) + + formatted_df = df.select(*fixed_cols) + return formatted_df diff --git a/tests/test_spark/test_helper.py b/tests/test_spark/test_helper.py new file mode 100644 index 00000000..6eed4daa --- /dev/null +++ b/tests/test_spark/test_helper.py @@ -0,0 +1,298 @@ +# +# Copyright 2025 Capital One Services, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Testing out the spark helper functionality +""" + +import logging +import sys + +import pytest + +pytest.importorskip("pyspark") +if sys.version_info >= (3, 12): + pytest.skip("unsupported python version", allow_module_level=True) + + +from datacompy.spark.helper import ( + compare_by_row, + format_numeric_fields, + handle_numeric_strings, + sort_columns, + sort_rows, +) +from datacompy.spark.sql import SparkSQLCompare +from pyspark.sql.types import ( + IntegerType, + StringType, + StructField, + StructType, +) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + + +def test_detailed_compare_with_string2columns(spark_session): + # create mock data + mock_base_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + mock_base_columns = ["name", "age", "pet"] + + mock_compare_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + mock_compare_columns = ["name", "age", "pet"] + + # Create DataFrames + mock_base_df = spark_session.createDataFrame(mock_base_data, mock_base_columns) + mock_compare_df = spark_session.createDataFrame( + mock_compare_data, mock_compare_columns + ) + + # call detailed_compare + result_compared_data = compare_by_row( + spark_session=spark_session, + base_dataframe=mock_base_df, + compare_dataframe=mock_compare_df, + string2double_cols=["age"], + ) + + # assert result + assert isinstance(result_compared_data, SparkSQLCompare) + assert result_compared_data.matches() + + +def test_detailed_compare_with_column_to_join(spark_session): + # create mock data + mock_base_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + mock_base_columns = ["name", "age", "pet"] + + mock_compare_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + mock_compare_columns = ["name", "age", "pet"] + + # Create DataFrames + mock_base_df = spark_session.createDataFrame(mock_base_data, mock_base_columns) + mock_compare_df = spark_session.createDataFrame( + mock_compare_data, mock_compare_columns + ) + + # call detailed_compare + result_compared_data = compare_by_row( + spark_session=spark_session, + base_dataframe=mock_base_df, + compare_dataframe=mock_compare_df, + string2double_cols=[], + ) + + # assert result + assert result_compared_data.matches() + assert isinstance(result_compared_data, SparkSQLCompare) + + +def test_handle_numeric_strings(spark_session): + # create mock_df + mock_data = [("bob", "22", "dog"), ("alice", "19", "cat"), ("john", "70", "bunny")] + mock_columns = ["name", "age", "pet"] + mock_df = spark_session.createDataFrame(mock_data, mock_columns) + + # create mock field_list + mock_field_list = ["age"] + + # call handle_numeric_strings + result_df = handle_numeric_strings(mock_df, mock_field_list) + + # create expected dataframe + expected_data = [ + ("bob", 22.0, "dog"), + ("alice", 19.0, "cat"), + ("john", 70.0, "bunny"), + ] + expected_columns = ["name", "age", "pet"] + expected_df = spark_session.createDataFrame(expected_data, expected_columns) + + # assert calls + assert result_df.collect() == expected_df.collect() + + +def test_format_numeric_fields(spark_session): + # create mock dataframe + mock_data = [("bob", 22, "dog"), ("alice", 19, "cat"), ("john", 70, "bunny")] + mock_columns = ["name", "age", "pet"] + mock_df = spark_session.createDataFrame(mock_data, mock_columns) + + # call format_numeric_fields + formatted_df = format_numeric_fields(mock_df) + + # create expected dataframe + expected_data = 
[ + ("bob", "22.00000", "dog"), + ("alice", "19.00000", "cat"), + ("john", "70.00000", "bunny"), + ] + expected_columns = ["name", "age", "pet"] + expected_df = spark_session.createDataFrame(expected_data, expected_columns) + + # assert calls + assert formatted_df.collect() == expected_df.collect() + + +def test_sort_rows_failure(spark_session): + # create mock dataframes + input_base_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + columns_base = ["name", "age", "pet"] + + input_compare_data = [("19", "cat"), ("70", "bunny"), ("22", "dog")] + columns_commpare = ["age", "pet"] + + # Create DataFrames + input_base_df = spark_session.createDataFrame(input_base_data, columns_base) + input_compare_df = spark_session.createDataFrame( + input_compare_data, columns_commpare + ) + + # call call_rows + with pytest.raises( + Exception, match="name is present in base_df but does not exist in compare_df" + ): + sort_rows(input_base_df, input_compare_df) + + +def test_sort_rows_success(caplog, spark_session): + caplog.set_level(logging.WARNING) + + # create mock data + input_base_data = [ + ("bob", "22", "dog"), + ("alice", "19", "cat"), + ("john", "70", "bunny"), + ] + columns_base = ["name", "age", "pet"] + + input_compare_data = [ + ("19", "cat", "alice", "red"), + ("70", "bunny", "john", "black"), + ("22", "dog", "bob", "white"), + ] + columns_compare = ["age", "pet", "name", "color"] + + # create dataFrames + input_base_df = spark_session.createDataFrame(input_base_data, columns_base) + input_compare_df = spark_session.createDataFrame( + input_compare_data, columns_compare + ) + + # call sort_rows + sorted_base_df, sorted_compare_df = sort_rows(input_base_df, input_compare_df) + + # create expected base_dataframe + expected_base_data = [ + ("alice", "19", "cat", 1), + ("bob", "22", "dog", 2), + ("john", "70", "bunny", 3), + ] + expected_base_schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", StringType(), True), + StructField("pet", StringType(), True), + StructField("row", IntegerType(), True), + ] + ) + expected_base_df = spark_session.createDataFrame( + expected_base_data, expected_base_schema + ) + + # create expected compare_dataframe + expected_compare_data = [ + ("19", "cat", "alice", "red", 1), + ("22", "dog", "bob", "white", 2), + ("70", "bunny", "john", "black", 3), + ] + expected_compare_schema = StructType( + [ + StructField("age", StringType(), True), + StructField("pet", StringType(), True), + StructField("name", StringType(), True), + StructField("color", StringType(), True), + StructField("row", IntegerType(), True), + ] + ) + expected_compare_df = spark_session.createDataFrame( + expected_compare_data, expected_compare_schema + ) + + # assertions + assert sorted_base_df.collect() == expected_base_df.collect() + assert sorted_compare_df.collect() == expected_compare_df.collect() + assert ( + "WARNING: There are columns present in Compare df that do not exist in Base df. 
The Base df columns will be used for row-wise sorting and may produce unanticipated report output if the extra fields are not null.\n" + in caplog.text + ) + + +def test_sort_columns_failure(spark_session): + # create mock dataframes + input_base_data = [ + ("row1", "col2", "col3"), + ("row2", "col2", "col3"), + ("row3", "col2", "col3"), + ] + columns_1 = ["col1", "col2", "col3"] + + input_compare_data = [("row1", "col2"), ("row2", "col2"), ("row3", "col2")] + columns_2 = ["col1", "col2"] + + # Create DataFrames + input_base_df = spark_session.createDataFrame(input_base_data, columns_1) + input_compare_df = spark_session.createDataFrame(input_compare_data, columns_2) + + # call sort_columns + with pytest.raises( + Exception, match="col3 is present in base_df but does not exist in compare_df" + ): + sort_columns(input_base_df, input_compare_df) + + +def test_sort_columns_success(spark_session): + # Create sample DataFrames + data1 = [(1, "a"), (2, "b"), (3, "c")] + data2 = [(1, "a"), (2, "b"), (3, "c")] + columns = ["id", "value"] + + df1 = spark_session.createDataFrame(data1, columns) + df2 = spark_session.createDataFrame(data2, columns) + + # Test with matching columns + sorted_df1, sorted_df2 = sort_columns(df1, df2) + assert sorted_df1.columns == sorted_df2.columns + assert sorted_df1.collect() == sorted_df2.collect() diff --git a/tests/test_spark/test_sql_spark.py b/tests/test_spark/test_sql_spark.py index e93fbafc..5b2686f6 100644 --- a/tests/test_spark/test_sql_spark.py +++ b/tests/test_spark/test_sql_spark.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
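For reviewers, a minimal sketch of how the helper added in this patch is intended to be called. This is illustrative only and not part of the patch: it assumes an existing SparkSession named `spark`, uses made-up column and row values, and relies on `matches()` and `report()` from the returned `SparkSQLCompare` object.

    from datacompy.spark.helper import compare_by_row

    base_df = spark.createDataFrame(
        [("bob", "22", "dog"), ("alice", "19", "cat")], ["name", "age", "pet"]
    )
    compare_df = spark.createDataFrame(
        [("alice", "19", "cat"), ("bob", "22", "dog")], ["name", "age", "pet"]
    )

    # There is no reliable join key, so rows are matched by sort order.
    # "age" holds numeric values stored as strings, so it is cast to double
    # before comparison via string2double_cols.
    comparison = compare_by_row(
        spark_session=spark,
        base_dataframe=base_df,
        compare_dataframe=compare_df,
        string2double_cols=["age"],
    )
    print(comparison.matches())
    print(comparison.report())
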
From 2ae7d431bcbee876dc8a4750d4431d0eaa527348 Mon Sep 17 00:00:00 2001 From: Faisal Date: Wed, 5 Feb 2025 13:05:19 -0500 Subject: [PATCH 2/3] adding check for pyarrow strings in columns_equal (#375) * adding check for pyarrow strings in columns_equal * Update datacompy/core.py Co-authored-by: Gladys Teh <97971054+gladysteh99@users.noreply.github.com> * Update datacompy/core.py Co-authored-by: Gladys Teh <97971054+gladysteh99@users.noreply.github.com> --------- Co-authored-by: Gladys Teh <97971054+gladysteh99@users.noreply.github.com> --- datacompy/core.py | 18 ++++++++++++++---- tests/test_core.py | 23 +++++++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/datacompy/core.py b/datacompy/core.py index b0d10d77..64c5ebf6 100644 --- a/datacompy/core.py +++ b/datacompy/core.py @@ -380,8 +380,12 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None: "match_column": col_match, "match_cnt": match_cnt, "unequal_cnt": row_cnt - match_cnt, - "dtype1": str(self.df1[column].dtype), - "dtype2": str(self.df2[column].dtype), + "dtype1": str(self.df1[column].dtype.__repr__()) + if str(self.df1[column].dtype) == "string" + else str(self.df1[column].dtype), + "dtype2": str(self.df2[column].dtype.__repr__()) + if str(self.df2[column].dtype) == "string" + else str(self.df2[column].dtype), "all_match": all( ( self.df1[column].dtype == self.df2[column].dtype, @@ -847,8 +851,14 @@ def columns_equal( | (col_1.isnull() & col_2.isnull()) ) except Exception: - # Blanket exception should just return all False - compare = pd.Series(False, index=col_1.index) + # Check for string[pyarrow] and string[python] + if col_1.dtype in ( + "string[python]", + "string[pyarrow]", + ) and col_2.dtype in ("string[python]", "string[pyarrow]"): + compare = pd.Series(col_1.astype(str) == col_2.astype(str)) + else: # Blanket exception should just return all False + compare = pd.Series(False, index=col_1.index) compare.index = col_1.index return compare diff --git a/tests/test_core.py b/tests/test_core.py index b0c3647f..c14918e9 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -62,6 +62,29 @@ def test_numeric_columns_equal_rel(): assert_series_equal(expect_out, actual_out, check_names=False) +def test_string_pyarrow_columns_equal(): + data = """a|b|expected +Hi|Hi|True +Yo|Yo|True +Hey|Hey |False +résumé|resume|False +résumé|résumé|True +💩|💩|True +💩|🤔|False + | |True + | |False +datacompy|DataComPy|False +something||False +|something|False +||True""" + df = pd.read_csv(io.StringIO(data), sep="|") + actual_out = datacompy.columns_equal( + df.a.astype("string[python]"), df.b.astype("string[pyarrow]"), rel_tol=0.2 + ) + expect_out = df["expected"] + assert (actual_out == expect_out).all() + + def test_string_columns_equal(): data = """a|b|expected Hi|Hi|True From e80fb12aafe0f93d0401fbf90c65db2daeef45a3 Mon Sep 17 00:00:00 2001 From: Faisal Date: Wed, 5 Feb 2025 14:17:23 -0500 Subject: [PATCH 3/3] refactor logging (#376) * refactor logging * Update CODEOWNERS --- CODEOWNERS | 2 +- datacompy/__init__.py | 2 +- datacompy/base.py | 7 ++- datacompy/core.py | 6 +-- datacompy/fugue.py | 6 +-- datacompy/logger.py | 61 -------------------------- datacompy/polars.py | 6 +-- datacompy/snowflake.py | 6 +-- datacompy/spark/helper.py | 5 ++- datacompy/spark/legacy.py | 2 +- datacompy/spark/pandas.py | 2 +- datacompy/spark/sql.py | 6 +-- tests/test_core.py | 2 +- tests/test_fugue/test_duckdb.py | 2 +- tests/test_fugue/test_fugue_helpers.py | 2 +- tests/test_fugue/test_fugue_pandas.py | 
2 +- tests/test_fugue/test_fugue_polars.py | 2 +- tests/test_fugue/test_fugue_spark.py | 2 +- tests/test_polars.py | 2 +- tests/test_snowflake.py | 2 +- tests/test_spark/test_legacy_spark.py | 2 +- tests/test_spark/test_pandas_spark.py | 2 +- 22 files changed, 35 insertions(+), 96 deletions(-) delete mode 100644 datacompy/logger.py diff --git a/CODEOWNERS b/CODEOWNERS index a61d6a58..373f7d06 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @fdosani @ak-gupta @jdawang @gladysteh99 +* @fdosani @ak-gupta @jdawang @gladysteh99 @rhaffar diff --git a/datacompy/__init__.py b/datacompy/__init__.py index d62c2454..ea04402b 100644 --- a/datacompy/__init__.py +++ b/datacompy/__init__.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/datacompy/base.py b/datacompy/base.py index 27425da6..bf0305bf 100644 --- a/datacompy/base.py +++ b/datacompy/base.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,14 +21,13 @@ two dataframes. """ +import logging from abc import ABC, abstractmethod from typing import Any from ordered_set import OrderedSet -from datacompy.logger import INFO, get_logger - -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) class BaseCompare(ABC): diff --git a/datacompy/core.py b/datacompy/core.py index 64c5ebf6..3313b236 100644 --- a/datacompy/core.py +++ b/datacompy/core.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ two dataframes. """ +import logging import os from typing import Any, Dict, List, cast @@ -29,9 +30,8 @@ from ordered_set import OrderedSet from datacompy.base import BaseCompare, temp_column_name -from datacompy.logger import INFO, get_logger -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) class Compare(BaseCompare): diff --git a/datacompy/fugue.py b/datacompy/fugue.py index 57d097a9..5cbcc200 100644 --- a/datacompy/fugue.py +++ b/datacompy/fugue.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ """Compare two DataFrames that are supported by Fugue.""" +import logging import pickle from collections import defaultdict from typing import Any, Callable, Dict, Iterable, List, Tuple, cast @@ -23,9 +24,8 @@ from ordered_set import OrderedSet from datacompy.core import Compare, render -from datacompy.logger import INFO, get_logger -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) HASH_COL = "__datacompy__hash__" diff --git a/datacompy/logger.py b/datacompy/logger.py deleted file mode 100644 index 2efcb47e..00000000 --- a/datacompy/logger.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-Copyright: Copyright (c) Capital One Services, LLC -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2024 Capital One Services, LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Logging Module. - -Module which sets up the basic logging infrustrcuture for the application. -""" - -import logging -import sys - -# logger formating -BRIEF_FORMAT = "%(levelname)s %(asctime)s - %(name)s: %(message)s" -VERBOSE_FORMAT = ( - "%(levelname)s|%(asctime)s|%(name)s|%(filename)s|" - "%(funcName)s|%(lineno)d: %(message)s" -) -FORMAT_TO_USE = VERBOSE_FORMAT - -# logger levels -DEBUG = logging.DEBUG -INFO = logging.INFO -WARN = logging.WARNING -ERROR = logging.ERROR -CRITICAL = logging.CRITICAL - - -def get_logger(name=None, log_level=logging.DEBUG): - """Set the basic logging features for the application. - - Parameters - ---------- - name : str, optional - The name of the logger. Defaults to ``None`` - log_level : int, optional - The logging level. Defaults to ``logging.INFO`` - - Returns - ------- - logging.Logger - Returns a Logger obejct which is set with the passed in paramters. - Please see the following for more details: - https://docs.python.org/2/library/logging.html - """ - logging.basicConfig(format=FORMAT_TO_USE, stream=sys.stdout, level=log_level) - logging.captureWarnings(True) - logger = logging.getLogger(name) - return logger diff --git a/datacompy/polars.py b/datacompy/polars.py index 1197ae20..405901eb 100644 --- a/datacompy/polars.py +++ b/datacompy/polars.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ two dataframes. 
""" +import logging import os from copy import deepcopy from typing import Any, Dict, List, cast @@ -31,9 +32,8 @@ from polars.exceptions import ComputeError, InvalidOperationError from datacompy.base import BaseCompare, temp_column_name -from datacompy.logger import INFO, get_logger -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) STRING_TYPE = ["String", "Utf8"] DATE_TYPE = ["Date", "Datetime"] diff --git a/datacompy/snowflake.py b/datacompy/snowflake.py index 1cd3247a..a12e1325 100644 --- a/datacompy/snowflake.py +++ b/datacompy/snowflake.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ two dataframes. """ +import logging import os from concurrent.futures import ThreadPoolExecutor, as_completed from copy import deepcopy @@ -30,10 +31,9 @@ from ordered_set import OrderedSet from datacompy.base import BaseCompare -from datacompy.logger import INFO, get_logger from datacompy.spark.sql import decimal_comparator -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) try: import snowflake.snowpark as sp diff --git a/datacompy/spark/helper.py b/datacompy/spark/helper.py index 72fcb3a3..eb090220 100644 --- a/datacompy/spark/helper.py +++ b/datacompy/spark/helper.py @@ -20,10 +20,11 @@ and use the row order of the datasets. """ -from datacompy.logger import INFO, get_logger +import logging + from datacompy.spark.sql import SparkSQLCompare -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) try: import pyspark.sql diff --git a/datacompy/spark/legacy.py b/datacompy/spark/legacy.py index c32db3b7..6bddc956 100644 --- a/datacompy/spark/legacy.py +++ b/datacompy/spark/legacy.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/datacompy/spark/pandas.py b/datacompy/spark/pandas.py index 0886dd18..be6ef08a 100644 --- a/datacompy/spark/pandas.py +++ b/datacompy/spark/pandas.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/datacompy/spark/sql.py b/datacompy/spark/sql.py index 63e8cb26..f0574a1a 100644 --- a/datacompy/spark/sql.py +++ b/datacompy/spark/sql.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ two dataframes. 
""" +import logging import os from copy import deepcopy from typing import List, Tuple @@ -29,9 +30,8 @@ from ordered_set import OrderedSet from datacompy.base import BaseCompare, temp_column_name -from datacompy.logger import INFO, get_logger -LOG = get_logger(__name__, INFO) +LOG = logging.getLogger(__name__) try: import pyspark.sql diff --git a/tests/test_core.py b/tests/test_core.py index c14918e9..2f164b41 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_duckdb.py b/tests/test_fugue/test_duckdb.py index 77b05883..8d9ee41a 100644 --- a/tests/test_fugue/test_duckdb.py +++ b/tests/test_fugue/test_duckdb.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_fugue_helpers.py b/tests/test_fugue/test_fugue_helpers.py index cf51690f..784cfd04 100644 --- a/tests/test_fugue/test_fugue_helpers.py +++ b/tests/test_fugue/test_fugue_helpers.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_fugue_pandas.py b/tests/test_fugue/test_fugue_pandas.py index 3c99a54e..db2ff305 100644 --- a/tests/test_fugue/test_fugue_pandas.py +++ b/tests/test_fugue/test_fugue_pandas.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_fugue_polars.py b/tests/test_fugue/test_fugue_polars.py index bf58bcdf..0e77781f 100644 --- a/tests/test_fugue/test_fugue_polars.py +++ b/tests/test_fugue/test_fugue_polars.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_fugue/test_fugue_spark.py b/tests/test_fugue/test_fugue_spark.py index 8c1046ac..008ef691 100644 --- a/tests/test_fugue/test_fugue_spark.py +++ b/tests/test_fugue/test_fugue_spark.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_polars.py b/tests/test_polars.py index ff89db00..9e0cf27c 100644 --- a/tests/test_polars.py +++ b/tests/test_polars.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/test_snowflake.py b/tests/test_snowflake.py index 026c8075..af6029ff 100644 --- a/tests/test_snowflake.py +++ b/tests/test_snowflake.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "LICENSE"); # you may not use this file except in compliance with the License. diff --git a/tests/test_spark/test_legacy_spark.py b/tests/test_spark/test_legacy_spark.py index cc71d90b..5c24087c 100644 --- a/tests/test_spark/test_legacy_spark.py +++ b/tests/test_spark/test_legacy_spark.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_spark/test_pandas_spark.py b/tests/test_spark/test_pandas_spark.py index 433c5762..068b1332 100644 --- a/tests/test_spark/test_pandas_spark.py +++ b/tests/test_spark/test_pandas_spark.py @@ -1,5 +1,5 @@ # -# Copyright 2024 Capital One Services, LLC +# Copyright 2025 Capital One Services, LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
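
With the logging refactor in PATCH 3/3, datacompy modules obtain loggers via `logging.getLogger(__name__)` and no longer configure handlers themselves (the removed `datacompy/logger.py` used to call `logging.basicConfig` on import). One possible setup, sketched here for applications that want to see datacompy log output after this change; the format string is the brief format from the removed module:

    import logging

    import datacompy

    # datacompy no longer calls logging.basicConfig(), so the application
    # configures the root logger (or its own handlers) explicitly.
    logging.basicConfig(
        format="%(levelname)s %(asctime)s - %(name)s: %(message)s",
        level=logging.INFO,
    )

    # Optionally raise verbosity for datacompy's loggers only.
    logging.getLogger("datacompy").setLevel(logging.DEBUG)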