diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 2729a9bb4..6ec552955 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -53,7 +53,7 @@ For more nuanced testing runs, check out more detailed documentation [here](http ## Creating [Pull Requests](https://github.com/capitalone/DataProfiler/pulls) Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests: -1. Fork the repo and create your branch from `main`. +1. Fork the repo and create your branch from `dev`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. diff --git a/.gitignore b/.gitignore index db3bb0cc2..0a12bc7be 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,7 @@ dataprofiler/labelers/embeddings/glove-reduced-64D.txt .cache/ .idea/ -.vscode +.vscode* *.pyc *.pkl *.whl @@ -134,3 +134,6 @@ venv.bak/ env3/ *.bak + +#Pipfiles +Pipfile* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ce67fff77..203e62b1f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -64,6 +64,7 @@ repos: typing-extensions>=3.10.0.2, HLL>=2.0.3, datasketches>=4.1.0, + boto3>=1.28.61, # requirements-dev.txt check-manifest>=0.48, @@ -110,7 +111,7 @@ repos: additional_dependencies: ['h5py', 'wheel', 'future', 'numpy', 'pandas', 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests', - 'networkx','typing-extensions', 'HLL', 'datasketches'] + 'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3'] # Pyupgrade - standardize and modernize Python syntax for newer versions of the language - repo: https://github.com/asottile/pyupgrade rev: v3.3.0 diff --git a/dataprofiler/__init__.py b/dataprofiler/__init__.py index 46c7196cc..2e89d3e2b 100644 --- a/dataprofiler/__init__.py +++ b/dataprofiler/__init__.py @@ -8,6 +8,7 @@ UnstructuredDataLabeler, train_structured_labeler, ) +from .plugins import load_plugins from .profilers.graph_profiler import GraphProfiler from .profilers.profile_builder import ( Profiler, @@ -41,3 +42,6 @@ def set_seed(seed=None): if seed is not None and (not isinstance(seed, int) or seed < 0): raise ValueError("Seed should be a non-negative integer.") settings._seed = seed + + +load_plugins() diff --git a/dataprofiler/data_readers/data.py b/dataprofiler/data_readers/data.py index 8c0f07507..8a3e6d94a 100644 --- a/dataprofiler/data_readers/data.py +++ b/dataprofiler/data_readers/data.py @@ -6,7 +6,7 @@ from .. import dp_logging from .avro_data import AVROData from .csv_data import CSVData -from .data_utils import is_valid_url, url_to_bytes +from .data_utils import S3Helper, is_valid_url, url_to_bytes from .graph_data import GraphData from .json_data import JSONData from .parquet_data import ParquetData @@ -65,7 +65,14 @@ def __new__( options = dict() if is_valid_url(input_file_path): - input_file_path = url_to_bytes(input_file_path, options) + if S3Helper.is_s3_uri(input_file_path, logger=logger): + storage_options = options.pop("storage_options", {}) + s3 = S3Helper.create_s3_client(**storage_options) + input_file_path = S3Helper.get_s3_uri( + s3_uri=input_file_path, s3_client=s3 + ) + else: + input_file_path = url_to_bytes(input_file_path, options) for data_class_info in cls.data_classes: data_class = data_class_info["data_class"] diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 58ea61179..d0cc72115 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,5 +1,7 @@ """Contains functions for data readers.""" import json +import logging +import os import re import urllib from collections import OrderedDict @@ -19,6 +21,8 @@ cast, ) +import boto3 +import botocore import dateutil import pandas as pd import pyarrow.parquet as pq @@ -843,3 +847,125 @@ def url_to_bytes(url_as_string: Url, options: Dict) -> BytesIO: stream.seek(0) return stream + + +class S3Helper: + """ + A utility class for working with Amazon S3. + + This class provides methods to check if a path is an S3 URI + and to create an S3 client. + """ + + @staticmethod + def is_s3_uri(path: str, logger: logging.Logger) -> bool: + """ + Check if the given path is an S3 URI. + + This function checks for common S3 URI prefixes "s3://" and "s3a://". + + Args: + path (str): The path to check for an S3 URI. + logger (logging.Logger): The logger instance for logging. + + Returns: + bool: True if the path is an S3 URI, False otherwise. + """ + # Define the S3 URI prefixes to check + s3_uri_prefixes = ["s3://", "s3a://"] + path = path.strip() + # Check if the path starts with any of the specified prefixes + is_s3 = any(path.startswith(prefix) for prefix in s3_uri_prefixes) + if not is_s3: + logger.debug(f"'{path}' is not a valid S3 URI") + + return is_s3 + + @staticmethod + def _create_boto3_client( + aws_access_key_id: Optional[str], + aws_secret_access_key: Optional[str], + aws_session_token: Optional[str], + region_name: Optional[str], + ) -> boto3.client: + return boto3.client( + "s3", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_session_token=aws_session_token, + region_name=region_name, + ) + + @staticmethod + def create_s3_client( + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_session_token: Optional[str] = None, + region_name: Optional[str] = None, + ) -> boto3.client: + """ + Create and return an S3 client. + + Args: + aws_access_key_id (str): The AWS access key ID. + aws_secret_access_key (str): The AWS secret access key. + aws_session_token (str): The AWS session token + (optional, typically used for temporary credentials). + region_name (str): The AWS region name (default is 'us-east-1'). + + Returns: + boto3.client: A S3 client instance. + """ + # Check if credentials are not provided + # and use environment variables as fallback + if aws_access_key_id is None: + aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") + if aws_secret_access_key is None: + aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") + if aws_session_token is None: + aws_session_token = os.environ.get("AWS_SESSION_TOKEN") + + # Check if region is not provided and use environment variable as fallback + if region_name is None: + region_name = os.environ.get("AWS_REGION", "us-east-1") + + # Check if IAM roles for service accounts are available + try: + s3 = S3Helper._create_boto3_client( + aws_access_key_id, aws_secret_access_key, aws_session_token, region_name + ) + except botocore.exceptions.NoCredentialsError: + # IAM roles are not available, so fall back to provided credentials + if aws_access_key_id is None or aws_secret_access_key is None: + raise ValueError( + "AWS access key ID and secret access key are required." + ) + s3 = S3Helper._create_boto3_client( + aws_access_key_id, aws_secret_access_key, aws_session_token, region_name + ) + + return s3 + + @staticmethod + def get_s3_uri(s3_uri: str, s3_client: boto3.client) -> BytesIO: + """ + Download an object from an S3 URI and return its content as BytesIO. + + Args: + s3_uri (str): The S3 URI specifying the location of the object to download. + s3_client (boto3.client): An initialized AWS S3 client + for accessing the S3 service. + + Returns: + BytesIO: A BytesIO object containing the content of + the downloaded S3 object. + """ + # Parse the S3 URI + parsed_uri = urllib.parse.urlsplit(s3_uri) + bucket_name = parsed_uri.netloc + file_key = parsed_uri.path.lstrip("/") + # Download the S3 object + response = s3_client.get_object(Bucket=bucket_name, Key=file_key) + + # Return the object's content as BytesIO + return BytesIO(response["Body"].read()) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 2213fd72d..d53980a35 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -2047,9 +2047,9 @@ def process( elif aggregation_func == "random": num_labels = max(label_mapping.values()) + 1 random_state: random.Random = self._parameters["random_state"] - priority_order = np.array(list(range(num_labels))) - random_state.shuffle(priority_order) # type: ignore - self.priority_prediction(results, priority_order) + priority_order = list(range(num_labels)) + random_state.shuffle(priority_order) + self.priority_prediction(results, np.array(priority_order)) else: raise ValueError( f"`{aggregation_func}` is not a valid aggregation function" diff --git a/dataprofiler/plugins/__init__.py b/dataprofiler/plugins/__init__.py new file mode 100644 index 000000000..fbe52182e --- /dev/null +++ b/dataprofiler/plugins/__init__.py @@ -0,0 +1,38 @@ +import importlib +import os + +from .decorators import plugin_decorator, plugins_dict + + +def load_plugins(): + """ + Digs through plugins folder for possible plugins to be imported + and consequently added to the plugins_dict if properly decorated + + :return: None + """ + plugin_path = os.path.dirname(os.path.abspath(__file__)) + for folder in os.listdir(plugin_path): + option_path = os.path.join(plugin_path, folder) + if os.path.isdir(option_path): + if folder == "__pycache__": + continue + for filename in os.listdir(option_path): + if filename is None or not filename.endswith(".py"): + continue + spec = importlib.util.spec_from_file_location( + filename, os.path.join(option_path, filename) + ) + if spec is not None: + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + +def get_plugins(typ): + """ + Fetches a dictionary of plugins of a certain type + + :param typ: Broader classification/type of a plugin + :return: dict + """ + return plugins_dict.get(typ) diff --git a/dataprofiler/plugins/decorators.py b/dataprofiler/plugins/decorators.py new file mode 100644 index 000000000..c781f4300 --- /dev/null +++ b/dataprofiler/plugins/decorators.py @@ -0,0 +1,28 @@ +"""Contains function for generating plugins data.""" +from collections import defaultdict +from typing import Any, DefaultDict, Dict + +plugins_dict: DefaultDict[str, Dict[str, Any]] = defaultdict(dict) + + +def plugin_decorator(typ, name): + """ + Populate plugins_dict with decorated plugin functions. + + :param typ: Broader classification/type of a plugin + :param name: Specific name of a plugin + :return: function + """ + + def __inner_factory_function(fn): + """ + Actual population of plugin_dict. + + :param fn: Plugin function + :return: function + """ + global plugins_dict + plugins_dict[typ][name] = fn + return fn + + return __inner_factory_function diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 2b35c8792..6086b575c 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -611,8 +611,8 @@ def _perform_t_test( ) -> dict: results: dict = { "t-statistic": None, - "conservative": {"df": None, "p-value": None}, - "welch": {"df": None, "p-value": None}, + "conservative": {"deg_of_free": None, "p-value": None}, + "welch": {"deg_of_free": None, "p-value": None}, } invalid_stats = False @@ -647,17 +647,17 @@ def _perform_t_test( s_delta = var1 / n1 + var2 / n2 t = (mean1 - mean2) / np.sqrt(s_delta) - conservative_df = min(n1, n2) - 1 - welch_df = s_delta**2 / ( + conservative_deg_of_free = min(n1, n2) - 1 + welch_deg_of_free = s_delta**2 / ( (var1 / n1) ** 2 / (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1) ) results["t-statistic"] = t - results["conservative"]["df"] = float(conservative_df) - results["welch"]["df"] = float(welch_df) + results["conservative"]["deg_of_free"] = float(conservative_deg_of_free) + results["welch"]["deg_of_free"] = float(welch_deg_of_free) - conservative_t = scipy.stats.t(conservative_df) + conservative_t = scipy.stats.t(conservative_deg_of_free) conservative_p_val = (1 - conservative_t.cdf(abs(t))) * 2 - welch_t = scipy.stats.t(welch_df) + welch_t = scipy.stats.t(welch_deg_of_free) welch_p_val = (1 - welch_t.cdf(abs(t))) * 2 results["conservative"]["p-value"] = float(conservative_p_val) diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index 113d19ef2..6e512658f 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -94,6 +94,7 @@ def __init__( self.sample_size: int = 0 self.sample: list[str] = list() self.null_count: int = 0 + self.null_ratio: float | None = None self.null_types: list[str] = list() self.null_types_index: dict = {} self._min_id: int | None = None @@ -292,6 +293,9 @@ def diff(self, other_profile: StructuredColProfiler, options: dict = None) -> di "null_count": profiler_utils.find_diff_of_numbers( self.null_count, other_profile.null_count ), + "null_ratio": profiler_utils.find_diff_of_numbers( + self.null_ratio, other_profile.null_ratio + ), "null_types": profiler_utils.find_diff_of_lists_and_sets( self.null_types, other_profile.null_types ), @@ -428,6 +432,7 @@ def _update_base_stats(self, base_stats: dict) -> None: self._last_batch_size = base_stats["sample_size"] self.sample = base_stats["sample"] self.null_count += base_stats["null_count"] + self.null_ratio = base_stats["null_count"] / base_stats["sample_size"] self.null_types = profiler_utils._combine_unique_sets( self.null_types, list(base_stats["null_types"].keys()) ) @@ -570,6 +575,7 @@ def clean_data_and_get_base_stats( { "sample_size": 0, "null_count": 0, + "null_ratio": None, "null_types": dict(), "sample": [], "min_id": None, @@ -658,6 +664,7 @@ def clean_data_and_get_base_stats( base_stats = { "sample_size": total_sample_size, "null_count": total_na, + "null_ratio": total_na / total_sample_size, "null_types": na_columns, "sample": rng.choice( list(df_series.values), (min(len(df_series), 5),), replace=False diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index e3d10696b..038acf806 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -9,6 +9,7 @@ from typing import Any, Generic, TypeVar, cast from ..labelers.base_data_labeler import BaseDataLabeler +from ..plugins.__init__ import get_plugins from . import profiler_utils from .json_decoder import load_option @@ -1699,6 +1700,7 @@ def __init__(self, presets: str = None) -> None: self.structured_options = StructuredOptions() self.unstructured_options = UnstructuredOptions() self.presets = presets + option_plugins = get_plugins("option_preset") if self.presets: if self.presets == "complete": self._complete_presets() @@ -1708,6 +1710,8 @@ def __init__(self, presets: str = None) -> None: self._numeric_stats_disabled_presets() elif self.presets == "lower_memory_sketching": self._lower_memory_sketching_presets() + elif option_plugins is not None and self.presets in option_plugins: + option_plugins[self.presets](self) else: raise ValueError("The preset entered is not a valid preset.") diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index a3ed375b4..e38e1b041 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -739,7 +739,7 @@ def perform_chi_squared_test_for_homogeneity( """ results: dict[str, int | float | None] = { "chi2-statistic": None, - "df": None, + "deg_of_free": None, "p-value": None, } @@ -758,8 +758,8 @@ def perform_chi_squared_test_for_homogeneity( # Calculate degrees of freedom # df = (rows - 1) * (cols - 1), in the case of two groups reduces to cols - 1 - df = num_cats - 1 - results["df"] = df + deg_of_free = num_cats - 1 + results["deg_of_free"] = deg_of_free total = sample_size1 + sample_size2 @@ -781,7 +781,7 @@ def perform_chi_squared_test_for_homogeneity( results["chi2-statistic"] = chi2_statistic # Calculate p-value, i.e. P(X > chi2_statistic) - p_value: float = 1 - scipy.stats.chi2(df).cdf(chi2_statistic) + p_value: float = 1 - scipy.stats.chi2(deg_of_free).cdf(chi2_statistic) results["p-value"] = p_value return results diff --git a/dataprofiler/tests/data_readers/test_csv_data.py b/dataprofiler/tests/data_readers/test_csv_data.py index ef20361be..f2c0ca120 100644 --- a/dataprofiler/tests/data_readers/test_csv_data.py +++ b/dataprofiler/tests/data_readers/test_csv_data.py @@ -498,6 +498,19 @@ def test_specifying_data_type(self): input_data_obj.delimiter, input_file["delimiter"], input_file["path"] ) + def test_specifying_data_type_when_sampled(self): + """ + Determine if the csv file can be loaded with manual data_type setting + """ + for input_file in self.file_or_buf_list: + input_data_obj = Data( + input_file["path"], data_type="csv", options={"sample_nrows": 100} + ) + self.assertEqual(input_data_obj.data_type, "csv", input_file["path"]) + self.assertEqual( + input_data_obj.delimiter, input_file["delimiter"], input_file["path"] + ) + def test_data_formats(self): """ Test the data format options. @@ -518,6 +531,26 @@ def test_data_formats(self): + "['dataframe', 'records']", ) + def test_data_formats_when_sampled(self): + """ + Test the data format options. + """ + for input_file in self.file_or_buf_list: + input_data_obj = Data(input_file["path"], options={"sample_nrows": 100}) + self.assertEqual(input_data_obj.data_type, "csv") + self.assertIsInstance(input_data_obj.data, pd.DataFrame) + + input_data_obj.data_format = "records" + self.assertIsInstance(input_data_obj.data, list) + + with self.assertRaises(ValueError) as exc: + input_data_obj.data_format = "NON_EXISTENT" + self.assertEqual( + str(exc.exception), + "The data format must be one of the following: " + + "['dataframe', 'records']", + ) + def test_reload_data(self): """ Determine if the csv file can be reloaded @@ -547,6 +580,24 @@ def test_allowed_data_formats(self): self.assertIsInstance(data, list) self.assertIsInstance(data[0], str) + def test_allowed_data_formats_when_sampled(self): + """ + Determine if the csv file data_formats can be used + """ + for input_file in self.file_or_buf_list: + input_data_obj = Data(input_file["path"], options={"sample_nrows": 100}) + for data_format in list(input_data_obj._data_formats.keys()): + input_data_obj.data_format = data_format + self.assertEqual( + input_data_obj.data_format, data_format, msg=input_file["path"] + ) + data = input_data_obj.data + if data_format == "dataframe": + self.assertIsInstance(data, pd.DataFrame, msg=input_file["path"]) + elif data_format in ["records", "json"]: + self.assertIsInstance(data, list) + self.assertIsInstance(data[0], str) + def test_set_header(self): test_dir = os.path.join(test_root_path, "data") filename = "csv/sparse-first-and-last-column-two-headers.txt" diff --git a/dataprofiler/tests/data_readers/test_data.py b/dataprofiler/tests/data_readers/test_data.py index edbada028..159a8c160 100644 --- a/dataprofiler/tests/data_readers/test_data.py +++ b/dataprofiler/tests/data_readers/test_data.py @@ -1,11 +1,13 @@ import os import unittest from unittest import mock +from unittest.mock import MagicMock, patch import pandas as pd import requests from dataprofiler.data_readers.data import Data +from dataprofiler.data_readers.text_data import TextData test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) @@ -160,6 +162,35 @@ def test_read_url_verify_ssl(self, mock_request_get): ): data_obj = Data("https://test.com") + @patch("boto3.client") + @patch( + "os.environ", + { + "AWS_ACCESS_KEY_ID": "", + "AWS_SECRET_ACCESS_KEY": "", + }, + ) + def test_read_s3_uri(self, mock_boto3_client): + region_name = "us-east-1" + + # Create a custom mock response + custom_response = {"Body": MagicMock()} + custom_response["Body"].read.return_value = b"Test S3 content" + + # Mock the behavior of the S3 client to return the custom response + mock_boto3_client.return_value.get_object.return_value = custom_response + data_obj = Data("s3a://my-bucket/my_file.txt") + + self.assertEqual(type(data_obj), TextData) + + mock_boto3_client.assert_called_with( + "s3", + aws_access_key_id="", + aws_secret_access_key="", + aws_session_token=None, + region_name=region_name, + ) + if __name__ == "__main__": unittest.main() diff --git a/dataprofiler/tests/data_readers/test_data_utils.py b/dataprofiler/tests/data_readers/test_data_utils.py index e2cb0f148..6b4127bae 100644 --- a/dataprofiler/tests/data_readers/test_data_utils.py +++ b/dataprofiler/tests/data_readers/test_data_utils.py @@ -32,9 +32,8 @@ def test_file_UTF_encoding_detection(self): dict(path=os.path.join(test_dir, "csv/reddit_wsb.csv"), encoding="utf-8"), ] - get_match_acc = lambda s, s2: sum([s[i] == s2[i] for i in range(len(s))]) / len( - s - ) + def get_match_acc(s, s2): + return sum([s[i] == s2[i] for i in range(len(s))]) / len(s) for input_file in input_files: detected_encoding = data_utils.detect_file_encoding( diff --git a/dataprofiler/tests/data_readers/test_s3_helper.py b/dataprofiler/tests/data_readers/test_s3_helper.py new file mode 100644 index 000000000..85abba7ca --- /dev/null +++ b/dataprofiler/tests/data_readers/test_s3_helper.py @@ -0,0 +1,123 @@ +import unittest +from unittest.mock import patch + +from dataprofiler import dp_logging +from dataprofiler.data_readers.data_utils import S3Helper + + +class TestS3Helper(unittest.TestCase): + @patch("boto3.client") + def test_create_s3_client_with_credentials(self, mock_boto3_client): + aws_access_key_id = "" + aws_secret_access_key = "" + region_name = "us-west-1" + + S3Helper.create_s3_client( + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=region_name, + ) + + mock_boto3_client.assert_called_with( + "s3", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_session_token=None, + region_name=region_name, + ) + + @patch("boto3.client") + @patch( + "os.environ", + { + "AWS_ACCESS_KEY_ID": "", + "AWS_SECRET_ACCESS_KEY": "", + }, + ) + def test_create_s3_client_with_environment_variables(self, mock_boto3_client): + region_name = "us-west-1" + + S3Helper.create_s3_client(region_name=region_name) + + mock_boto3_client.assert_called_with( + "s3", + aws_access_key_id="", + aws_secret_access_key="", + aws_session_token=None, + region_name=region_name, + ) + + @patch("boto3.client") + @patch("os.environ", {"AWS_REGION": "us-west-1"}) + def test_create_s3_client_with_iam_role_and_region_from_environment_variable( + self, mock_boto3_client + ): + S3Helper.create_s3_client() + + mock_boto3_client.assert_called_with( + "s3", + aws_access_key_id=None, + aws_secret_access_key=None, + aws_session_token=None, + region_name="us-west-1", + ) + + @patch("boto3.client") + @patch("botocore.exceptions.NoCredentialsError", Exception) + def test_create_s3_client_with_iam_role_fallback_to_credentials( + self, mock_boto3_client + ): + aws_access_key_id = "" + aws_secret_access_key = "" + region_name = "us-west-1" + + S3Helper.create_s3_client( + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=region_name, + ) + + mock_boto3_client.assert_called_with( + "s3", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_session_token=None, + region_name=region_name, + ) + + @patch("boto3.client") + def test_create_s3_client_with_iam_role(self, mock_boto3_client): + # Simulate a scenario where IAM roles are available, + # and no credentials are provided + region_name = "us-west-1" + + S3Helper.create_s3_client(region_name=region_name) + + mock_boto3_client.assert_called_with( + "s3", + aws_access_key_id=None, + aws_secret_access_key=None, + aws_session_token=None, + region_name=region_name, + ) + + def test_is_s3_uri_failure_logger_check(self): + invalid_path = "invalid_path" + + logger = dp_logging.get_child_logger(__name__) + + with self.assertLogs(logger, level="DEBUG") as log_context: + # Call the function with the invalid path + is_s3 = S3Helper.is_s3_uri(invalid_path, logger) + + # Assert that the function returns False (invalid path) + self.assertFalse(is_s3) + + # Assert that the log message is generated and logged + self.assertIn( + f"'{invalid_path}' is not a valid S3 URI", log_context.output[0] + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/dataprofiler/tests/plugins/test_plugins.py b/dataprofiler/tests/plugins/test_plugins.py new file mode 100644 index 000000000..14b0c77e0 --- /dev/null +++ b/dataprofiler/tests/plugins/test_plugins.py @@ -0,0 +1,47 @@ +import unittest +from collections import defaultdict +from unittest import mock + +from dataprofiler.plugins.__init__ import get_plugins, load_plugins +from dataprofiler.plugins.decorators import plugin_decorator, plugins_dict + + +class TestPlugins(unittest.TestCase): + def test_decorator_get_plugin(self, *mocks): + mock_plugin_execution = mock.Mock() + with mock.patch.dict(plugins_dict, defaultdict(dict)) as mock_plugin_dict: + + @plugin_decorator(typ="test", name="mock_test") + def test_plugin(): + mock_plugin_execution() + + expected_default_dict = defaultdict(dict) + expected_default_dict["test"]["mock_test"] = test_plugin + self.assertDictEqual(expected_default_dict, mock_plugin_dict) + + test_get_dict = get_plugins("test") + self.assertDictEqual({"mock_test": test_plugin}, test_get_dict) + + @mock.patch("dataprofiler.plugins.__init__.importlib.util") + @mock.patch("dataprofiler.plugins.__init__.os.path.isdir") + @mock.patch("dataprofiler.plugins.__init__.os.listdir") + def test_load_plugin(self, mock_listdir, mock_isdir, mock_importlib_util): + mock_listdir.side_effect = ( + lambda folder_dir: ["__pycache__", "py"] + if folder_dir.endswith("plugins") + else ["stillnotrealpy", "a.json", None] + ) + mock_isdir.return_value = True + mock_importlib_util.spec_from_file_location.return_value = None + load_plugins() + mock_importlib_util.spec_from_file_location.assert_not_called() + + mock_listdir.side_effect = ( + lambda folder_dir: ["folder"] + if folder_dir.endswith("plugins") + else ["file.py"] + ) + mock_spec = mock.Mock() + mock_importlib_util.spec_from_file_location.return_value = mock_spec + load_plugins() + mock_importlib_util.module_from_spec.assert_called_with(mock_spec) diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 2b751d573..55d2ea68e 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -724,7 +724,7 @@ def test_categorical_diff(self): "categorical_count": {"y": 1, "n": 1, "maybe": -2}, "chi2-test": { "chi2-statistic": 82 / 35, - "df": 2, + "deg_of_free": 2, "p-value": 0.3099238764710244, }, "psi": 0.0990210257942779, @@ -774,7 +774,7 @@ def test_categorical_diff(self): "unique_ratio": -0.05357142857142855, "chi2-test": { "chi2-statistic": 0.6122448979591839, - "df": 2, + "deg_of_free": 2, "p-value": 0.7362964551863367, }, "categories": "unchanged", diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 46b0212d3..1e0afc124 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -230,8 +230,11 @@ def test_diff_primitive_compilers(self): "stddev": 3.285085839971525, "t-test": { "t-statistic": 0.4155260166386663, - "conservative": {"df": 1.0, "p-value": 0.749287157907667}, - "welch": {"df": 3.6288111187629117, "p-value": 0.7011367179395704}, + "conservative": {"deg_of_free": 1.0, "p-value": 0.749287157907667}, + "welch": { + "deg_of_free": 3.6288111187629117, + "p-value": 0.7011367179395704, + }, }, "psi": 0.17328679513998632, }, @@ -322,8 +325,14 @@ def test_disabling_columns_during_primitive_diff(self): }, "t-test": { "t-statistic": -1.9674775073518591, - "conservative": {"df": 1.0, "p-value": 0.29936264581081673}, - "welch": {"df": 1.0673824509440946, "p-value": 0.28696889329266506}, + "conservative": { + "deg_of_free": 1.0, + "p-value": 0.29936264581081673, + }, + "welch": { + "deg_of_free": 1.0673824509440946, + "p-value": 0.28696889329266506, + }, }, "psi": 0, }, @@ -503,7 +512,7 @@ def test_column_stats_profile_compiler_stats_diff(self): "categorical_count": {"9": -1, "1": 1, "10": -1}, "chi2-test": { "chi2-statistic": 2.1, - "df": 2, + "deg_of_free": 2, "p-value": 0.3499377491111554, }, "psi": 0.009815252971365292, diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index b7a2bfab7..d79fdd641 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -1703,8 +1703,11 @@ def test_diff(self): }, "t-test": { "t-statistic": 0.5393164101529813, - "conservative": {"df": 2.0, "p-value": 0.643676756587475}, - "welch": {"df": 4.999127432888682, "p-value": 0.6128117908944144}, + "conservative": {"deg_of_free": 2.0, "p-value": 0.643676756587475}, + "welch": { + "deg_of_free": 4.999127432888682, + "p-value": 0.6128117908944144, + }, }, "psi": 0, } diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index d224a57a0..961b33c8c 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -1062,8 +1062,11 @@ def test_diff(self): "median_absolute_deviation": -5, "t-test": { "t-statistic": -0.5638091828819275, - "conservative": {"df": 1.0, "p-value": 0.6731699660830497}, - "welch": {"df": 1.0547717074524683, "p-value": 0.6691886269547123}, + "conservative": {"deg_of_free": 1.0, "p-value": 0.6731699660830497}, + "welch": { + "deg_of_free": 1.0547717074524683, + "p-value": 0.6691886269547123, + }, }, "psi": 0.0675775180180274, } diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index 4294dfd40..e112781ab 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -868,8 +868,11 @@ def test_diff(self): "stddev": np.sqrt(10 / 9) - np.sqrt(9 * 20 / 19), "t-test": { "t-statistic": 0.3923009049186606, - "conservative": {"df": 9, "p-value": 0.7039643545772609}, - "welch": {"df": 25.945257024943864, "p-value": 0.6980401261750298}, + "conservative": {"deg_of_free": 9, "p-value": 0.7039643545772609}, + "welch": { + "deg_of_free": 25.945257024943864, + "p-value": 0.6980401261750298, + }, }, "psi": None, } @@ -909,8 +912,8 @@ def test_diff(self): "stddev": np.nan, "t-test": { "t-statistic": None, - "conservative": {"df": None, "p-value": None}, - "welch": {"df": None, "p-value": None}, + "conservative": {"deg_of_free": None, "p-value": None}, + "welch": {"deg_of_free": None, "p-value": None}, }, "psi": None, } @@ -959,8 +962,8 @@ def test_diff(self): "stddev": np.nan, "t-test": { "t-statistic": None, - "conservative": {"df": None, "p-value": None}, - "welch": {"df": None, "p-value": None}, + "conservative": {"deg_of_free": None, "p-value": None}, + "welch": {"deg_of_free": None, "p-value": None}, }, "psi": None, } @@ -1008,8 +1011,8 @@ def test_diff(self): "stddev": 0, "t-test": { "t-statistic": None, - "conservative": {"df": None, "p-value": None}, - "welch": {"df": None, "p-value": None}, + "conservative": {"deg_of_free": None, "p-value": None}, + "welch": {"deg_of_free": None, "p-value": None}, }, "psi": None, } @@ -1056,8 +1059,11 @@ def test_diff(self): "stddev": np.sqrt(10 / 9) - np.sqrt(9 * 20 / 19), "t-test": { "t-statistic": -3.138407239349285, - "conservative": {"df": 9, "p-value": 0.011958658754358975}, - "welch": {"df": 25.945257024943864, "p-value": 0.004201616692122823}, + "conservative": {"deg_of_free": 9, "p-value": 0.011958658754358975}, + "welch": { + "deg_of_free": 25.945257024943864, + "p-value": 0.004201616692122823, + }, }, "psi": None, } diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 9507fe20f..c4e604737 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -2043,6 +2043,7 @@ def test_diff(self, *mocks): "label_representation": {"a": "unchanged"}, "sample_size": -2, "null_count": -1, + "null_ratio": -0.25, "null_types": [[], [], ["nan"]], "null_types_index": [{}, {}, {"nan": {2}}], "data_type_representation": {"all_data_types": "unchanged"}, @@ -2061,6 +2062,7 @@ def test_diff(self, *mocks): "label_representation": {"a": "unchanged"}, "sample_size": -2, "null_count": -1, + "null_ratio": -0.25, "null_types": [[], [], ["nan"]], "null_types_index": [{}, {}, {"nan": {2}}], "data_type_representation": {"all_data_types": "unchanged"}, @@ -2151,7 +2153,7 @@ def test_diff_categorical_chi2_test(self, *mocks): diff = profile1.diff(profile2) expected_chi2_test_dict = { "chi2-statistic": 2.342857142857143, - "df": 2, + "deg_of_free": 2, "p-value": 0.3099238764710244, } self.assertDictEqual( @@ -2780,6 +2782,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): "sample": ["6.0", "3.0", "4.0"], "sample_size": 5, "null_count": 2, + "null_ratio": 2 / 5, "null_types": dict(nan=["e", "b"]), "min_id": None, "max_id": None, @@ -2797,6 +2800,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): "sample": ["6.0", "nan", "nan", "4.0"], "sample_size": 6, "null_count": 2, + "null_ratio": 2 / 6, "null_types": {"1.0": ["a"], "3.0": ["c"]}, "min_id": None, "max_id": None, @@ -2814,6 +2818,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): "sample": ["3.0", "4.0", "nan", "6.0", "nan"], "sample_size": 6, "null_count": 0, + "null_ratio": 0 / 6, "null_types": {}, "min_id": None, "max_id": None, @@ -3091,6 +3096,7 @@ def test_diff(self, *mocks): "label_representation": {"a": "unchanged"}, "sample_size": 3, "null_count": 2, + "null_ratio": 2 / 7, "null_types": [["nan"], [], []], "null_types_index": [{"nan": {1, 5}}, {}, {}], "data_type_representation": {"all_data_types": "unchanged"}, @@ -3119,6 +3125,7 @@ def test_json_encode(self, mocked_datalabeler, *mocks): "sample_size": 0, "sample": [], "null_count": 0, + "null_ratio": None, "null_types": [], "null_types_index": {}, "_min_id": None, @@ -3170,6 +3177,7 @@ def test_json_encode_after_update(self, mock_DataLabeler, *mocks): "sample_size": 4, "sample": ["2", "-2", "1"], "null_count": 1, + "null_ratio": 1 / 4, "null_types": ["Nan"], "null_types_index": { "Nan": [ diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 98b87acbe..12fb1d27b 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -589,8 +589,11 @@ def test_diff(self): ), "t-test": { "t-statistic": -1.9339958714826413, - "conservative": {"df": 8.0, "p-value": 0.08916903961929257}, - "welch": {"df": 15.761400272034564, "p-value": 0.07127621949432528}, + "conservative": {"deg_of_free": 8.0, "p-value": 0.08916903961929257}, + "welch": { + "deg_of_free": 15.761400272034564, + "p-value": 0.07127621949432528, + }, }, } diff --git a/dataprofiler/version.py b/dataprofiler/version.py index 9b043f481..fa4db03fb 100644 --- a/dataprofiler/version.py +++ b/dataprofiler/version.py @@ -2,7 +2,7 @@ MAJOR = 0 MINOR = 10 -MICRO = 5 +MICRO = 6 POST = None # otherwise None VERSION = "%d.%d.%d" % (MAJOR, MINOR, MICRO) diff --git a/requirements.txt b/requirements.txt index 7c8aa0b99..a45dc34ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,4 @@ typing-extensions>=3.10.0.2 HLL>=2.0.3 datasketches>=4.1.0 packaging>=23.0 +boto3>=1.28.61