Add Polars Series support to the unstructured labeler profile

capitalone · Mar 22, 2024 · 28a20a1
1 parent b5416e1
commit 28a20a1
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 18 deletions.
diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -3,6 +3,7 @@
 
 from collections import defaultdict
 
+import polars as pl
 from pandas import Series
 
 from ..labelers.base_data_labeler import BaseDataLabeler
@@ -102,7 +103,7 @@ def __add__(self, other: UnstructuredLabelerProfile) -> UnstructuredLabelerProfi
 
         return merged_profile
 
-    def report(self, remove_disabled_flag: bool = False) -> dict:
+    def report(self) -> dict:
         """
         Return profile object.
 
@@ -176,6 +177,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
             df_series_clean, predictions.copy(), self.data_labeler.label_mapping
         )
 
+        df_series_clean = pl.Series(df_series_clean)
         # Update counts and percent values
         self._update_word_label_counts(df_series_clean, format_predictions["pred"])
         self._update_true_char_label_counts(predictions["pred"])
@@ -188,14 +190,17 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
         # CHARACTERS/WORDS PROCESSED
         self._update_column_base_properties(profile)
 
-    def update(self, df_series: Series) -> None:
+    def update(self, df_series: Series | pl.Series) -> None:
         """Update profile."""
         if len(df_series) == 0:
             return
         profile = dict(
             char_sample_size=self.char_sample_size,
             word_sample_size=self.word_sample_size,
         )
+
+        if type(df_series) is pl.Series:
+            df_series = df_series.to_pandas()
         self._update_helper(df_series, profile)
 
     @property
@@ -278,7 +283,7 @@ def _update_true_char_label_counts(self, predictions: list) -> None:
             self.char_sample_size += len(sample)
 
     def _update_postprocess_char_label_counts(
-        self, df_series_clean: Series, format_predictions: dict
+        self, df_series_clean: Series | pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the postprocess character label counts.
@@ -292,7 +297,8 @@ def _update_postprocess_char_label_counts(
         """
         char_label_counts = self.entity_counts["postprocess_char_level"]
 
-        for index, result in enumerate(zip(df_series_clean, format_predictions)):
+        df_series_clean = pl.Series(df_series_clean)
+        for result in zip(df_series_clean, format_predictions):
             text, entities = result
             index = 0
             for entity in entities:
@@ -308,7 +314,7 @@ def _update_postprocess_char_label_counts(
             char_label_counts["UNKNOWN"] += len(text) - index
 
     def _update_word_label_counts(
-        self, df_series_clean: Series, format_predictions: dict
+        self, df_series_clean: Series | pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the sorted dictionary of each entity count.
@@ -321,7 +327,8 @@ def _update_word_label_counts(
         """
         word_label_counts = self.entity_counts["word_level"]
 
-        for index, result in enumerate(zip(df_series_clean, format_predictions)):
+        df_series_clean = pl.Series(df_series_clean)
+        for result in zip(df_series_clean, format_predictions):
             text, entities = result
             begin_word_idx = -1
             index = 0

diff --git a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
@@ -3,6 +3,7 @@
 from unittest import mock
 
 import pandas as pd
+import polars as pl
 
 from dataprofiler.profilers import profiler_utils
 from dataprofiler.profilers.unstructured_labeler_profile import (
@@ -15,7 +16,7 @@ def test_char_level_counts(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["abc123", "Bob", "!@##$%"])
+        sample = pl.Series(["abc123", "Bob", "!@##$%"])
 
         # running update
         default.update(sample)
@@ -34,7 +35,7 @@ def test_advanced_sample(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912."
@@ -56,7 +57,7 @@ def test_word_level_NER_label_counts(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000049939232194912."
@@ -78,7 +79,7 @@ def test_statistics(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912."
@@ -123,7 +124,7 @@ def test_profile(self, processor_class_mock, model_class_mock):
         # initialize labeler profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["a"])
+        sample = pl.Series(["a"])
         expected_profile = dict(
             entity_counts={
                 "postprocess_char_level": defaultdict(int, {"UNKNOWN": 1}),
@@ -163,15 +164,15 @@ def test_report(self, processor_class_mock, model_class_mock):
         # initialize labeler profile
         profile = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["a"])
+        sample = pl.Series(["a"])
 
         time_array = [float(i) for i in range(4, 0, -1)]
         with mock.patch("time.time", side_effect=lambda: time_array.pop()):
             profile.update(sample)
 
         report1 = profile.profile
-        report2 = profile.report(remove_disabled_flag=False)
-        report3 = profile.report(remove_disabled_flag=True)
+        report2 = profile.report()
+        report3 = profile.report()
         self.assertDictEqual(report1, report2)
         self.assertDictEqual(report1, report3)
 
@@ -192,7 +193,7 @@ def test_entity_percentages(self, mock1, mock2):
         profile.entity_counts["true_char_level"]["TEST"] = 16
         profile.entity_counts["word_level"]["UNKNOWN"] = 5
         profile.entity_counts["word_level"]["TEST"] = 5
-        profile.update(pd.Series(["a"]))
+        profile.update(pl.Series(["a"]))
 
         expected_percentages = {
             "postprocess_char_level": defaultdict(int, {"UNKNOWN": 0.3, "TEST": 0.7}),
@@ -275,7 +276,7 @@ def test_diff(self, mock1, mock2):
         profiler1.entity_counts["word_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["word_level"]["TEST"] = 5
         profiler1.entity_counts["word_level"]["UNIQUE1"] = 5
-        profiler1.update(pd.Series(["a"]))
+        profiler1.update(pl.Series(["a"]))
 
         profiler2 = UnstructuredLabelerProfile()
         profiler2.char_sample_size = 20
@@ -289,7 +290,7 @@ def test_diff(self, mock1, mock2):
         profiler2.entity_counts["word_level"]["UNKNOWN"] = 2
         profiler2.entity_counts["word_level"]["TEST"] = 4
         profiler2.entity_counts["word_level"]["UNIQUE2"] = 4
-        profiler2.update(pd.Series(["a"]))
+        profiler2.update(pl.Series(["a"]))
 
         expected_diff = {
             "entity_counts": {
@@ -342,7 +343,7 @@ def test_diff(self, mock1, mock2):
         profiler1.entity_counts["postprocess_char_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["true_char_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["word_level"]["UNKNOWN"] = 5
-        profiler1.update(pd.Series(["a"]))
+        profiler1.update(pl.Series(["a"]))
 
         profiler2 = UnstructuredLabelerProfile()
         profile2 = profiler2.profile