Skip to content

Commit

Permalink
Polars added to unstructured labeler
Browse files (browse the repository at this point in the history)
  • Loading branch information
abajpai15 committed Mar 22, 2024
1 parent b5416e1 commit 28a20a1
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 18 deletions.
19 changes: 13 additions & 6 deletions dataprofiler/profilers/unstructured_labeler_profile.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,7 @@

from collections import defaultdict

import polars as pl
from pandas import Series

from ..labelers.base_data_labeler import BaseDataLabeler
Expand Down Expand Up @@ -102,7 +103,7 @@ def __add__(self, other: UnstructuredLabelerProfile) -> UnstructuredLabelerProfi

return merged_profile

def report(self, remove_disabled_flag: bool = False) -> dict:
def report(self) -> dict:
"""
Return profile object.
Expand Down Expand Up @@ -176,6 +177,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
df_series_clean, predictions.copy(), self.data_labeler.label_mapping
)

df_series_clean = pl.Series(df_series_clean)
# Update counts and percent values
self._update_word_label_counts(df_series_clean, format_predictions["pred"])
self._update_true_char_label_counts(predictions["pred"])
Expand All @@ -188,14 +190,17 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
# CHARACTERS/WORDS PROCESSED
self._update_column_base_properties(profile)

def update(self, df_series: Series) -> None:
def update(self, df_series: Series | pl.Series) -> None:
"""Update profile."""
if len(df_series) == 0:
return
profile = dict(
char_sample_size=self.char_sample_size,
word_sample_size=self.word_sample_size,
)

if type(df_series) is pl.Series:
df_series = df_series.to_pandas()
self._update_helper(df_series, profile)

@property
Expand Down Expand Up @@ -278,7 +283,7 @@ def _update_true_char_label_counts(self, predictions: list) -> None:
self.char_sample_size += len(sample)

def _update_postprocess_char_label_counts(
self, df_series_clean: Series, format_predictions: dict
self, df_series_clean: Series | pl.Series, format_predictions: dict
) -> None:
"""
Update the postprocess character label counts.
Expand All @@ -292,7 +297,8 @@ def _update_postprocess_char_label_counts(
"""
char_label_counts = self.entity_counts["postprocess_char_level"]

for index, result in enumerate(zip(df_series_clean, format_predictions)):
df_series_clean = pl.Series(df_series_clean)
for result in zip(df_series_clean, format_predictions):
text, entities = result
index = 0
for entity in entities:
Expand All @@ -308,7 +314,7 @@ def _update_postprocess_char_label_counts(
char_label_counts["UNKNOWN"] += len(text) - index

def _update_word_label_counts(
self, df_series_clean: Series, format_predictions: dict
self, df_series_clean: Series | pl.Series, format_predictions: dict
) -> None:
"""
Update the sorted dictionary of each entity count.
Expand All @@ -321,7 +327,8 @@ def _update_word_label_counts(
"""
word_label_counts = self.entity_counts["word_level"]

for index, result in enumerate(zip(df_series_clean, format_predictions)):
df_series_clean = pl.Series(df_series_clean)
for result in zip(df_series_clean, format_predictions):
text, entities = result
begin_word_idx = -1
index = 0
Expand Down
25 changes: 13 additions & 12 deletions dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,7 @@
from unittest import mock

import pandas as pd
import polars as pl

from dataprofiler.profilers import profiler_utils
from dataprofiler.profilers.unstructured_labeler_profile import (
Expand All @@ -15,7 +16,7 @@ def test_char_level_counts(self):
# setting up objects/profile
default = UnstructuredLabelerProfile()

sample = pd.Series(["abc123", "Bob", "!@##$%"])
sample = pl.Series(["abc123", "Bob", "!@##$%"])

# running update
default.update(sample)
Expand All @@ -34,7 +35,7 @@ def test_advanced_sample(self):
# setting up objects/profile
default = UnstructuredLabelerProfile()

sample = pd.Series(
sample = pl.Series(
[
"Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
"\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912."
Expand All @@ -56,7 +57,7 @@ def test_word_level_NER_label_counts(self):
# setting up objects/profile
default = UnstructuredLabelerProfile()

sample = pd.Series(
sample = pl.Series(
[
"Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
"\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000049939232194912."
Expand All @@ -78,7 +79,7 @@ def test_statistics(self):
# setting up objects/profile
default = UnstructuredLabelerProfile()

sample = pd.Series(
sample = pl.Series(
[
"Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
"\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912."
Expand Down Expand Up @@ -123,7 +124,7 @@ def test_profile(self, processor_class_mock, model_class_mock):
# initialize labeler profile
default = UnstructuredLabelerProfile()

sample = pd.Series(["a"])
sample = pl.Series(["a"])
expected_profile = dict(
entity_counts={
"postprocess_char_level": defaultdict(int, {"UNKNOWN": 1}),
Expand Down Expand Up @@ -163,15 +164,15 @@ def test_report(self, processor_class_mock, model_class_mock):
# initialize labeler profile
profile = UnstructuredLabelerProfile()

sample = pd.Series(["a"])
sample = pl.Series(["a"])

time_array = [float(i) for i in range(4, 0, -1)]
with mock.patch("time.time", side_effect=lambda: time_array.pop()):
profile.update(sample)

report1 = profile.profile
report2 = profile.report(remove_disabled_flag=False)
report3 = profile.report(remove_disabled_flag=True)
report2 = profile.report()
report3 = profile.report()
self.assertDictEqual(report1, report2)
self.assertDictEqual(report1, report3)

Expand All @@ -192,7 +193,7 @@ def test_entity_percentages(self, mock1, mock2):
profile.entity_counts["true_char_level"]["TEST"] = 16
profile.entity_counts["word_level"]["UNKNOWN"] = 5
profile.entity_counts["word_level"]["TEST"] = 5
profile.update(pd.Series(["a"]))
profile.update(pl.Series(["a"]))

expected_percentages = {
"postprocess_char_level": defaultdict(int, {"UNKNOWN": 0.3, "TEST": 0.7}),
Expand Down Expand Up @@ -275,7 +276,7 @@ def test_diff(self, mock1, mock2):
profiler1.entity_counts["word_level"]["UNKNOWN"] = 5
profiler1.entity_counts["word_level"]["TEST"] = 5
profiler1.entity_counts["word_level"]["UNIQUE1"] = 5
profiler1.update(pd.Series(["a"]))
profiler1.update(pl.Series(["a"]))

profiler2 = UnstructuredLabelerProfile()
profiler2.char_sample_size = 20
Expand All @@ -289,7 +290,7 @@ def test_diff(self, mock1, mock2):
profiler2.entity_counts["word_level"]["UNKNOWN"] = 2
profiler2.entity_counts["word_level"]["TEST"] = 4
profiler2.entity_counts["word_level"]["UNIQUE2"] = 4
profiler2.update(pd.Series(["a"]))
profiler2.update(pl.Series(["a"]))

expected_diff = {
"entity_counts": {
Expand Down Expand Up @@ -342,7 +343,7 @@ def test_diff(self, mock1, mock2):
profiler1.entity_counts["postprocess_char_level"]["UNKNOWN"] = 5
profiler1.entity_counts["true_char_level"]["UNKNOWN"] = 5
profiler1.entity_counts["word_level"]["UNKNOWN"] = 5
profiler1.update(pd.Series(["a"]))
profiler1.update(pl.Series(["a"]))

profiler2 = UnstructuredLabelerProfile()
profile2 = profiler2.profile
Expand Down

0 comments on commit 28a20a1

Please sign in to comment.