Skip to content

Commit

Permalink
Merge branch 'main' into modify-install-script
Browse files Browse the repository at this point in the history
  • Loading branch information
Mustaballer authored Aug 30, 2023
2 parents 705a2f0 + 824377b commit 73bfad4
Show file tree
Hide file tree
Showing 12 changed files with 659 additions and 213 deletions.
75 changes: 75 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,81 @@



## v0.13.0 (2023-08-29)

### Feature

* feat(privacy.providers): add aws_comprehend as scrubbing provider (#476)

* add aws_comprehend provider module

* fix typo

* add boto3 and botocore to poetry

* ran poetry update

* add the AWS Comprehend Class

* update the script

* create test module to test comprehend

* refactor name

* add full-fledged test module

* try to fix failing tests

* try to fix failing tests

* fixed pytest

* format test file

* fix all flake 8 formattings

* add naming meaning for NER

* remove repeated code and make a TextMixin

* format

* change scrub enabled to false
after testing visualization

* remove redundant methods

* add enum style provider names

addressing: https://github.com/OpenAdaptAI/OpenAdapt/pull/476#discussion_r1304821459

* fix all flake8 errors

* add files for private_ai scrubbing provider

* change name

* add scrub text function in
PrivateAIScrubbingProvider

* try to skip tests if api key is incorrect or missing or the syntax used in code is incorrect

* pytest passes for test_private_ai

* add temp code for pdfs redaction

* remove the PRIVATE_AI provider code as it will be added in a separate PR

* remove more private_ai files

* ran black

---------

Co-authored-by: Richard Abrich <[email protected]> ([`33fe244`](https://github.com/OpenAdaptAI/OpenAdapt/commit/33fe2446311dd8dea1465f617662407d3bc182cc))


## v0.12.0 (2023-08-29)

### Feature
Expand Down
Binary file added assets/sample_llc_1.pdf
Binary file not shown.
6 changes: 3 additions & 3 deletions openadapt/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
# APP CONFIGURATIONS
"APP_DARK_MODE": False,
# SCRUBBING CONFIGURATIONS
"SCRUB_ENABLED": True,
"SCRUB_ENABLED": False,
"SCRUB_CHAR": "*",
"SCRUB_LANGUAGE": "en",
# TODO support lists in getenv_fallback
Expand All @@ -62,7 +62,7 @@
"nlp_engine_name": "spacy",
"models": [{"lang_code": "en", "model_name": "en_core_web_trf"}],
},
"SCRUB_IGNORE_ENTITIES": [
"SCRUB_PRESIDIO_IGNORE_ENTITIES": [
# 'US_PASSPORT',
# 'US_DRIVER_LICENSE',
# 'CRYPTO',
Expand Down Expand Up @@ -109,7 +109,7 @@
# Calculate and save the difference between 2 neighboring screenshots
"SAVE_SCREENSHOT_DIFF": False,
"SPACY_MODEL_NAME": "en_core_web_trf",
"SCRUB_PROVIDER_NAME": ["Presidio"],
"PRIVATE_AI_API_KEY": "<set your api key in .env>",
}

# each string in STOP_STRS should only contain strings
Expand Down
183 changes: 183 additions & 0 deletions openadapt/privacy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,189 @@ def scrub_mp4( # pylint: disable=too-many-arguments
raise NotImplementedError


class TextScrubbingMixin:
"""Mixin class to add scrub_text method."""

def scrub_text_all(self, text: str) -> str:
"""Scrub the text by replacing all characters with config.SCRUB_CHAR.
Args:
text (str): Text to be scrubbed
Returns:
str: Scrubbed text
"""
return config.SCRUB_CHAR * len(text) # pylint: disable=E1101

def scrub_dict(
self,
input_dict: dict,
list_keys: list = None,
scrub_all: bool = False,
force_scrub_children: bool = False,
) -> dict:
"""Scrub the dict of all PII/PHI using Presidio ANALYZER.TRF and Anonymizer.
Args:
input_dict (dict): A dict to be scrubbed
list_keys (list): List of keys to be scrubbed
scrub_all (bool): Whether to scrub all sub-fields/keys/values
of that particular key
force_scrub_children (bool): Whether to force scrub children
even if key is not present
Returns:
dict: The scrubbed dict with PII and PHI removed.
"""
if list_keys is None:
list_keys = config.SCRUB_KEYS_HTML # pylint: disable=E1101

scrubbed_dict = {}
for key, value in input_dict.items():
if self._should_scrub_text(key, value, list_keys, scrub_all):
scrubbed_text = self._scrub_text_item(value, key, force_scrub_children)
if key in ("text", "canonical_text") and self._is_scrubbed(
value, scrubbed_text
):
force_scrub_children = True
scrubbed_dict[key] = scrubbed_text
elif isinstance(value, list):
scrubbed_list = [
(
self._scrub_list_item(
item, key, list_keys, force_scrub_children
)
if self._should_scrub_list_item(item, key, list_keys)
else item
)
for item in value
]
scrubbed_dict[key] = scrubbed_list
force_scrub_children = False
elif isinstance(value, dict):
if isinstance(key, str) and key == "state":
scrubbed_dict[key] = self.scrub_dict(
value, list_keys, scrub_all=True
)
else:
scrubbed_dict[key] = self.scrub_dict(value, list_keys)
else:
scrubbed_dict[key] = value

return scrubbed_dict

def scrub_list_dicts(
self, input_list: list[dict], list_keys: list = None
) -> list[dict]:
"""Scrub list of dicts to remove PII/PHI.
Args:
input_list (list[dict]): A list of dicts to be scrubbed
list_keys (list): List of keys to be scrubbed
Returns:
list[dict]: The scrubbed list of dicts with PII and PHI removed.
"""
scrubbed_list_dicts = []
for input_dict in input_list:
scrubbed_list_dicts.append(self.scrub_dict(input_dict, list_keys))

return scrubbed_list_dicts

def _should_scrub_text(
self,
key: str,
value: str,
list_keys: list[str],
scrub_all: bool = False,
) -> bool:
"""Check if the key and value should be scrubbed and are of correct instance.
Args:
key (str): The key of the item.
value (str): The value of the item.
list_keys (list[str]): A list of keys that need to be scrubbed.
scrub_all (bool): Whether to scrub all sub-fields/keys/values
of that particular key.
Returns:
bool: True if the key and value should be scrubbed, False otherwise.
"""
return (
isinstance(value, str)
and isinstance(key, str)
and (key in list_keys or scrub_all)
)

def _is_scrubbed(self, old_text: str, new_text: str) -> bool:
"""Check if the text has been scrubbed.
Args:
old_text (str): The original text
new_text (str): The scrubbed text
Returns:
bool: True if the text has been scrubbed, False otherwise
"""
return old_text != new_text

def _scrub_text_item(
self, value: str, key: str, force_scrub_children: bool = False
) -> str:
"""Scrubs the value of a text item.
Args:
value (str): The value of the item
key (str): The key of the item
Returns:
str: The scrubbed value
"""
if key in ("text", "canonical_text"):
return self.scrub_text(value, is_separated=True)
if force_scrub_children:
return self.scrub_text_all(value)
return self.scrub_text(value)

def _should_scrub_list_item(
self, item: str, key: str, list_keys: list[str]
) -> bool:
"""Check if the key and item should be scrubbed and are of correct instance.
Args:
item (str): The value of the item
key (str): The key of the item
list_keys (list): A list of keys that are needed to be scrubbed
Returns:
bool: True if the key and value should be scrubbed, False otherwise
"""
return isinstance(item, (str)) and isinstance(key, str) and key in list_keys

def _scrub_list_item(
self,
item: str | dict,
key: str,
list_keys: list[str],
force_scrub_children: bool = False,
) -> str | dict:
"""Scrubs the value of a dict item.
Args:
item (str/dict): The value of the dict item
key (str): The key of the dict item
list_keys (list): A list of keys that are needed to be scrubbed
Returns:
dict/str: The scrubbed dict/value respectively
"""
if isinstance(item, dict):
return self.scrub_dict(
item, list_keys, force_scrub_children=force_scrub_children
)
return self._scrub_text_item(item, key)


class ScrubbingProviderFactory: # pylint: disable=too-few-public-methods
"""A Factory Class for Scrubbing Providers."""

Expand Down
7 changes: 7 additions & 0 deletions openadapt/privacy/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,10 @@
Module: __init__.py
"""


class ScrubProvider:  # pylint: disable=too-few-public-methods
    """String identifiers naming the scrubbing providers."""

    # Each constant's value is identical to its own attribute name.
    PRESIDIO = "PRESIDIO"
    COMPREHEND = "COMPREHEND"
Loading

0 comments on commit 73bfad4

Please sign in to comment.