Skip to content

Commit

Permalink
Merge branch 'main' into feat/add-nino-recognizer
Browse files Browse the repository at this point in the history
  • Loading branch information
omri374 authored Sep 22, 2024
2 parents 1e9cd99 + 9321e14 commit 25216ce
Show file tree
Hide file tree
Showing 11 changed files with 156 additions and 5 deletions.
21 changes: 21 additions & 0 deletions .devcontainer/presidio-analyzer-transformers/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
  // Dev container for presidio-analyzer built with the transformers configuration.
  "name": "Presidio Analyzer Transformers",
  "build": {
    "dockerfile": "../../presidio-analyzer/Dockerfile.dev",
    "context": "../../presidio-analyzer",
    // Build arguments forwarded to Dockerfile.dev.
    "args": {
      "DEV_MODE": "transformers",
      "NLP_CONF_FILE": "presidio_analyzer/conf/transformers.yaml",
      "POETRY_EXTRAS": "-E transformers"
    }
  },
  // Only the presidio-analyzer package is bind-mounted, at /workspace.
  "workspaceMount": "source=${localWorkspaceFolder}/presidio-analyzer,target=/workspace,type=bind",
  "workspaceFolder": "/workspace",
  // Runs once, after the container has been created.
  "postCreateCommand": "chmod +x ./install_dependencies.sh && ./install_dependencies.sh",
  "postAttachCommand": "poetry shell",
  // Editor extensions must be nested under the tool-specific "vscode" key;
  // a bare "customizations.extensions" entry is not recognised.
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-python.python"
      ]
    }
  }
}
18 changes: 18 additions & 0 deletions .devcontainer/presidio-analyzer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
  // Dev container for presidio-analyzer using the Dockerfile's default configuration.
  "name": "Presidio Analyzer",
  "build": {
    "dockerfile": "../../presidio-analyzer/Dockerfile.dev",
    "context": "../../presidio-analyzer",
    "args": {
      "DEV_MODE": "dev"
    }
  },
  // Only the presidio-analyzer package is bind-mounted, at /workspace.
  "workspaceMount": "source=${localWorkspaceFolder}/presidio-analyzer,target=/workspace,type=bind",
  "workspaceFolder": "/workspace",
  // Install dependencies once at container creation rather than on every attach,
  // so re-attaching does not re-run the full installation.
  "postCreateCommand": "chmod +x ./install_dependencies.sh && ./install_dependencies.sh",
  "postAttachCommand": "poetry shell",
  // Editor extensions must be nested under the tool-specific "vscode" key;
  // a bare "customizations.extensions" entry is not recognised.
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-python.python"
      ]
    }
  }
}
20 changes: 20 additions & 0 deletions .devcontainer/presidio-anonymizer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
  // Dev container for presidio-anonymizer.
  "name": "Presidio Anonymizer",
  "build": {
    "dockerfile": "../../presidio-anonymizer/Dockerfile.dev",
    "context": "../../presidio-anonymizer"
  },
  // Only the presidio-anonymizer package is bind-mounted, at /workspace.
  "workspaceMount": "source=${localWorkspaceFolder}/presidio-anonymizer,target=/workspace,type=bind",
  "workspaceFolder": "/workspace",
  // Array form: executed directly, without a shell.
  "onCreateCommand": [
    "poetry",
    "install",
    "--no-interaction"
  ],
  "postAttachCommand": "poetry shell",
  // Editor extensions must be nested under the tool-specific "vscode" key;
  // a bare "customizations.extensions" entry is not recognised.
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-python.python"
      ]
    }
  }
}
20 changes: 20 additions & 0 deletions .devcontainer/presidio-image-redactor/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
  // Dev container for presidio-image-redactor.
  "name": "Presidio Image Redactor",
  "build": {
    "dockerfile": "../../presidio-image-redactor/Dockerfile.dev",
    "context": "../../presidio-image-redactor"
  },
  // Only the presidio-image-redactor package is bind-mounted, at /workspace.
  "workspaceMount": "source=${localWorkspaceFolder}/presidio-image-redactor,target=/workspace,type=bind",
  "workspaceFolder": "/workspace",
  // Array form: executed directly, without a shell.
  "onCreateCommand": [
    "poetry",
    "install",
    "--no-interaction"
  ],
  "postAttachCommand": "poetry shell",
  // Editor extensions must be nested under the tool-specific "vscode" key;
  // a bare "customizations.extensions" entry is not recognised.
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-python.python"
      ]
    }
  }
}
20 changes: 20 additions & 0 deletions presidio-analyzer/Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Development image for the Presidio Analyzer dev container.
# Provides Python 3.9, a native build toolchain and Poetry; the project itself
# is expected to be mounted into the container rather than copied in.
FROM python:3.9-slim

# Build-time settings; each one is re-exported below as a container environment variable.
ARG DEV_MODE=dev
ARG POETRY_EXTRAS=""
ARG NLP_CONF_FILE=presidio_analyzer/conf/default.yaml
ARG ANALYZER_CONF_FILE=presidio_analyzer/conf/default_analyzer.yaml
ARG RECOGNIZER_REGISTRY_CONF_FILE=presidio_analyzer/conf/default_recognizers.yaml

# Persist the settings so processes started inside the container can read them.
# PIP_NO_CACHE_DIR=1 disables pip's download cache for every pip invocation.
ENV DEV_MODE=${DEV_MODE} \
    PIP_NO_CACHE_DIR=1 \
    ANALYZER_CONF_FILE=${ANALYZER_CONF_FILE} \
    RECOGNIZER_REGISTRY_CONF_FILE=${RECOGNIZER_REGISTRY_CONF_FILE} \
    NLP_CONF_FILE=${NLP_CONF_FILE} \
    POETRY_EXTRAS=${POETRY_EXTRAS}

# Install essential build tools.
# --no-install-recommends keeps optional packages out of the image, and the apt
# lists are removed in the same layer so they do not persist in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): Poetry is unpinned, so rebuilds may pick up a newer release;
# consider pinning a version for reproducible images.
RUN pip install poetry
5 changes: 5 additions & 0 deletions presidio-analyzer/install_dependencies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Installs the analyzer's Python dependencies and then its NLP models.
#
# Environment:
#   POETRY_EXTRAS  optional additional `poetry install` flags, e.g. "-E transformers"
#   NLP_CONF_FILE  NLP configuration file handed to install_nlp_models.py

# Stop at the first failing command so a broken dependency install is not
# masked by the exit status of the model installation step.
set -e

# POETRY_EXTRAS is deliberately left unquoted: it may contain several
# space-separated flags, or be empty, and must be word-split by the shell.
# shellcheck disable=SC2086
poetry install -E server ${POETRY_EXTRAS} --no-interaction

poetry run python install_nlp_models.py --conf_file "$NLP_CONF_FILE"
14 changes: 10 additions & 4 deletions presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,15 @@ def get_supported_entities(self) -> List[str]:
raise ValueError(
"model_to_presidio_entity_mapping is missing from model configuration"
)
return list(
entities_from_mapping = list(
set(self.ner_model_configuration.model_to_presidio_entity_mapping.values())
)
entities = [
ent
for ent in entities_from_mapping
if ent not in self.ner_model_configuration.labels_to_ignore
]
return entities

def get_supported_languages(self) -> List[str]:
"""Return the supported languages for this NLP engine."""
Expand Down Expand Up @@ -121,9 +127,9 @@ def process_batch(
raise ValueError("NLP engine is not loaded. Consider calling .load()")

texts = (str(text) for text in texts)
docs = self.nlp[language].pipe(texts,
as_tuples=as_tuples,
batch_size=batch_size)
docs = self.nlp[language].pipe(
texts, as_tuples=as_tuples, batch_size=batch_size
)
for doc in docs:
yield doc.text, self._doc_to_nlp_artifact(doc, language)

Expand Down
7 changes: 6 additions & 1 deletion presidio-analyzer/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,15 @@ stanza = { version = "*", optional = true }
spacy_stanza = { version = "*", optional = true }
azure-ai-textanalytics = { version = "*", optional = true }
azure-core = { version = "*", optional = true }
transformers = { version = "*", optional = true }
huggingface_hub = { version = "*", optional = true }

[tool.poetry.extras]
server = ["flask"]
transformers = ["spacy_huggingface_pipelines"]
transformers = [
"transformers",
"huggingface_hub",
"spacy_huggingface_pipelines"]
stanza = [
"stanza",
"spacy_stanza",
Expand Down
13 changes: 13 additions & 0 deletions presidio-analyzer/tests/test_spacy_nlp_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,16 @@ def test_default_configuration_correct():
)

assert actual_config_json == expected_config_json


def test_get_supported_entities_doesnt_include_ignored():
    """Entities named in ``labels_to_ignore`` must be absent from the supported entities."""
    ignored_labels = ["A", "B"]
    entity_mapping = {"A": "A", "B": "B", "C": "C"}
    configuration = NerModelConfiguration(
        labels_to_ignore=ignored_labels,
        model_to_presidio_entity_mapping=entity_mapping,
    )
    engine = SpacyNlpEngine(ner_model_configuration=configuration)

    supported = engine.get_supported_entities()

    # The ignored labels are filtered out; the remaining mapped entity is kept.
    assert "A" not in supported
    assert "B" not in supported
    assert "C" in supported
7 changes: 7 additions & 0 deletions presidio-anonymizer/Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Development image for the Presidio Anonymizer dev container:
# Python 3.9 with a native build toolchain and Poetry.
FROM python:3.9-slim

# Install essential build tools.
# --no-install-recommends keeps optional packages out of the image, and the apt
# lists are removed in the same layer so they do not persist in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): Poetry is unpinned, so rebuilds may pick up a newer release;
# consider pinning a version for reproducible images.
RUN pip install --no-cache-dir poetry
16 changes: 16 additions & 0 deletions presidio-image-redactor/Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Development image for the Presidio Image Redactor dev container:
# Python 3.9 with a native build toolchain, Tesseract OCR and Poetry.
FROM python:3.9-slim

# Install all OS packages in a single layer so `apt-get update` and the
# installs always share the same package index, and remove the apt lists in
# that same layer so they do not persist in the image.
#   build-essential          - native build toolchain
#   tesseract-ocr            - Tesseract OCR engine
#   ffmpeg, libsm6, libxext6 - presumably native libraries needed by the
#                              image-processing Python dependencies (TODO confirm)
# `tesseract -v` fails the build early if the OCR binary is not usable.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ffmpeg \
        libsm6 \
        libxext6 \
        tesseract-ocr \
    && rm -rf /var/lib/apt/lists/* \
    && tesseract -v

# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): Poetry is unpinned, so rebuilds may pick up a newer release;
# consider pinning a version for reproducible images.
RUN pip install --no-cache-dir poetry

0 comments on commit 25216ce

Please sign in to comment.