Skip to content

Commit

Permalink
Phone Number Recognizer - Multi-Regional And set as Default (#775)
Browse files Browse the repository at this point in the history
* Phone Number Recognizer - Multi-Regional And Default
  • Loading branch information
SharonHart authored Oct 3, 2021
1 parent 0a4defe commit 734807f
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 175 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,15 @@
from .iban_recognizer import IbanRecognizer
from .ip_recognizer import IpRecognizer
from .medical_license_recognizer import MedicalLicenseRecognizer
from .phone_recognizer import PhoneRecognizer
from .sg_fin_recognizer import SgFinRecognizer
from .spacy_recognizer import SpacyRecognizer
from .stanza_recognizer import StanzaRecognizer
from .phone_recognizer import PhoneRecognizer
from .uk_nhs_recognizer import NhsRecognizer
from .us_bank_recognizer import UsBankRecognizer
from .us_driver_license_recognizer import UsLicenseRecognizer
from .us_itin_recognizer import UsItinRecognizer
from .us_passport_recognizer import UsPassportRecognizer
from .us_phone_recognizer import UsPhoneRecognizer
from .us_ssn_recognizer import UsSsnRecognizer
from .es_nif_recognizer import EsNifRecognizer
from .au_abn_recognizer import AuAbnRecognizer
Expand All @@ -39,6 +38,7 @@
"IpRecognizer",
"NhsRecognizer",
"MedicalLicenseRecognizer",
"PhoneRecognizer",
"SgFinRecognizer",
"SpacyRecognizer",
"StanzaRecognizer",
Expand All @@ -47,7 +47,6 @@
"UsItinRecognizer",
"UsLicenseRecognizer",
"UsPassportRecognizer",
"UsPhoneRecognizer",
"UsSsnRecognizer",
"EsNifRecognizer",
"SpacyRecognizer",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,40 +1,39 @@
from typing import List, Optional

import phonenumbers
from phonenumbers import COUNTRY_CODE_TO_REGION_CODE, SUPPORTED_REGIONS
from phonenumbers.geocoder import country_name_for_number

from presidio_analyzer import RecognizerResult, LocalRecognizer, AnalysisExplanation
from presidio_analyzer import (
RecognizerResult,
LocalRecognizer,
AnalysisExplanation,
EntityRecognizer,
)
from presidio_analyzer.nlp_engine import NlpArtifacts

ENTITY_TYPE_SUFFIX = "_PHONE_NUMBER"
INTERNATIONAL_ENTITY_TYPE = "INTERNATIONAL_PHONE_NUMBER"


class PhoneRecognizer(LocalRecognizer):
"""Recognize multi-regional phone numbers.
Using python-phonenumbers, along with fixed and regional context words.
:param context: Base context words for enhancing the assurance scores.
:param supported_language: Language this recognizer supports
:param supported_entities: The entities this recognizer can detect
:param supported_regions: The regions for phone number matching and validation
"""

SCORE = 0.6
SCORE = 0.4
CONTEXT = ["phone", "number", "telephone", "cell", "cellphone", "mobile", "call"]
DEFAULT_SUPPORTED_COUNTRY_CODES = ("US", "UK", "DE", "FE", "IL")
DEFAULT_SUPPORTED_REGIONS = ("US", "UK", "DE", "FE", "IL", "IN", "CA", "BR")

def __init__(
self,
context: Optional[List[str]] = CONTEXT,
supported_language: str = "en",
supported_entities: List[str] = [
code + ENTITY_TYPE_SUFFIX for code in DEFAULT_SUPPORTED_COUNTRY_CODES
],
support_international=True
# For all regions, use phonenumbers.SUPPORTED_REGIONS
supported_regions=DEFAULT_SUPPORTED_REGIONS,
):
self.context = context
self.supported_entities = supported_entities + [INTERNATIONAL_ENTITY_TYPE]\
if support_international else supported_entities
self.supported_regions = supported_regions
super().__init__(
supported_entities=self.get_supported_entities(),
supported_language=supported_language,
Expand All @@ -44,14 +43,7 @@ def load(self) -> None: # noqa D102
pass

def get_supported_entities(self): # noqa D102
return (
self.supported_entities
if self.supported_entities
else [
value[0] + ENTITY_TYPE_SUFFIX
for value in COUNTRY_CODE_TO_REGION_CODE.values()
]
)
return ["PHONE_NUMBER"]

def analyze(
self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
Expand All @@ -66,56 +58,41 @@ def analyze(
:return: List of phone numbers RecognizerResults
"""
results = []
for entity in entities:
region = entity.replace(ENTITY_TYPE_SUFFIX, "")
if region in SUPPORTED_REGIONS or entity is INTERNATIONAL_ENTITY_TYPE:
for match in phonenumbers.PhoneNumberMatcher(text, region, leniency=0):
international_phone_prefix = match.raw_string.startswith("+")
if entity == INTERNATIONAL_ENTITY_TYPE \
and international_phone_prefix:
results += [self._get_international_recognizer_result(match)]
# phone-numbers matches international numbers twice
elif not international_phone_prefix:
results += [
self._get_regional_recognizer_result(
match, entity, text, nlp_artifacts
)
]

return results

def _get_regional_recognizer_result(self, match, entity, text, nlp_artifacts):
for region in self.supported_regions:
for match in phonenumbers.PhoneNumberMatcher(text, region, leniency=1):
results += [
self._get_recognizer_result(match, text, region, nlp_artifacts)
]

return EntityRecognizer.remove_duplicates(results)

def _get_recognizer_result(self, match, text, region, nlp_artifacts):
number = match.number
main_region_code = COUNTRY_CODE_TO_REGION_CODE.get(number.country_code)[0]
result = RecognizerResult(
entity_type=entity,
entity_type="PHONE_NUMBER",
start=match.start,
end=match.end,
score=self.SCORE,
analysis_explanation=self._get_analysis_explanation(),
analysis_explanation=self._get_analysis_explanation(region),
)

# Enhance confidence using 'phone' related context and region code and name.
region_specific_context = (
self.context
+ [main_region_code]
+ [country_name_for_number(number, self.supported_language)]
)
return self.enhance_using_context(
text, [result], nlp_artifacts, region_specific_context
text,
[result],
nlp_artifacts,
self._get_region_specific_context(number, region),
)[0]

def _get_international_recognizer_result(self, match):
    """Wrap an international-format match in a RecognizerResult.

    :param match: Match object; its ``start`` and ``end`` attributes are
        copied onto the result.
    :return: RecognizerResult with entity type INTERNATIONAL_ENTITY_TYPE,
        a fixed score of 0.6 and this recognizer's analysis explanation.
    """
    span_start = match.start
    span_end = match.end
    explanation = self._get_analysis_explanation()
    return RecognizerResult(
        entity_type=INTERNATIONAL_ENTITY_TYPE,
        start=span_start,
        end=span_end,
        score=0.6,
        analysis_explanation=explanation,
    )
def _get_region_specific_context(self, number, region):
    """Build the context-word list used to enhance a match's score.

    :param number: Parsed phone number of the match; passed to
        ``country_name_for_number`` together with the recognizer language.
    :param region: Region code the match was found under.
    :return: New list holding the base context words, then the lower-cased
        words of the country name, then the lower-cased region code.
    """
    country_name = country_name_for_number(number, self.supported_language)
    # split() with no separator drops empty tokens, so an empty country
    # name (or one with repeated spaces) never adds "" as a context word.
    country_name_in_words = country_name.lower().split()
    # context is declared Optional in __init__; treat None as "no base words"
    # instead of failing on None + list.
    base_context = list(self.context or [])
    return base_context + country_name_in_words + [region.lower()]

def _get_analysis_explanation(self, region=None):
    """Create the AnalysisExplanation attached to a phone-number result.

    :param region: Region code the number was matched under. Optional so
        that callers which do not track a region can still call this with
        no argument and get the generic explanation text.
    :return: AnalysisExplanation carrying the recognizer's class name, the
        base SCORE and a textual explanation.
    """
    if region is None:
        textual_explanation = "Recognized using PhoneRecognizer"
    else:
        textual_explanation = (
            f"Recognized as {region} region phone number, "
            f"using PhoneRecognizer"
        )
    return AnalysisExplanation(
        # self.__class__ is the recognizer class itself; the previous
        # PhoneRecognizer.__class__ resolved to the metaclass, so the
        # reported name was not the recognizer's.
        recognizer=self.__class__.__name__,
        original_score=self.SCORE,
        textual_explanation=textual_explanation,
    )

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
IpRecognizer,
MedicalLicenseRecognizer,
NhsRecognizer,
PhoneRecognizer,
UsBankRecognizer,
UsLicenseRecognizer,
UsItinRecognizer,
UsPassportRecognizer,
UsPhoneRecognizer,
UsSsnRecognizer,
SgFinRecognizer,
SpacyRecognizer,
Expand Down Expand Up @@ -68,7 +68,6 @@ def load_predefined_recognizers(
UsLicenseRecognizer,
UsItinRecognizer,
UsPassportRecognizer,
UsPhoneRecognizer,
UsSsnRecognizer,
NhsRecognizer,
SgFinRecognizer,
Expand All @@ -88,6 +87,7 @@ def load_predefined_recognizers(
IpRecognizer,
MedicalLicenseRecognizer,
nlp_recognizer,
PhoneRecognizer,
],
}
for lang in languages:
Expand Down
4 changes: 2 additions & 2 deletions presidio-analyzer/tests/mocks/recognizer_registry_mock.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.predefined_recognizers import (
CreditCardRecognizer,
UsPhoneRecognizer,
PhoneRecognizer,
DomainRecognizer,
)

Expand All @@ -13,5 +13,5 @@ class RecognizerRegistryMock(RecognizerRegistry):

def load_predefined_recognizers(self, languages=None, nlp_engine=None):
self.recognizers.extend(
[CreditCardRecognizer(), UsPhoneRecognizer(), DomainRecognizer()]
[CreditCardRecognizer(), PhoneRecognizer(), DomainRecognizer()]
)
2 changes: 1 addition & 1 deletion presidio-analyzer/tests/test_analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_when_analyze_with_multiple_predefined_recognizers_then_succeed(
)

assert len(results) == 2
medium_regex_score = 0.5 # see UsPhoneRecognizer.PATTERNS
medium_regex_score = 0.4
context_similarity_factor = 0.35 # PatternRecognizer.CONTEXT_SIMILARITY_FACTOR
assert_result(results[0], "CREDIT_CARD", 14, 33, max_score)
expected_score = medium_regex_score + context_similarity_factor
Expand Down
4 changes: 2 additions & 2 deletions presidio-analyzer/tests/test_context_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from presidio_analyzer.predefined_recognizers import (
AbaRoutingRecognizer,
# CreditCardRecognizer,
UsPhoneRecognizer,
PhoneRecognizer,
# DomainRecognizer,
UsItinRecognizer,
UsLicenseRecognizer,
Expand All @@ -23,7 +23,7 @@ def recognizers():
rec_map = {
"IP_ADDRESS": IpRecognizer(),
"US_SSN": UsSsnRecognizer(),
"PHONE_NUMBER": UsPhoneRecognizer(),
"PHONE_NUMBER": PhoneRecognizer(),
"ABA_ROUTING_NUMBER": AbaRoutingRecognizer(),
"US_ITIN": UsItinRecognizer(),
"US_DRIVER_LICENSE": UsLicenseRecognizer(),
Expand Down
33 changes: 21 additions & 12 deletions presidio-analyzer/tests/test_phone_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,39 @@ def recognizer():
return PhoneRecognizer()


@pytest.fixture(scope="module")
def nlp_engine(nlp_engines):
return nlp_engines["spacy_en"]


@pytest.mark.parametrize(
"text, expected_len, entities, expected_positions, max_score",
"text, expected_len, entities, expected_positions, score",
[
# fmt: off
("My US number is (415) 555-0132, and my international one is +1 415 555 0132",
2, ["INTERNATIONAL_PHONE_NUMBER", "US_PHONE_NUMBER"],
((60, 75), (16, 30),), 0.6),
("My Israeli number is 09-7625400", 0,
["INTERNATIONAL_PHONE_NUMBER", "US_PHONE_NUMBER"], ((60, 75), (16, 30),), 0.6),
("My Israeli number is 09-7625400", 1, ["IL_PHONE_NUMBER"], ((21, 31), ), 0.6),
("My Israeli number is 09-7625400", 2,
PhoneRecognizer().get_supported_entities(), (2 * ()), 0.6),
2, ["PHONE_NUMBER", "PHONE_NUMBER"],
((16, 30), (60, 75),), 0.75),
("My Israeli number is 09-7625400", 1, ["PHONE_NUMBER"], ((21, 31), ), 0.75),
("_: (415)555-0132", 1, ["PHONE_NUMBER"], ((3, 16), ), 0.4),
("United States: (415)555-0132", 1, ["PHONE_NUMBER"], ((15, 28), ), 0.75),
("US: 415-555-0132", 1, ["PHONE_NUMBER"], ((4, 16), ), 0.4), # 'us' stop word
("_: +55 11 98456 5666", 1, ["PHONE_NUMBER"], ((3, 20), ), 0.4),
("Brazil: +55 11 98456 5666", 1, ["PHONE_NUMBER"], ((8, 25), ), 0.75),
("BR: +55 11 98456 5666", 1, ["PHONE_NUMBER"], ((4, 21), ), 0.75),
# fmt: on
],
)
def test_when_all_cryptos_then_succeed(
def test_when_all_phones_then_succeed(
nlp_engine,
text,
expected_len,
entities,
expected_positions,
max_score,
score,
recognizer,
):
results = recognizer.analyze(text, entities)
nlp_artifacts = nlp_engine.process_text(text, "en")
results = recognizer.analyze(text, entities, nlp_artifacts=nlp_artifacts)
assert len(results) == expected_len
for i, (res, (st_pos, fn_pos)) in enumerate(zip(results, expected_positions)):
assert_result(res, entities[i], st_pos, fn_pos, max_score)
assert_result(res, entities[i], st_pos, fn_pos, score)
Loading

0 comments on commit 734807f

Please sign in to comment.