Skip to content

Commit 654c90c

Browse files
committed
Put a levenshtein max distance check into the phonetic matcher.
1 parent 723f32b commit 654c90c

File tree

5 files changed

+73
-34
lines changed

5 files changed

+73
-34
lines changed

nomenklatura/matching/compare/names.py

+5-10
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,18 @@
44
from followthemoney.types import registry
55
from fingerprints import clean_name_light, clean_name_ascii
66
from rigour.text.distance import levenshtein_similarity
7-
from rigour.text.distance import dam_levenshtein, jaro_winkler
7+
from rigour.text.distance import jaro_winkler
88
from nomenklatura.util import names_word_list, name_words
99
from nomenklatura.util import fingerprint_name, normalize_name
1010
from nomenklatura.matching.util import type_pair, props_pair, has_schema
1111
from nomenklatura.matching.compare.util import is_disjoint, clean_map, has_overlap
12+
from nomenklatura.matching.compare.util import is_levenshtein_plausible
1213

1314

1415
def _name_parts(name: str) -> List[str]:
1516
return name_words(normalize_name(name))
1617

1718

18-
def _is_levenshtein_plausible(query: str, result: str) -> bool:
19-
# Skip results with an overall distance of more than 3 characters:
20-
max_edits = min(3, (min(len(query), len(result)) // 3))
21-
return dam_levenshtein(query, result) <= max_edits
22-
23-
2419
def _align_name_parts(query: List[str], result: List[str]) -> float:
2520
if len(query) == 0 or len(result) == 0:
2621
return 0.0
@@ -29,7 +24,7 @@ def _align_name_parts(query: List[str], result: List[str]) -> float:
2924
# compute all pairwise scores for name parts:
3025
for qn, rn in product(set(query), set(result)):
3126
score = jaro_winkler(qn, rn)
32-
if score > 0.0 and _is_levenshtein_plausible(qn, rn):
27+
if score > 0.0 and is_levenshtein_plausible(qn, rn):
3328
scores[(qn, rn)] = score
3429
pairs: List[Tuple[str, str]] = []
3530
# original length of query:
@@ -50,7 +45,7 @@ def _align_name_parts(query: List[str], result: List[str]) -> float:
5045
aligned = pairs[::-1]
5146
query_aligned = "".join(p[0] for p in aligned)
5247
result_aligned = "".join(p[1] for p in aligned)
53-
if not _is_levenshtein_plausible(query_aligned, result_aligned):
48+
if not is_levenshtein_plausible(query_aligned, result_aligned):
5449
return 0.0
5550
# return an amped-up jaro-winkler score for the aligned name parts:
5651
return total_score
@@ -68,7 +63,7 @@ def person_name_jaro_winkler(query: E, result: E) -> float:
6863
for (qn, rn) in product(query_names, result_names):
6964
qns = "".join(qn)
7065
rns = "".join(rn)
71-
if _is_levenshtein_plausible(qns, rns):
66+
if is_levenshtein_plausible(qns, rns):
7267
score = max(score, jaro_winkler(qns, rns) ** len(qns))
7368
score = max(score, _align_name_parts(list(qn), list(rn)))
7469
return score

nomenklatura/matching/compare/phonetic.py

+44-16
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,33 @@
33
from followthemoney.proxy import E
44
from followthemoney.types import registry
55
from rigour.text.scripts import is_modern_alphabet
6-
from fingerprints import clean_name_ascii, clean_entity_prefix
6+
from rigour.names.part import name_parts, NamePart
77
from nomenklatura.util import name_words, list_intersection, fingerprint_name
8-
from nomenklatura.util import phonetic_token, metaphone_token, soundex_token
8+
from nomenklatura.util import metaphone_token, soundex_token
99
from nomenklatura.matching.util import type_pair, has_schema
10+
from nomenklatura.matching.compare.util import is_levenshtein_plausible
1011

1112

12-
def _clean_phonetic_person(original: str) -> Optional[str]:
13-
"""Normalize a person name without transliteration."""
14-
if not is_modern_alphabet(original):
15-
return None
16-
text = clean_entity_prefix(original)
17-
return clean_name_ascii(text)
13+
def compare_parts_phonetic(left: NamePart, right: NamePart) -> bool:
14+
if left.metaphone is None or right.metaphone is None:
15+
return left.ascii == right.ascii
16+
if (
17+
left.metaphone == right.metaphone
18+
and left.ascii is not None
19+
and right.ascii is not None
20+
):
21+
# Secondary check for Levenshtein distance:
22+
if is_levenshtein_plausible(left.ascii, right.ascii):
23+
return True
24+
return False
25+
26+
27+
# def _clean_phonetic_person(original: str) -> Optional[str]:
28+
# """Normalize a person name without transliteration."""
29+
# if not is_modern_alphabet(original):
30+
# return None
31+
# text = clean_entity_prefix(original)
32+
# return clean_name_ascii(text)
1833

1934

2035
def _clean_phonetic_entity(original: str) -> Optional[str]:
@@ -24,11 +39,11 @@ def _clean_phonetic_entity(original: str) -> Optional[str]:
2439
return fingerprint_name(original)
2540

2641

27-
def _phonetic_person_tokens(token: str) -> List[str]:
28-
words: List[str] = []
29-
for word in name_words(_clean_phonetic_person(token), min_length=2):
30-
words.append(phonetic_token(word))
31-
return words
42+
# def _phonetic_person_tokens(token: str) -> List[str]:
43+
# words: List[str] = []
44+
# for word in name_words(_clean_phonetic_person(token), min_length=2):
45+
# words.append(phonetic_token(word))
46+
# return words
3247

3348

3449
def _token_names_compare(
@@ -48,9 +63,22 @@ def person_name_phonetic_match(query: E, result: E) -> float:
4863
if not has_schema(query, result, "Person"):
4964
return 0.0
5065
query_names_, result_names_ = type_pair(query, result, registry.name)
51-
query_names = [_phonetic_person_tokens(n) for n in query_names_]
52-
result_names = [_phonetic_person_tokens(n) for n in result_names_]
53-
return _token_names_compare(query_names, result_names)
66+
query_parts = [name_parts(n) for n in query_names_]
67+
result_parts = [name_parts(n) for n in result_names_]
68+
score = 0.0
69+
for (q, r) in product(query_parts, result_parts):
70+
if len(q) == 0:
71+
continue
72+
matches = list(r)
73+
matched = 0
74+
for part in q:
75+
for other in matches:
76+
if compare_parts_phonetic(part, other):
77+
matches.remove(other)
78+
matched += 1
79+
break
80+
score = max(score, matched / float(len(q)))
81+
return score
5482

5583

5684
def _metaphone_tokens(token: str) -> List[str]:

nomenklatura/matching/compare/util.py

+10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import re
22
from typing import List, Set, Union, Iterable, Callable, Optional
3+
from rigour.text.distance import dam_levenshtein
34

45
CleanFunc = Optional[Callable[[str], Optional[str]]]
56
FIND_NUM = re.compile(r"\d{1,}")
@@ -49,3 +50,12 @@ def extract_numbers(values: List[str]) -> Set[str]:
4950
for value in values:
5051
numbers.update(FIND_NUM.findall(value))
5152
return numbers
53+
54+
55+
def is_levenshtein_plausible(query: str, result: str) -> bool:
56+
"""A sanity check to post-filter name matching results based on a budget
57+
of allowed Levenshtein distance. This basically cuts off results where
58+
the Jaro-Winkler or Metaphone comparison was too lenient."""
59+
# Skip results with an overall distance of more than 3 characters:
60+
max_edits = min(3, (min(len(query), len(result)) // 3))
61+
return dam_levenshtein(query, result) <= max_edits

nomenklatura/util.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
from datetime import datetime, timezone
55
from followthemoney import model
66
from functools import lru_cache, cache
7-
from jellyfish import metaphone, soundex
87
from normality import collapse_spaces, category_replace
98
from normality.constants import WS
9+
from rigour.text import metaphone, soundex
1010
from collections.abc import Mapping, Sequence
1111
from fingerprints.cleanup import clean_name_ascii, clean_entity_prefix
1212
from fingerprints.cleanup import CHARACTERS_REMOVE_RE
@@ -150,7 +150,6 @@ def phonetic_token(token: str) -> str:
150150
return metaphone_token(token)
151151

152152

153-
@lru_cache(maxsize=1024)
154153
def metaphone_token(token: str) -> str:
155154
if token.isalpha() and len(token) > 1:
156155
out = metaphone(token)
@@ -160,7 +159,6 @@ def metaphone_token(token: str) -> str:
160159
return token.upper()
161160

162161

163-
@lru_cache(maxsize=1024)
164162
def soundex_token(token: str) -> str:
165163
if token.isalpha() and len(token) > 1:
166164
out = soundex(token)

tests/matching/test_names.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -82,14 +82,14 @@ def test_duplicative_name_similarity():
8282
def test_single_name():
8383
name = e("Person", name="Hannibal")
8484
other = e("Person", name="Hannibal")
85-
assert person_name_phonetic_match(name, other) == 0.5
85+
assert person_name_phonetic_match(name, other) == 1.0
8686
assert person_name_jaro_winkler(name, other) == 1.0
8787

8888
other = e("Person", name="Hanniball")
89-
assert person_name_phonetic_match(name, other) == 0.5
89+
assert person_name_phonetic_match(name, other) == 1.0
9090

9191
other = e("Person", name="Hannibol")
92-
assert person_name_phonetic_match(name, other) == 0.5
92+
assert person_name_phonetic_match(name, other) == 1.0
9393
assert person_name_jaro_winkler(name, other) > 0.8
9494
assert person_name_jaro_winkler(name, other) < 1.0
9595

@@ -144,6 +144,14 @@ def test_person_name_phonetic_match():
144144
result = e("Person", name="Фуад Гулієв")
145145
assert person_name_phonetic_match(query, result) < 1.0
146146

147+
query = e("Person", name="Olga Barynova")
148+
result = e("Person", name="Oleg BARANOV")
149+
assert person_name_phonetic_match(query, result) < 0.6
150+
151+
query = e("Person", name="Ginta Boreza")
152+
result = e("Person", name="Janett Borez")
153+
assert person_name_phonetic_match(query, result) < 0.6
154+
147155
query = e("Person", name="Shaikh Isa Bin Tarif Al Bin Ali")
148156
result = e("Person", name="Shaikh Isa Bin Tarif Al Bin Ali")
149157
assert person_name_phonetic_match(query, result) == 1.0
@@ -264,8 +272,8 @@ def test_jaro_lindemann():
264272
def test_name_alphabets():
265273
query = e("Person", name="Ротенберг Аркадий")
266274
result = e("Person", name="Arkadiii Romanovich Rotenberg")
267-
assert person_name_phonetic_match(query, result) > 0.0
268-
assert person_name_phonetic_match(query, result) < 0.7
275+
# assert person_name_phonetic_match(query, result) > 0.0
276+
assert person_name_phonetic_match(query, result) > 0.7
269277
assert person_name_jaro_winkler(query, result) > 0.7
270278

271279
query = e("Person", name="Osama bin Laden")

0 commit comments

Comments (0)