3
3
from followthemoney .proxy import E
4
4
from followthemoney .types import registry
5
5
from rigour .text .scripts import is_modern_alphabet
6
- from fingerprints import clean_name_ascii , clean_entity_prefix
6
+ from rigour . names . part import name_parts , NamePart
7
7
from nomenklatura .util import name_words , list_intersection , fingerprint_name
8
- from nomenklatura .util import phonetic_token , metaphone_token , soundex_token
8
+ from nomenklatura .util import metaphone_token , soundex_token
9
9
from nomenklatura .matching .util import type_pair , has_schema
10
+ from nomenklatura .matching .compare .util import is_levenshtein_plausible
10
11
11
12
12
- def _clean_phonetic_person (original : str ) -> Optional [str ]:
13
- """Normalize a person name without transliteration."""
14
- if not is_modern_alphabet (original ):
15
- return None
16
- text = clean_entity_prefix (original )
17
- return clean_name_ascii (text )
13
def compare_parts_phonetic(left: NamePart, right: NamePart) -> bool:
    """Decide whether two name parts sound alike.

    When either part lacks a metaphone key, fall back to strict equality of
    the ASCII forms. Otherwise require matching metaphone keys AND an
    edit-distance sanity check, since metaphone collapses many spellings
    onto the same key.
    """
    # No phonetic key available on one side: only exact ASCII equality counts.
    if left.metaphone is None or right.metaphone is None:
        return left.ascii == right.ascii
    # Different phonetic keys can never match.
    if left.metaphone != right.metaphone:
        return False
    # Both ASCII forms must be present to run the secondary check.
    if left.ascii is None or right.ascii is None:
        return False
    # Secondary guard: the ASCII spellings must also be plausibly close
    # in Levenshtein distance, not merely phonetic collisions.
    return is_levenshtein_plausible(left.ascii, right.ascii)
25
+
26
+
27
+ # def _clean_phonetic_person(original: str) -> Optional[str]:
28
+ # """Normalize a person name without transliteration."""
29
+ # if not is_modern_alphabet(original):
30
+ # return None
31
+ # text = clean_entity_prefix(original)
32
+ # return clean_name_ascii(text)
18
33
19
34
20
35
def _clean_phonetic_entity (original : str ) -> Optional [str ]:
@@ -24,11 +39,11 @@ def _clean_phonetic_entity(original: str) -> Optional[str]:
24
39
return fingerprint_name (original )
25
40
26
41
27
- def _phonetic_person_tokens (token : str ) -> List [str ]:
28
- words : List [str ] = []
29
- for word in name_words (_clean_phonetic_person (token ), min_length = 2 ):
30
- words .append (phonetic_token (word ))
31
- return words
42
+ # def _phonetic_person_tokens(token: str) -> List[str]:
43
+ # words: List[str] = []
44
+ # for word in name_words(_clean_phonetic_person(token), min_length=2):
45
+ # words.append(phonetic_token(word))
46
+ # return words
32
47
33
48
34
49
def _token_names_compare (
def person_name_phonetic_match(query: E, result: E) -> float:
    """Score how well two Person entities' names match phonetically.

    Each name is split into parts; for every (query name, result name) pair
    the fraction of query parts that find a phonetically matching result
    part (each result part usable at most once) is computed, and the best
    fraction across all pairs is returned. Returns 0.0 unless both entities
    can be a Person.
    """
    if not has_schema(query, result, "Person"):
        return 0.0
    query_names_, result_names_ = type_pair(query, result, registry.name)
    query_parts = [name_parts(n) for n in query_names_]
    result_parts = [name_parts(n) for n in result_names_]
    best = 0.0
    for query_name, result_name in product(query_parts, result_parts):
        # A name with no parts cannot contribute a score.
        if not query_name:
            continue
        # Consume result parts greedily so each matches at most one query part.
        remaining = list(result_name)
        hits = 0
        for part in query_name:
            found = next(
                (idx for idx, cand in enumerate(remaining)
                 if compare_parts_phonetic(part, cand)),
                None,
            )
            if found is not None:
                remaining.pop(found)
                hits += 1
        best = max(best, hits / float(len(query_name)))
    return best
54
82
55
83
56
84
def _metaphone_tokens (token : str ) -> List [str ]:
0 commit comments