Merge pull request #194 from opensanctions/pudo/remove-regression-v2

Remove regression-v2 scoring system
opensanctions · Mar 6, 2025 · e9be28d · e9be28d
2 parents 05bd474 + 5447336
commit e9be28d
Show file tree

Hide file tree

Showing 17 changed files with 31 additions and 471 deletions.
diff --git a/Makefile b/Makefile
@@ -8,17 +8,10 @@ typecheck:
 
 check: test typecheck
 
-data/pairs-v2.json:
-	mkdir -p data/
-	curl -o data/pairs-v2.json https://data.opensanctions.org/contrib/training/pairs-v2.json
-
 data/pairs-v1.json:
 	mkdir -p data/
 	curl -o data/pairs-v1.json https://data.opensanctions.org/contrib/training/pairs-v1.json
 
-train-v2: data/pairs-v2.json
-	nomenklatura train-v2-matcher data/pairs-v2.json
-
 train-v1: data/pairs-v1.json
 	nomenklatura train-v1-matcher data/pairs-v1.json
 

diff --git a/nomenklatura/cli.py b/nomenklatura/cli.py
@@ -10,7 +10,7 @@
 from followthemoney.cli.aggregate import sorted_aggregate
 
 from nomenklatura.cache import Cache
-from nomenklatura.matching import train_v2_matcher, train_v1_matcher
+from nomenklatura.matching import train_v1_matcher
 from nomenklatura.store import load_entity_file_store
 from nomenklatura.resolver import Resolver
 from nomenklatura.dataset import Dataset, DefaultDataset
@@ -168,12 +168,6 @@ def train_v1_matcher_(pairs_file: Path) -> None:
     train_v1_matcher(pairs_file)
 
 
-@cli.command("train-v2-matcher", help="Train a matching model from judgement pairs")
-@click.argument("pairs_file", type=InPath)
-def train_v2_matcher_(pairs_file: Path) -> None:
-    train_v2_matcher(pairs_file)
-
-
 @cli.command("match", help="Generate matches from an enrichment source")
 @click.argument("config", type=InPath)
 @click.argument("entities", type=InPath)

diff --git a/nomenklatura/data/regression-v2.pkl b/nomenklatura/data/regression-v2.pkl
diff --git a/nomenklatura/matching/__init__.py b/nomenklatura/matching/__init__.py
@@ -1,8 +1,6 @@
 from typing import List, Type, Optional
 from nomenklatura.matching.regression_v1.model import RegressionV1
 from nomenklatura.matching.regression_v1.train import train_matcher as train_v1_matcher
-from nomenklatura.matching.regression_v2.model import RegressionV2
-from nomenklatura.matching.regression_v2.train import train_matcher as train_v2_matcher
 from nomenklatura.matching.name_based import NameMatcher, NameQualifiedMatcher
 from nomenklatura.matching.logic import LogicV1
 from nomenklatura.matching.types import ScoringAlgorithm
@@ -12,10 +10,9 @@
     NameMatcher,
     NameQualifiedMatcher,
     RegressionV1,
-    RegressionV2,
 ]
 
-DefaultAlgorithm = RegressionV2
+DefaultAlgorithm = RegressionV1
 
 
 def get_algorithm(name: str) -> Optional[Type[ScoringAlgorithm]]:
@@ -29,8 +26,6 @@ def get_algorithm(name: str) -> Optional[Type[ScoringAlgorithm]]:
 __all__ = [
     "RegressionV1",
     "train_v1_matcher",
-    "RegressionV2",
-    "train_v2_matcher",
     "DefaultAlgorithm",
     "ScoringAlgorithm",
     "NameMatcher",

diff --git a/nomenklatura/matching/compare/phonetic.py b/nomenklatura/matching/compare/phonetic.py
@@ -4,12 +4,30 @@
 from followthemoney.types import registry
 from rigour.text.scripts import is_modern_alphabet
 from rigour.text.distance import is_levenshtein_plausible
+from rigour.text.phonetics import metaphone, soundex
 from rigour.names.part import name_parts, NamePart
 from nomenklatura.util import name_words, list_intersection, fingerprint_name
-from nomenklatura.util import metaphone_token, soundex_token
 from nomenklatura.matching.util import type_pair, has_schema
 
 
+def metaphone_token(token: str) -> str:  # metaphone key for one token, or the upper-cased token
+    if token.isalpha() and len(token) > 1:  # only all-letter tokens longer than one character
+        out = metaphone(token)
+        # metaphone() doesn't handle non-ASCII characters; codes under 3 characters are not used
+        if len(out) >= 3:
+            return out
+    return token.upper()  # fallback when no usable code was produced
+
+
+def soundex_token(token: str) -> str:  # soundex key for one token, or the upper-cased token
+    if token.isalpha() and len(token) > 1:  # only all-letter tokens longer than one character
+        out = soundex(token)
+        # soundex() doesn't handle non-ASCII characters; an empty code is not used
+        if len(out):
+            return out
+    return token.upper()  # fallback when no usable code was produced
+
+
 def compare_parts_phonetic(left: NamePart, right: NamePart) -> bool:
     if left.metaphone is None or right.metaphone is None:
         return left.ascii == right.ascii
@@ -50,7 +68,7 @@ def _token_names_compare(
     query_names: List[List[str]], result_names: List[List[str]]
 ) -> float:
     score = 0.0
-    for (q, r) in product(query_names, result_names):
+    for q, r in product(query_names, result_names):
         # length = max(2.0, (len(q) + len(r)) / 2.0)
         length = max(2.0, len(q))
         combo = list_intersection(q, r) / float(length)
@@ -66,7 +84,7 @@ def person_name_phonetic_match(query: E, result: E) -> float:
     query_parts = [name_parts(n) for n in query_names_]
     result_parts = [name_parts(n) for n in result_names_]
     score = 0.0
-    for (q, r) in product(query_parts, result_parts):
+    for q, r in product(query_parts, result_parts):
         if len(q) == 0:
             continue
         matches = list(r)

diff --git a/nomenklatura/matching/name_based/names.py b/nomenklatura/matching/name_based/names.py
@@ -4,7 +4,8 @@
 from rigour.text.distance import jaro_winkler
 
 from nomenklatura.matching.util import type_pair
-from nomenklatura.util import names_word_list, soundex_token
+from nomenklatura.matching.compare.phonetic import soundex_token
+from nomenklatura.util import names_word_list
 
 
 def soundex_name_parts(query: E, result: E) -> float:

diff --git a/nomenklatura/matching/regression_v2/__init__.py b/nomenklatura/matching/regression_v2/__init__.py
diff --git a/nomenklatura/matching/regression_v2/misc.py b/nomenklatura/matching/regression_v2/misc.py
diff --git a/nomenklatura/matching/regression_v2/model.py b/nomenklatura/matching/regression_v2/model.py
diff --git a/nomenklatura/matching/regression_v2/names.py b/nomenklatura/matching/regression_v2/names.py