Skip to content

Commit

Permalink
Merge pull request #194 from opensanctions/pudo/remove-regression-v2
Browse files Browse the repository at this point in the history
Remove regression-v2 scoring system
  • Loading branch information
pudo authored Mar 6, 2025
2 parents 05bd474 + 5447336 commit e9be28d
Show file tree
Hide file tree
Showing 17 changed files with 31 additions and 471 deletions.
7 changes: 0 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,10 @@ typecheck:

check: test typecheck

data/pairs-v2.json:
mkdir -p data/
curl -o data/pairs-v2.json https://data.opensanctions.org/contrib/training/pairs-v2.json

data/pairs-v1.json:
mkdir -p data/
curl -o data/pairs-v1.json https://data.opensanctions.org/contrib/training/pairs-v1.json

train-v2: data/pairs-v2.json
nomenklatura train-v2-matcher data/pairs-v2.json

train-v1: data/pairs-v1.json
nomenklatura train-v1-matcher data/pairs-v1.json

Expand Down
8 changes: 1 addition & 7 deletions nomenklatura/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from followthemoney.cli.aggregate import sorted_aggregate

from nomenklatura.cache import Cache
from nomenklatura.matching import train_v2_matcher, train_v1_matcher
from nomenklatura.matching import train_v1_matcher
from nomenklatura.store import load_entity_file_store
from nomenklatura.resolver import Resolver
from nomenklatura.dataset import Dataset, DefaultDataset
Expand Down Expand Up @@ -168,12 +168,6 @@ def train_v1_matcher_(pairs_file: Path) -> None:
train_v1_matcher(pairs_file)


@cli.command("train-v2-matcher", help="Train a matching model from judgement pairs")
@click.argument("pairs_file", type=InPath)
def train_v2_matcher_(pairs_file: Path) -> None:
train_v2_matcher(pairs_file)


@cli.command("match", help="Generate matches from an enrichment source")
@click.argument("config", type=InPath)
@click.argument("entities", type=InPath)
Expand Down
Binary file removed nomenklatura/data/regression-v2.pkl
Binary file not shown.
7 changes: 1 addition & 6 deletions nomenklatura/matching/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from typing import List, Type, Optional
from nomenklatura.matching.regression_v1.model import RegressionV1
from nomenklatura.matching.regression_v1.train import train_matcher as train_v1_matcher
from nomenklatura.matching.regression_v2.model import RegressionV2
from nomenklatura.matching.regression_v2.train import train_matcher as train_v2_matcher
from nomenklatura.matching.name_based import NameMatcher, NameQualifiedMatcher
from nomenklatura.matching.logic import LogicV1
from nomenklatura.matching.types import ScoringAlgorithm
Expand All @@ -12,10 +10,9 @@
NameMatcher,
NameQualifiedMatcher,
RegressionV1,
RegressionV2,
]

DefaultAlgorithm = RegressionV2
DefaultAlgorithm = RegressionV1


def get_algorithm(name: str) -> Optional[Type[ScoringAlgorithm]]:
Expand All @@ -29,8 +26,6 @@ def get_algorithm(name: str) -> Optional[Type[ScoringAlgorithm]]:
__all__ = [
"RegressionV1",
"train_v1_matcher",
"RegressionV2",
"train_v2_matcher",
"DefaultAlgorithm",
"ScoringAlgorithm",
"NameMatcher",
Expand Down
24 changes: 21 additions & 3 deletions nomenklatura/matching/compare/phonetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,30 @@
from followthemoney.types import registry
from rigour.text.scripts import is_modern_alphabet
from rigour.text.distance import is_levenshtein_plausible
from rigour.text.phonetics import metaphone, soundex
from rigour.names.part import name_parts, NamePart
from nomenklatura.util import name_words, list_intersection, fingerprint_name
from nomenklatura.util import metaphone_token, soundex_token
from nomenklatura.matching.util import type_pair, has_schema


def metaphone_token(token: str) -> str:
    """Return a phonetic (metaphone) key for a single name token.

    Tokens that are not purely alphabetic, or that are at most one
    character long, are not encoded and come back upper-cased. The same
    upper-cased fallback is used when the metaphone result is shorter
    than three characters.
    """
    fallback = token.upper()
    if len(token) <= 1 or not token.isalpha():
        return fallback
    encoded = metaphone(token)
    # metaphone doesn't handle non-ascii characters, so a too-short result
    # is discarded in favour of the plain upper-cased token.
    return encoded if len(encoded) >= 3 else fallback


def soundex_token(token: str) -> str:
    """Return a phonetic (soundex) key for a single name token.

    Only alphabetic tokens longer than one character are encoded. Any
    other token, or a token for which soundex produces an empty result,
    is returned upper-cased instead.
    """
    encodable = len(token) > 1 and token.isalpha()
    # soundex doesn't handle non-ascii characters; an empty code means the
    # token could not be encoded and the upper-cased token is used instead.
    code = soundex(token) if encodable else ""
    return code or token.upper()


def compare_parts_phonetic(left: NamePart, right: NamePart) -> bool:
if left.metaphone is None or right.metaphone is None:
return left.ascii == right.ascii
Expand Down Expand Up @@ -50,7 +68,7 @@ def _token_names_compare(
query_names: List[List[str]], result_names: List[List[str]]
) -> float:
score = 0.0
for (q, r) in product(query_names, result_names):
for q, r in product(query_names, result_names):
# length = max(2.0, (len(q) + len(r)) / 2.0)
length = max(2.0, len(q))
combo = list_intersection(q, r) / float(length)
Expand All @@ -66,7 +84,7 @@ def person_name_phonetic_match(query: E, result: E) -> float:
query_parts = [name_parts(n) for n in query_names_]
result_parts = [name_parts(n) for n in result_names_]
score = 0.0
for (q, r) in product(query_parts, result_parts):
for q, r in product(query_parts, result_parts):
if len(q) == 0:
continue
matches = list(r)
Expand Down
3 changes: 2 additions & 1 deletion nomenklatura/matching/name_based/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from rigour.text.distance import jaro_winkler

from nomenklatura.matching.util import type_pair
from nomenklatura.util import names_word_list, soundex_token
from nomenklatura.matching.compare.phonetic import soundex_token
from nomenklatura.util import names_word_list


def soundex_name_parts(query: E, result: E) -> float:
Expand Down
Empty file.
44 changes: 0 additions & 44 deletions nomenklatura/matching/regression_v2/misc.py

This file was deleted.

105 changes: 0 additions & 105 deletions nomenklatura/matching/regression_v2/model.py

This file was deleted.

83 changes: 0 additions & 83 deletions nomenklatura/matching/regression_v2/names.py

This file was deleted.

Loading

0 comments on commit e9be28d

Please sign in to comment.