From ab7cc502b7c15479cf19b3da9f747351bca45c74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 7 Jan 2025 10:43:56 +0200 Subject: [PATCH] [en] Adjustment to form_description heuristics "clipping" was skipped because it had too high distw() with "clip" (0.5), so I added yet another kludge for when the language section is English: if something starts with the title, accept it in this case. This will cause problems elsewhere, but we'll hunt those down... --- .../extractor/en/form_descriptions.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/wiktextract/extractor/en/form_descriptions.py b/src/wiktextract/extractor/en/form_descriptions.py index b31cf10a5..ebda44a25 100644 --- a/src/wiktextract/extractor/en/form_descriptions.py +++ b/src/wiktextract/extractor/en/form_descriptions.py @@ -2447,16 +2447,25 @@ def strokes_repl(m: re.Match) -> str: if ( i > 1 and len(parts[i - 1]) >= 4 - and distw(titleparts, parts[i - 1]) <= 0.4 - # Fixes wiktextract #983, where "participle" - # was too close to "Martinize" and so this accepted - # ["participle", "Martinize"] as matching; this - # kludge prevents this from happening if titleparts - # is shorter than what would be 'related'. - # This breaks if we want to detect stuff that - # actually gets an extra space-separated word when - # 'inflected'. - and len(titleparts) >= len(parts[i - 1:]) + and ( + distw(titleparts, parts[i - 1]) <= 0.4 + # Fixes wiktextract #983, where "participle" + # was too close to "Martinize" and so this accepted + # ["participle", "Martinize"] as matching; this + # kludge prevents this from happening if titleparts + # is shorter than what would be 'related'. + # This breaks if we want to detect stuff that + # actually gets an extra space-separated word when + # 'inflected'. + or ( + wxr.wtp.section == "English" + and any( + parts[i - 1].startswith(title) + for title in titleparts + ) + ) + ) + and len(titleparts) >= len(parts[i - 1 :]) ): # print(f"Reached; {parts=}, {parts[i-1]=}") alt_related = related