Skip to content

Commit

Permalink
Merge pull request #976 from tatuylonen/clipping
Browse files Browse the repository at this point in the history
[en] Adjustment to form_description heuristics
  • Loading branch information
kristian-clausal authored Jan 7, 2025
2 parents ccb3129 + 0091534 commit 332cc91
Showing 2 changed files with 56 additions and 10 deletions.
29 changes: 19 additions & 10 deletions src/wiktextract/extractor/en/form_descriptions.py
Original file line number Diff line number Diff line change
@@ -2447,16 +2447,25 @@ def strokes_repl(m: re.Match) -> str:
if (
i > 1
and len(parts[i - 1]) >= 4
and distw(titleparts, parts[i - 1]) <= 0.4
# Fixes wiktextract #983, where "participle"
# was too close to "Martinize" and so this accepted
# ["participle", "Martinize"] as matching; this
# kludge prevents this from happening if titleparts
# is shorter than what would be 'related'.
# This breaks if we want to detect stuff that
# actually gets an extra space-separated word when
# 'inflected'.
and len(titleparts) >= len(parts[i - 1:])
and (
distw(titleparts, parts[i - 1]) <= 0.4
# Fixes wiktextract #983, where "participle"
# was too close to "Martinize" and so this accepted
# ["participle", "Martinize"] as matching; this
# kludge prevents this from happening if titleparts
# is shorter than what would be 'related'.
# This breaks if we want to detect stuff that
# actually gets an extra space-separated word when
# 'inflected'.
or (
wxr.wtp.section == "English"
and any(
parts[i - 1].startswith(title)
for title in titleparts
)
)
)
and len(titleparts) >= len(parts[i - 1 :])
):
# print(f"Reached; {parts=}, {parts[i-1]=}")
alt_related = related
37 changes: 37 additions & 0 deletions tests/test_en_head.py
Original file line number Diff line number Diff line change
@@ -803,3 +803,40 @@ def test_converted_topic_is_not_form(self):
)[0]["forms"],
[{"form": "chuunibyou", "tags": ["plural"]}],
)

def test_english_forms_that_are_also_tag_words1(self):
    """Check that "clipping" is kept as a form of the English verb "clip".

    Regression test for issue #967. "clipping" is itself a valid tag
    word, and distw() between "clipping" and "clip" is 0.5, so the
    distance heuristic alone does not treat it as related to the title.
    For the "English" language section only, a candidate word that
    starts with the page title ([clip]ping) is accepted as a form
    even though its distw() is high.
    """
    self.maxDiff = 10000

    # Set up the parsing context: page "clip", English section, verb.
    wtp = self.wxr.wtp
    wtp.start_page("clip")
    wtp.start_section("English")
    wtp.start_subsection("verb")

    head_text = "clip (third-person singular simple present clips, present participle clipping, simple past and past participle clipped)"
    expected = {
        "forms": [
            {
                "form": "clips",
                "tags": ["present", "singular", "third-person"],
            },
            {"form": "clipping", "tags": ["participle", "present"]},
            {"form": "clipped", "tags": ["participle", "past"]},
            {"form": "clipped", "tags": ["past"]},
        ],
    }

    # parse_word_head fills `parsed` in place.
    parsed = {}
    parse_word_head(self.wxr, "verb", head_text, parsed, False, None)

    self.assertEqual(parsed, expected)

0 comments on commit 332cc91

Please sign in to comment.