Merge pull request #976 from tatuylonen/clipping

[en] Adjustment to form_description heuristics
tatuylonen · Jan 7, 2025 · 332cc91 · 332cc91
2 parents ccb3129 + 0091534
commit 332cc91
Showing 2 changed files with 56 additions and 10 deletions.
diff --git a/src/wiktextract/extractor/en/form_descriptions.py b/src/wiktextract/extractor/en/form_descriptions.py
@@ -2447,16 +2447,25 @@ def strokes_repl(m: re.Match) -> str:
                     if (
                         i > 1
                         and len(parts[i - 1]) >= 4
-                        and distw(titleparts, parts[i - 1]) <= 0.4
-                        # Fixes wiktextract #983, where "participle"
-                        # was too close to "Martinize" and so this accepted
-                        # ["participle", "Martinize"] as matching; this
-                        # kludge prevents this from happening if titleparts
-                        # is shorter than what would be 'related'.
-                        # This breaks if we want to detect stuff that
-                        # actually gets an extra space-separated word when
-                        # 'inflected'.
-                        and len(titleparts) >= len(parts[i - 1:])
+                        and (
+                            distw(titleparts, parts[i - 1]) <= 0.4
+                            # NOTE: this comment documents the length check at the
+                            # end of this condition (`len(titleparts) >= ...`).
+                            # It fixes wiktextract #983, where "participle" was too
+                            # close to "Martinize" and ["participle", "Martinize"]
+                            # was accepted as matching; the kludge rejects a match
+                            # when titleparts is shorter than what would be 'related'.
+                            # It breaks if we want to detect forms that actually get
+                            # an extra space-separated word when 'inflected'.
+                            or (
+                                wxr.wtp.section == "English"
+                                and any(
+                                    parts[i - 1].startswith(title)
+                                    for title in titleparts
+                                )
+                            )
+                        )
+                        and len(titleparts) >= len(parts[i - 1 :])
                     ):
                         # print(f"Reached; {parts=}, {parts[i-1]=}")
                         alt_related = related

diff --git a/tests/test_en_head.py b/tests/test_en_head.py
@@ -803,3 +803,40 @@ def test_converted_topic_is_not_form(self):
             )[0]["forms"],
             [{"form": "chuunibyou", "tags": ["plural"]}],
         )
+
+    def test_english_forms_that_are_also_tag_words1(self):
+        # Issue #967
+        # Applies only to English sections.
+        # "clipping" is also a valid tag word...
+        # If the language section is "English" and the checked word
+        # starts with the title ([clip]ping), accept it as a form even if
+        # the distw is high (here, clipping vs. clip -> distw() of 0.5)
+        data = {}
+        self.maxDiff = 10000
+        self.wxr.wtp.start_page("clip")
+        self.wxr.wtp.start_section("English")
+        self.wxr.wtp.start_subsection("verb")
+        parse_word_head(
+            self.wxr,
+            "verb",
+            "clip (third-person singular simple present clips, present participle clipping, simple past and past participle clipped)",
+            data,
+            False,
+            None,
+        )
+        # print(json.dumps(data, indent=2, sort_keys=True))
+        self.assertEqual(
+            data,
+            {
+                "forms": [
+                    {
+                        "form": "clips",
+                        "tags": ["present", "singular", "third-person"],
+                    },
+                    {"form": "clipping", "tags": ["participle", "present"]},
+                    {"form": "clipped", "tags": ["participle", "past"]},
+                    {"form": "clipped", "tags": ["past"]},
+                ],
+            },
+        )
+