Skip to content

Commit

Permalink
Merge pull request #893 from tatuylonen/formof
Browse files Browse the repository at this point in the history
[en] Retry decoding tags with errors if " and " in tag
  • Loading branch information
kristian-clausal authored Oct 29, 2024
2 parents fcb2cf5 + 33acb40 commit 598a4ee
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 70 deletions.
28 changes: 17 additions & 11 deletions src/wiktextract/extractor/en/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,13 +441,6 @@
r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
)

# Words that can be part of form description
valid_words: set[str] = set(["or", "and"])
for x in valid_tags:
valid_words.update(x.split(" "))
for x in xlat_tags_map.keys():
valid_words.update(x.split(" "))


# Dictionary of language-specific parenthesized head part starts that
# either introduce new tags or modify previous tags. The value for each
Expand Down Expand Up @@ -929,6 +922,7 @@ def check_unknown(
words = wordlst[from_i:to_i]
tag = " ".join(words)
assert tag
# print(f"{tag=}")
if re.match(ignored_unknown_starts_re, tag):
# Tags with this start are to be ignored
return [(from_i, ["UNKNOWN"], [])]
Expand Down Expand Up @@ -1010,11 +1004,14 @@ def decode_tags(
# I hate Python's *nested* list comprehension syntax ^
or any(s.startswith("error-") for s in topics)
):
# slashes_re contains valid key entries with slashes; we're going to
# skip them by splitting the string and skipping handling every
# second entry, which contains the splitting group like "masculine/
# feminine" style keys.
new_tagsets: list[tuple[str, ...]] = []
new_topics: list[str] = []

if "/" in src:
# slashes_re contains valid key entries with slashes; we're going
# to skip them by splitting the string and skipping handling every
# second entry, which contains the splitting group like "masculine/
# feminine" style keys.
split_parts = re.split(slashes_re, src)
new_parts: list[str] = []
if len(split_parts) > 1:
Expand All @@ -1029,7 +1026,16 @@ def decode_tags(
new_tagsets, new_topics = decode_tags1(
new_src, allow_any, no_unknown_starts
)
elif " or " in src or " and " in src:
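# Kludge: strip the " and " / " or " conjunctions and retry decoding the
# remaining words as a plain sequence of tags.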
new_src = src.replace(" and ", " ")
new_src = new_src.replace(" or ", " ")
new_tagsets, new_topics = decode_tags1(
new_src, allow_any, no_unknown_starts
)
# print(f"{new_tagsets=}")

if new_tagsets or new_topics:
old_errors = sum(
1 for tagset in tagsets for s in tagset if s.startswith("error")
)
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -4508,7 +4508,7 @@
"syncopated": "syncope",
"reduplication with syncope": "reduplication syncope",
"introducing subjunctive hortative": "subjunctive hortative",
"nominative and vocative plural animate": "nominative vocative",
"nominative and vocative plural animate": "nominative vocative plural animate",
"with diaeresis to indicate disyllabilicity": "",
"aphaeretic variant": "variant",
"mediopassive voice": "mediopassive",
Expand Down
213 changes: 155 additions & 58 deletions tests/test_en_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@


class EnTagTests(unittest.TestCase):

def test_empty(self):
    # Decoding an empty description is expected to give back a pair made of
    # one empty tagset and an empty topic list.
    expected = ([()], [])
    self.assertEqual(decode_tags(""), expected)
Expand Down Expand Up @@ -67,7 +66,15 @@ def test_tags12(self):

def test_tags13(self):
ret, topics = decode_tags("class 2a stress pattern xyz")
self.assertEqual(ret, [("class-2a", "error-unknown-tag",)])
self.assertEqual(
ret,
[
(
"class-2a",
"error-unknown-tag",
)
],
)

def test_tags14(self):
ret, topics = decode_tags("Cockney rhyming slang")
Expand All @@ -84,46 +91,55 @@ def test_tags16(self):

def test_tags17(self):
ret, topics = decode_tags("colloquial Cockney Test rhyming slang")
self.assertEqual(ret, [("Cockney", "colloquial",
"error-unknown-tag", "slang")])
self.assertEqual(
ret, [("Cockney", "colloquial", "error-unknown-tag", "slang")]
)

def test_tags18(self):
ret, topics = decode_tags("colloquial Cockney Test unknown1 "
"rhyming slang")
self.assertEqual(ret, [("Cockney", "colloquial",
"error-unknown-tag", "slang")])
ret, topics = decode_tags(
"colloquial Cockney Test unknown1 " "rhyming slang"
)
self.assertEqual(
ret, [("Cockney", "colloquial", "error-unknown-tag", "slang")]
)

def test_tags19(self):
ret, topics = decode_tags("colloquial Cockney Test unknown1 "
"rhyming slang",
allow_any=True)
self.assertEqual(ret, [("Cockney", "Test unknown1", "colloquial",
"slang")])
ret, topics = decode_tags(
"colloquial Cockney Test unknown1 " "rhyming slang", allow_any=True
)
self.assertEqual(
ret, [("Cockney", "Test unknown1", "colloquial", "slang")]
)

def test_tags20(self):
ret, topics = decode_tags("colloquial Cockney rhyming slang "
"Test unknown1",
allow_any=True)
self.assertEqual(ret, [("Cockney", "Test unknown1", "colloquial",
"slang")])
ret, topics = decode_tags(
"colloquial Cockney rhyming slang " "Test unknown1", allow_any=True
)
self.assertEqual(
ret, [("Cockney", "Test unknown1", "colloquial", "slang")]
)

def test_tags21(self):
    # Two descriptions joined by "and" are expected to decode into two
    # separate tagsets, with no topics produced.
    tagsets, topic_list = decode_tags("simple past and past participle")
    expected_tagsets = [("participle", "past"), ("past",)]
    # Topics are checked first, as in the original ordering of assertions.
    self.assertEqual(topic_list, [])
    self.assertEqual(tagsets, expected_tagsets)

def test_tags22(self):
ret, topics = decode_tags("colloquial Cockney Test, unknown1; "
"rhyming slang",
allow_any=True)
self.assertEqual(ret, [("Cockney", "Test", "colloquial",
"slang", "unknown1")])
ret, topics = decode_tags(
"colloquial Cockney Test, unknown1; " "rhyming slang",
allow_any=True,
)
self.assertEqual(
ret, [("Cockney", "Test", "colloquial", "slang", "unknown1")]
)

def test_tags23(self):
ret, topics = decode_tags("intransitive, in perfect tenses, "
"without predicate")
self.assertEqual(ret, [("in perfect tenses",
"intransitive",
"without predicate")])
ret, topics = decode_tags(
"intransitive, in perfect tenses, " "without predicate"
)
self.assertEqual(
ret, [("in perfect tenses", "intransitive", "without predicate")]
)

def test_tags24(self):
ret, topics = decode_tags("as a modifier in compound words")
Expand Down Expand Up @@ -187,7 +203,7 @@ def test_tags38(self):

def test_tags39(self):
ret, topics = decode_tags("with inf., obsolescent")
self.assertEqual(ret, [("obsolete", "possibly","with-infinitive")])
self.assertEqual(ret, [("obsolete", "possibly", "with-infinitive")])

def test_tags40(self):
ret, topics = decode_tags("transitive of people")
Expand All @@ -198,46 +214,88 @@ def test_tags41(self):
self.assertEqual(ret, [("error-unknown-tag", "transitive")])

def test_tags42(self):
ret, topics = decode_tags("first/third-person singular present "
"subjunctive")
self.assertEqual(ret, [("first-person", "present",
"singular", "subjunctive", "third-person")])
ret, topics = decode_tags(
"first/third-person singular present " "subjunctive"
)
self.assertEqual(
ret,
[
(
"first-person",
"present",
"singular",
"subjunctive",
"third-person",
)
],
)

def test_tags43(self):
    # "inflection of" is expected to decode to the single tag "form-of".
    tagsets, _topics = decode_tags("inflection of")
    expected_tagsets = [("form-of",)]
    self.assertEqual(tagsets, expected_tagsets)

def test_tags44(self):
ret, topics = decode_tags("third-person singular present indicative")
self.assertEqual(ret, [("indicative", "present", "singular",
"third-person",)])
self.assertEqual(
ret,
[
(
"indicative",
"present",
"singular",
"third-person",
)
],
)

def test_tags45(self):
    # "ordinal form of" is expected to decode to one tagset holding both
    # "form-of" and "ordinal".
    tagsets, _topics = decode_tags("ordinal form of")
    expected_tagsets = [("form-of", "ordinal")]
    self.assertEqual(tagsets, expected_tagsets)

def test_tags46(self):
ret, topics = decode_tags("first-person singular (eu) present "
"subjunctive")
self.assertEqual(ret, [("first-person", "present", "singular",
"subjunctive", "with-eu")])
ret, topics = decode_tags(
"first-person singular (eu) present " "subjunctive"
)
self.assertEqual(
ret,
[("first-person", "present", "singular", "subjunctive", "with-eu")],
)

def test_tags47(self):
ret, topics = decode_tags("third-person singular (él, ella, also "
"used with usted) present subjunctive "
"form of")
self.assertEqual(ret, [("form-of", "present", "singular", "subjunctive",
"third-person",
"with-ella", "with-usted", "with-él")])
ret, topics = decode_tags(
"third-person singular (él, ella, also "
"used with usted) present subjunctive "
"form of"
)
self.assertEqual(
ret,
[
(
"form-of",
"present",
"singular",
"subjunctive",
"third-person",
"with-ella",
"with-usted",
"with-él",
)
],
)

def test_tags48(self):
    # "instant messaging" is expected to decode to the single tag "Internet".
    tagsets, _topics = decode_tags("instant messaging")
    expected_tagsets = [("Internet",)]
    self.assertEqual(tagsets, expected_tagsets)

def test_tags49(self):
ret, topics = decode_tags("plural and definite singular attributive")
self.assertEqual(ret, [("attributive", "definite", "singular"),
("attributive", "plural")])
self.assertEqual(
ret,
[
("attributive", "definite", "singular"),
("attributive", "plural"),
],
)

def test_tags50(self):
ret, topics = decode_tags("alternative spelling of")
Expand All @@ -261,8 +319,16 @@ def test_tags54(self):

def test_tags55(self):
ret, topics = decode_tags("plural and definite singular attributive")
self.assertEqual(ret, [("attributive", "definite", "singular"),
("attributive", "plural",)])
self.assertEqual(
ret,
[
("attributive", "definite", "singular"),
(
"attributive",
"plural",
),
],
)

def test_tags56(self):
ret, topics = decode_tags("comparative")
Expand Down Expand Up @@ -293,10 +359,13 @@ def test_tags62(self):
self.assertEqual(ret, [("definite", "plural", "singular")])

def test_tags63(self):
ret, topics = decode_tags("first-person plural "
"reflexive/dative/accusative form")
self.assertEqual(ret, [("accusative", "dative", "first-person",
"plural", "reflexive")])
ret, topics = decode_tags(
"first-person plural " "reflexive/dative/accusative form"
)
self.assertEqual(
ret,
[("accusative", "dative", "first-person", "plural", "reflexive")],
)

self.assertEqual(topics, [])

Expand All @@ -314,15 +383,43 @@ def test_tags66(self):
# during the first run, except for keys with slashes in them.
ret, topics = decode_tags("nominative/plural masculine/feminine")
# -> "nominative plural masculine/feminine"
self.assertEqual(ret, [("feminine", "masculine",
"nominative", "plural",)])
self.assertEqual(
ret,
[
(
"feminine",
"masculine",
"nominative",
"plural",
)
],
)

def test_topics1(self):
    # Only the topic list is checked here: "nautical" is expected to yield
    # itself followed by "transport".
    _tagsets, topic_list = decode_tags("nautical")
    expected_topics = ["nautical", "transport"]
    self.assertEqual(topic_list, expected_topics)

def test_topics2(self):
ret, topics = decode_tags("ropemaking")
self.assertEqual(topics, ["ropemaking", "crafts",
"nautical", "transport",
"arts", "hobbies", "lifestyle"])
self.assertEqual(
topics,
[
"ropemaking",
"crafts",
"nautical",
"transport",
"arts",
"hobbies",
"lifestyle",
],
)

def test_and(self):
    # Two tags joined by "and" are expected to end up together in a single
    # tagset; the topic list is not checked by this test.
    tagsets, _topics = decode_tags("nominative and accusative")
    expected_tagsets = [("accusative", "nominative")]
    self.assertEqual(tagsets, expected_tagsets)

0 comments on commit 598a4ee

Please sign in to comment.