Skip to content

Combine genders and plurals into comma-separated strings for noun que… #564

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 2, 2025
45 changes: 33 additions & 12 deletions src/scribe_data/wikidata/format_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,41 @@ def format_data(

for data_vals in data_list:
lexeme_id = data_vals["lexemeID"]
modified_date = data_vals["lastModified"]

# Initialize a new entry if this lexeme hasn't been seen yet.
if lexeme_id not in data_formatted:
data_formatted[lexeme_id] = {}

# Reverse to make sure that we're getting the same order as the query.
query_identifiers = list(reversed(data_vals.keys()))
query_identifiers.remove("lexemeID")
query_identifiers.remove("lastModified")

for k in query_identifiers:
data_formatted[lexeme_id][k] = data_vals[k]
data_formatted[lexeme_id]["lastModified"] = modified_date

data_formatted[lexeme_id] = {
key: value
for key, value in data_vals.items()
if key not in ["lexemeID", "lastModified"]
}

data_formatted[lexeme_id]["lastModified"] = data_vals["lastModified"]

else:
# Merge fields for an existing lexeme.
for field, value in data_vals.items():
if field in ["lexemeID", "lastModified"]:
continue

if value: # Only process non-empty values.
if (
field in data_formatted[lexeme_id]
and data_formatted[lexeme_id][field]
):
# Merge field values into a comma-separated string using a set for uniqueness.
existing_values = set(
data_formatted[lexeme_id][field].split(", ")
)
existing_values.add(value)
data_formatted[lexeme_id][field] = ", ".join(
sorted(existing_values)
)

else:
data_formatted[lexeme_id][field] = value

# Sort the entries by lexeme ID and store them in an ordered dictionary for consistent output.
data_formatted = collections.OrderedDict(sorted(data_formatted.items()))

export_formatted_data(
Expand Down
Loading