diff --git a/src/wiktextract/pronunciations.py b/src/wiktextract/pronunciations.py index 324038a9..3e189117 100644 --- a/src/wiktextract/pronunciations.py +++ b/src/wiktextract/pronunciations.py @@ -424,7 +424,7 @@ def split_cleaned_node_on_newlines(contents): active_pos = None for text, ipa_text in split_cleaned_node_on_newlines(contents): - # print(text, ipa_text) + # print(f"{text=}, {ipa_text=}") prefix = None if not text: continue @@ -582,72 +582,72 @@ def split_cleaned_node_on_newlines(contents): # XXX what about {{hyphenation|...}}, {{hyph|...}} # and those used to be stored under "hyphenation" - # Add data that was collected in template_fn - if audios: - for audio in audios: - if "audio" in audio: - # Compute audio file URLs - fn = audio["audio"] - # Strip certain characters, e.g., left-to-right mark - fn = re.sub(r"[\u200f\u200e]", "", fn) - fn = fn.strip() - fn = urllib.parse.unquote(fn) - # First character is usually uppercased - if re.match(r"^[a-z][a-z]+", fn): - fn = fn[0].upper() + fn[1:] - if fn in wxr.config.redirects: - fn = wxr.config.redirects[fn] - # File extension is lowercased - # XXX some words seem to need this, some don't seem to - # have this??? what is the exact rule? - # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) - # Spaces are converted to underscores - fn = re.sub(r"\s+", "_", fn) - # Compute hash digest part - h = hashlib.md5() - hname = fn.encode("utf-8") - h.update(hname) - digest = h.hexdigest() - # Quote filename for URL - qfn = urllib.parse.quote(fn) - # For safety when writing files - qfn = qfn.replace("/", "__slash__") - if re.search(r"(?i)\.(ogg|oga)$", fn): - ogg = ("https://upload.wikimedia.org/wikipedia/" - "commons/{}/{}/{}" - .format(digest[:1], digest[:2], qfn)) - else: - ogg = ("https://upload.wikimedia.org/wikipedia/" - "commons/transcoded/" - "{}/{}/{}/{}.ogg" - .format(digest[:1], digest[:2], qfn, qfn)) - if re.search(r"(?i)\.(mp3)$", fn): - mp3 = ("https://upload.wikimedia.org/wikipedia/" - "commons/{}/{}/{}" - .format(digest[:1], digest[:2], qfn)) - else: - mp3 = ("https://upload.wikimedia.org/wikipedia/" - "commons/transcoded/" - "{}/{}/{}/{}.mp3" - .format(digest[:1], digest[:2], qfn, qfn)) - audio["ogg_url"] = ogg - audio["mp3_url"] = mp3 - if active_pos: audio["pos"] = active_pos - if audio not in data.get("sounds", ()): - data_append(data, "sounds", audio) - # have_pronunciations = True - audios =[] - for enpr in enprs: - if re.match(r"/[^/]+/$", enpr): - enpr = enpr[1: -1] - pron = {"enpr": enpr} - parse_pronunciation_tags(wxr, tagstext, pron) - if active_pos: - pron["pos"] = active_pos - if pron not in data.get("sounds", ()): - data_append(data, "sounds", pron) - # have_pronunciations = True - enprs = [] + # Add data that was collected in template_fn + for audio in audios: + if "audio" in audio: + # Compute audio file URLs + fn = audio["audio"] + # Strip certain characters, e.g., left-to-right mark + fn = re.sub(r"[\u200f\u200e]", "", fn) + fn = fn.strip() + fn = urllib.parse.unquote(fn) + # First character is usually uppercased + if re.match(r"^[a-z][a-z]+", fn): + fn = fn[0].upper() + fn[1:] + if fn in wxr.config.redirects: + fn = wxr.config.redirects[fn] + # File extension is lowercased + # XXX some words seem to need this, some don't seem to + # have this??? what is the exact rule? + # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) + # Spaces are converted to underscores + fn = re.sub(r"\s+", "_", fn) + # Compute hash digest part + h = hashlib.md5() + hname = fn.encode("utf-8") + h.update(hname) + digest = h.hexdigest() + # Quote filename for URL + qfn = urllib.parse.quote(fn) + # For safety when writing files + qfn = qfn.replace("/", "__slash__") + if re.search(r"(?i)\.(ogg|oga)$", fn): + ogg = ("https://upload.wikimedia.org/wikipedia/" + "commons/{}/{}/{}" + .format(digest[:1], digest[:2], qfn)) + else: + ogg = ("https://upload.wikimedia.org/wikipedia/" + "commons/transcoded/" + "{}/{}/{}/{}.ogg" + .format(digest[:1], digest[:2], qfn, qfn)) + if re.search(r"(?i)\.(mp3)$", fn): + mp3 = ("https://upload.wikimedia.org/wikipedia/" + "commons/{}/{}/{}" + .format(digest[:1], digest[:2], qfn)) + else: + mp3 = ("https://upload.wikimedia.org/wikipedia/" + "commons/transcoded/" + "{}/{}/{}/{}.mp3" + .format(digest[:1], digest[:2], qfn, qfn)) + audio["ogg_url"] = ogg + audio["mp3_url"] = mp3 + if active_pos: audio["pos"] = active_pos + if audio not in data.get("sounds", ()): + data_append(data, "sounds", audio) + # if audios: + # have_pronunciations = True + audios =[] + for enpr in enprs: + if re.match(r"/[^/]+/$", enpr): + enpr = enpr[1: -1] + pron = {"enpr": enpr} + parse_pronunciation_tags(wxr, tagstext, pron) + if active_pos: + pron["pos"] = active_pos + if pron not in data.get("sounds", ()): + data_append(data, "sounds", pron) + # have_pronunciations = True + enprs = [] ## I have commented out the otherwise unused have_pronunciation ## toggles; uncomment them to use this debug print