Merge pull request #413 from tatuylonen/audios-index-out-of-bounds

Move the audio and enpr post-processing, which didn't belong inside the for loop, out of it
tatuylonen · Dec 5, 2023 · fee414a · fee414a
2 parents c42a520 + 059895e
commit fee414a
Showing 1 changed file with 67 additions and 67 deletions.
diff --git a/src/wiktextract/pronunciations.py b/src/wiktextract/pronunciations.py
@@ -424,7 +424,7 @@ def split_cleaned_node_on_newlines(contents):
     active_pos = None
 
     for text, ipa_text in split_cleaned_node_on_newlines(contents):
-        # print(text, ipa_text)
+        # print(f"{text=}, {ipa_text=}")
         prefix = None
         if not text:
             continue
@@ -582,72 +582,72 @@ def split_cleaned_node_on_newlines(contents):
         # XXX what about {{hyphenation|...}}, {{hyph|...}}
         # and those used to be stored under "hyphenation"
 
-        # Add data that was collected in template_fn
-        if audios:
-            for audio in audios:
-                if "audio" in audio:
-                    # Compute audio file URLs
-                    fn = audio["audio"]
-                    # Strip certain characters, e.g., left-to-right mark
-                    fn = re.sub(r"[\u200f\u200e]", "", fn)
-                    fn = fn.strip()
-                    fn = urllib.parse.unquote(fn)
-                    # First character is usually uppercased
-                    if re.match(r"^[a-z][a-z]+", fn):
-                        fn = fn[0].upper() + fn[1:]
-                    if fn in wxr.config.redirects:
-                        fn = wxr.config.redirects[fn]
-                    # File extension is lowercased
-                    # XXX some words seem to need this, some don't seem to
-                    # have this??? what is the exact rule?
-                    # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
-                    # Spaces are converted to underscores
-                    fn = re.sub(r"\s+", "_", fn)
-                    # Compute hash digest part
-                    h = hashlib.md5()
-                    hname = fn.encode("utf-8")
-                    h.update(hname)
-                    digest = h.hexdigest()
-                    # Quote filename for URL
-                    qfn = urllib.parse.quote(fn)
-                    # For safety when writing files
-                    qfn = qfn.replace("/", "__slash__")
-                    if re.search(r"(?i)\.(ogg|oga)$", fn):
-                        ogg = ("https://upload.wikimedia.org/wikipedia/"
-                               "commons/{}/{}/{}"
-                               .format(digest[:1], digest[:2], qfn))
-                    else:
-                        ogg = ("https://upload.wikimedia.org/wikipedia/"
-                               "commons/transcoded/"
-                               "{}/{}/{}/{}.ogg"
-                               .format(digest[:1], digest[:2], qfn, qfn))
-                    if re.search(r"(?i)\.(mp3)$", fn):
-                        mp3 = ("https://upload.wikimedia.org/wikipedia/"
-                               "commons/{}/{}/{}"
-                               .format(digest[:1], digest[:2], qfn))
-                    else:
-                        mp3 = ("https://upload.wikimedia.org/wikipedia/"
-                               "commons/transcoded/"
-                               "{}/{}/{}/{}.mp3"
-                               .format(digest[:1], digest[:2], qfn, qfn))
-                    audio["ogg_url"] = ogg
-                    audio["mp3_url"] = mp3
-                    if active_pos: audio["pos"] = active_pos
-                if audio not in data.get("sounds", ()):
-                    data_append(data, "sounds", audio)
-            # have_pronunciations = True
-        audios =[]
-        for enpr in enprs:
-            if re.match(r"/[^/]+/$", enpr):
-                enpr = enpr[1: -1]
-            pron = {"enpr": enpr}
-            parse_pronunciation_tags(wxr, tagstext, pron)
-            if active_pos:
-                pron["pos"] = active_pos
-            if pron not in data.get("sounds", ()):
-                data_append(data, "sounds", pron)
-            # have_pronunciations = True
-        enprs = []
+    # Add data that was collected in template_fn
+    for audio in audios:
+        if "audio" in audio:
+            # Compute audio file URLs
+            fn = audio["audio"]
+            # Strip certain characters, e.g., left-to-right mark
+            fn = re.sub(r"[\u200f\u200e]", "", fn)
+            fn = fn.strip()
+            fn = urllib.parse.unquote(fn)
+            # First character is usually uppercased
+            if re.match(r"^[a-z][a-z]+", fn):
+                fn = fn[0].upper() + fn[1:]
+            if fn in wxr.config.redirects:
+                fn = wxr.config.redirects[fn]
+            # File extension is lowercased
+            # XXX some words seem to need this, some don't seem to
+            # have this??? what is the exact rule?
+            # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
+            # Spaces are converted to underscores
+            fn = re.sub(r"\s+", "_", fn)
+            # Compute hash digest part
+            h = hashlib.md5()
+            hname = fn.encode("utf-8")
+            h.update(hname)
+            digest = h.hexdigest()
+            # Quote filename for URL
+            qfn = urllib.parse.quote(fn)
+            # For safety when writing files
+            qfn = qfn.replace("/", "__slash__")
+            if re.search(r"(?i)\.(ogg|oga)$", fn):
+                ogg = ("https://upload.wikimedia.org/wikipedia/"
+                       "commons/{}/{}/{}"
+                       .format(digest[:1], digest[:2], qfn))
+            else:
+                ogg = ("https://upload.wikimedia.org/wikipedia/"
+                       "commons/transcoded/"
+                       "{}/{}/{}/{}.ogg"
+                       .format(digest[:1], digest[:2], qfn, qfn))
+            if re.search(r"(?i)\.(mp3)$", fn):
+                mp3 = ("https://upload.wikimedia.org/wikipedia/"
+                       "commons/{}/{}/{}"
+                       .format(digest[:1], digest[:2], qfn))
+            else:
+                mp3 = ("https://upload.wikimedia.org/wikipedia/"
+                       "commons/transcoded/"
+                       "{}/{}/{}/{}.mp3"
+                       .format(digest[:1], digest[:2], qfn, qfn))
+            audio["ogg_url"] = ogg
+            audio["mp3_url"] = mp3
+            if active_pos: audio["pos"] = active_pos
+        if audio not in data.get("sounds", ()):
+            data_append(data, "sounds", audio)
+    # if audios:
+    #     have_pronunciations = True
+    audios =[]
+    for enpr in enprs:
+        if re.match(r"/[^/]+/$", enpr):
+            enpr = enpr[1: -1]
+        pron = {"enpr": enpr}
+        parse_pronunciation_tags(wxr, tagstext, pron)
+        if active_pos:
+            pron["pos"] = active_pos
+        if pron not in data.get("sounds", ()):
+            data_append(data, "sounds", pron)
+        # have_pronunciations = True
+    enprs = []
 
     ## I have commented out the otherwise unused have_pronunciation
     ## toggles; uncomment them to use this debug print