Skip to content

Commit

Permalink
Merge pull request #413 from tatuylonen/audios-index-out-of-bounds
Browse files (browse the repository at this point in the history)
Move code out of for loop that didn't belong there
  • Loading branch information
kristian-clausal authored Dec 5, 2023
2 parents c42a520 + 059895e commit fee414a
Showing 1 changed file with 67 additions and 67 deletions.
134 changes: 67 additions & 67 deletions src/wiktextract/pronunciations.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def split_cleaned_node_on_newlines(contents):
active_pos = None

for text, ipa_text in split_cleaned_node_on_newlines(contents):
# print(text, ipa_text)
# print(f"{text=}, {ipa_text=}")
prefix = None
if not text:
continue
Expand Down Expand Up @@ -582,72 +582,72 @@ def split_cleaned_node_on_newlines(contents):
# XXX what about {{hyphenation|...}}, {{hyph|...}}
# and those used to be stored under "hyphenation"

# Add data that was collected in template_fn
if audios:
for audio in audios:
if "audio" in audio:
# Compute audio file URLs
fn = audio["audio"]
# Strip certain characters, e.g., left-to-right mark
fn = re.sub(r"[\u200f\u200e]", "", fn)
fn = fn.strip()
fn = urllib.parse.unquote(fn)
# First character is usually uppercased
if re.match(r"^[a-z][a-z]+", fn):
fn = fn[0].upper() + fn[1:]
if fn in wxr.config.redirects:
fn = wxr.config.redirects[fn]
# File extension is lowercased
# XXX some words seem to need this, some don't seem to
# have this??? what is the exact rule?
# fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
# Spaces are converted to underscores
fn = re.sub(r"\s+", "_", fn)
# Compute hash digest part
h = hashlib.md5()
hname = fn.encode("utf-8")
h.update(hname)
digest = h.hexdigest()
# Quote filename for URL
qfn = urllib.parse.quote(fn)
# For safety when writing files
qfn = qfn.replace("/", "__slash__")
if re.search(r"(?i)\.(ogg|oga)$", fn):
ogg = ("https://upload.wikimedia.org/wikipedia/"
"commons/{}/{}/{}"
.format(digest[:1], digest[:2], qfn))
else:
ogg = ("https://upload.wikimedia.org/wikipedia/"
"commons/transcoded/"
"{}/{}/{}/{}.ogg"
.format(digest[:1], digest[:2], qfn, qfn))
if re.search(r"(?i)\.(mp3)$", fn):
mp3 = ("https://upload.wikimedia.org/wikipedia/"
"commons/{}/{}/{}"
.format(digest[:1], digest[:2], qfn))
else:
mp3 = ("https://upload.wikimedia.org/wikipedia/"
"commons/transcoded/"
"{}/{}/{}/{}.mp3"
.format(digest[:1], digest[:2], qfn, qfn))
audio["ogg_url"] = ogg
audio["mp3_url"] = mp3
if active_pos: audio["pos"] = active_pos
if audio not in data.get("sounds", ()):
data_append(data, "sounds", audio)
# have_pronunciations = True
audios =[]
for enpr in enprs:
if re.match(r"/[^/]+/$", enpr):
enpr = enpr[1: -1]
pron = {"enpr": enpr}
parse_pronunciation_tags(wxr, tagstext, pron)
if active_pos:
pron["pos"] = active_pos
if pron not in data.get("sounds", ()):
data_append(data, "sounds", pron)
# have_pronunciations = True
enprs = []
# Add data that was collected in template_fn
for audio in audios:
if "audio" in audio:
# Compute audio file URLs
fn = audio["audio"]
# Strip certain characters, e.g., left-to-right mark
fn = re.sub(r"[\u200f\u200e]", "", fn)
fn = fn.strip()
fn = urllib.parse.unquote(fn)
# First character is usually uppercased
if re.match(r"^[a-z][a-z]+", fn):
fn = fn[0].upper() + fn[1:]
if fn in wxr.config.redirects:
fn = wxr.config.redirects[fn]
# File extension is lowercased
# XXX some words seem to need this, some don't seem to
# have this??? what is the exact rule?
# fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
# Spaces are converted to underscores
fn = re.sub(r"\s+", "_", fn)
# Compute hash digest part
h = hashlib.md5()
hname = fn.encode("utf-8")
h.update(hname)
digest = h.hexdigest()
# Quote filename for URL
qfn = urllib.parse.quote(fn)
# For safety when writing files
qfn = qfn.replace("/", "__slash__")
if re.search(r"(?i)\.(ogg|oga)$", fn):
ogg = ("https://upload.wikimedia.org/wikipedia/"
"commons/{}/{}/{}"
.format(digest[:1], digest[:2], qfn))
else:
ogg = ("https://upload.wikimedia.org/wikipedia/"
"commons/transcoded/"
"{}/{}/{}/{}.ogg"
.format(digest[:1], digest[:2], qfn, qfn))
if re.search(r"(?i)\.(mp3)$", fn):
mp3 = ("https://upload.wikimedia.org/wikipedia/"
"commons/{}/{}/{}"
.format(digest[:1], digest[:2], qfn))
else:
mp3 = ("https://upload.wikimedia.org/wikipedia/"
"commons/transcoded/"
"{}/{}/{}/{}.mp3"
.format(digest[:1], digest[:2], qfn, qfn))
audio["ogg_url"] = ogg
audio["mp3_url"] = mp3
if active_pos: audio["pos"] = active_pos
if audio not in data.get("sounds", ()):
data_append(data, "sounds", audio)
# if audios:
# have_pronunciations = True
audios =[]
for enpr in enprs:
if re.match(r"/[^/]+/$", enpr):
enpr = enpr[1: -1]
pron = {"enpr": enpr}
parse_pronunciation_tags(wxr, tagstext, pron)
if active_pos:
pron["pos"] = active_pos
if pron not in data.get("sounds", ()):
data_append(data, "sounds", pron)
# have_pronunciations = True
enprs = []

## I have commented out the otherwise unused have_pronunciations
## toggles; uncomment them to use this debug print
Expand Down

0 comments on commit fee414a

Please sign in to comment.