Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewriting add_ph_num.py for more usage scenarios. #13

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 57 additions & 68 deletions variance-temp-solution/add_ph_num.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,69 @@
import click
import csv
import pathlib

import click
def find_ph_num(i, phonemes_split, dict):
    """Locate the longest dictionary entry that starts at index ``i``.

    Starting from ``phonemes_split[i]``, the window is grown one phoneme at
    a time; whenever the window equals one of the phoneme sequences stored
    as a value in ``dict``, its end index is recorded. The last (i.e.
    longest) match wins.

    Args:
        i: Index in ``phonemes_split`` where the match must start.
        phonemes_split: Sequence of phoneme symbols.
        dict: Mapping whose values are phoneme sequences (lists or tuples
            of hashable symbols). The name shadows the builtin but is kept
            so existing callers continue to work.

    Returns:
        A ``(left, right)`` tuple of inclusive indices. ``left`` is always
        ``i``; ``right`` is the end of the longest match, or ``i`` when no
        entry matches (the phoneme is then treated as a group of one).
    """
    # Build the lookup once per call instead of scanning dict.values() on
    # every loop iteration.
    entries = {tuple(v) for v in dict.values() if isinstance(v, (list, tuple))}
    longest = max(map(len, entries), default=0)

    left = right = i
    # A window longer than the longest entry can never match, so stop there
    # rather than walking to the end of the sequence.
    stop = min(len(phonemes_split), i + longest)
    window = []
    for j in range(i, stop):
        window.append(phonemes_split[j])
        if tuple(window) in entries:
            right = j
    return left, right



@click.command()
@click.option('--csv_path',required = True, help='Path to CSV file')
@click.option('--dictionary',required = True, help='Path to dictionary file')
def add_ph_num(csv_path,dictionary):
ph_seq_index = 1
with open(csv_path, mode='r', newline='', encoding='utf-8') as csvfile:
phonemes_tmp = []
csv_reader = csv.reader(csvfile)

for row in csv_reader:
phonemes_tmp.append(row[ph_seq_index])

dict = {}
with open(dictionary, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
key = line.split('\t')[0]
values = (line.split('\t')[1]).split(' ')
dict.update({key: values})

@click.command(help='Add ph_num attribute into transcriptions.csv')
@click.argument('transcription', metavar='TRANSCRIPTIONS')
@click.option('--dictionary', metavar='DICTIONARY')
@click.option('--vowels', metavar='FILE')
@click.option('--consonants', metavar='FILE')
def add_ph_num(
transcription: str,
dictionary: str = None,
vowels: str = None,
consonants: str = None
):
assert dictionary is not None or (vowels is not None and consonants is not None), \
'Either dictionary file or vowels and consonants file should be specified.'
if dictionary is not None:
dictionary = pathlib.Path(dictionary).resolve()
vowels = {'SP', 'AP'}
consonants = set()
with open(dictionary, 'r', encoding='utf8') as f:
rules = f.readlines()
for r in rules:
syllable, phonemes = r.split('\t')
phonemes = phonemes.split()
assert len(phonemes) <= 2, 'We only support two-phase dictionaries for automatically adding ph_num.'
if len(phonemes) == 1:
vowels.add(phonemes[0])
ph_num = []
for phonemes in phonemes_tmp:
tmp = []
phonemes_split = phonemes.split(' ')
i=0
while i < len(phonemes_split):
if phonemes_split[i] == "AP" or phonemes_split[i] == "SP":
tmp.append("1")
i+=1
else:
consonants.add(phonemes[0])
vowels.add(phonemes[1])
else:
vowels_path = pathlib.Path(vowels).resolve()
consonants_path = pathlib.Path(consonants).resolve()
vowels = {'SP', 'AP'}
consonants = set()
with open(vowels_path, 'r', encoding='utf8') as f:
vowels.update(f.read().split())
with open(consonants_path, 'r', encoding='utf8') as f:
consonants.update(f.read().split())
overlapped = vowels.intersection(consonants)
assert len(vowels.intersection(consonants)) == 0, \
'Vowel set and consonant set overlapped. The following phonemes ' \
'appear both as vowels and as consonants:\n' \
f'{sorted(overlapped)}'
left,right = find_ph_num(i,phonemes_split,dict)
tmp.append(str(right-left+1))
i = right+1

transcription = pathlib.Path(transcription).resolve()
items: list[dict] = []
with open(transcription, 'r', encoding='utf8') as f:
reader = csv.DictReader(f)
for item in reader:
items.append(item)
ph_num.append(tmp)

for item in items:
item: dict
ph_seq = item['ph_seq'].split()
for ph in ph_seq:
assert ph in vowels or ph in consonants, \
f'Invalid phoneme symbol \'{ph}\' in \'{item["name"]}\'.'
ph_num = []
i = 0
while i < len(ph_seq):
j = i + 1
while j < len(ph_seq) and ph_seq[j] in consonants:
j += 1
ph_num.append(str(j - i))
i = j
item['ph_num'] = ' '.join(ph_num)
ph_num_str = []
for i in ph_num:
string = ' '.join(i)
ph_num_str.append(string)

with open(transcription, 'w', encoding='utf8', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num'])
writer.writeheader()
writer.writerows(items)
ph_num_str[0] = "ph_num"

with open(csv_path, 'r', newline='') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
for i, value in enumerate(ph_num_str):
rows[i].append(value)
with open(csv_path, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(rows)

if __name__ == '__main__':
add_ph_num()