"""Add a ``ph_num`` column to a transcriptions CSV using a syllable dictionary.

The dictionary is a text file with one ``<syllable>\\t<phoneme> <phoneme> ...``
entry per line.  Each ``ph_seq`` cell of the CSV is split greedily into the
longest phoneme runs that appear in the dictionary, and the length of every
run is written to the ``ph_num`` column.
"""
import csv

import click

# Markers that always form a one-phoneme unit of their own and are never
# looked up in the dictionary.
_STANDALONE_PHONEMES = frozenset({"AP", "SP"})
_PH_SEQ_COLUMN = "ph_seq"
_PH_NUM_COLUMN = "ph_num"
# Column used for the phoneme sequence when the header has no ``ph_seq`` cell.
_FALLBACK_PH_SEQ_INDEX = 1


def _build_syllable_index(dictionary):
    """Return ``(sequences, max_len)`` for the phoneme lists in *dictionary*.

    ``sequences`` is a frozenset of phoneme tuples so membership tests are
    O(1); ``max_len`` bounds how far ahead a match can possibly extend.
    """
    sequences = frozenset(tuple(phonemes) for phonemes in dictionary.values())
    max_len = max((len(seq) for seq in sequences), default=0)
    return sequences, max_len


def _longest_match_end(i, phonemes_split, sequences, max_len):
    """Return the last index of the longest dictionary match starting at *i*.

    Returns ``None`` when no run starting at *i* is a dictionary entry.
    """
    furthest = min(len(phonemes_split), i + max_len)
    # Try the longest candidate first so the first hit is the greedy answer.
    for end in range(furthest, i, -1):
        if tuple(phonemes_split[i:end]) in sequences:
            return end - 1
    return None


def find_ph_num(i, phonemes_split, dict):
    """Locate the longest dictionary syllable that starts at index *i*.

    Args:
        i: Start index into *phonemes_split*.
        phonemes_split: Sequence of phoneme symbols.
        dict: Mapping of syllable name to its list of phonemes.  (The name
            shadows the builtin but is kept so keyword callers still work.)

    Returns:
        ``(left, right)`` inclusive indices of the match.  When nothing
        matches, ``right == left`` so the phoneme counts as a unit of one.
    """
    sequences, max_len = _build_syllable_index(dict)
    right = _longest_match_end(i, phonemes_split, sequences, max_len)
    return i, (i if right is None else right)


def _load_dictionary(path):
    """Parse the tab-separated syllable dictionary at *path* into a mapping."""
    syllables = {}
    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (commonly a trailing newline).
                continue
            fields = line.split("\t")
            if len(fields) < 2:
                raise click.ClickException(
                    f"{path}:{line_no}: expected '<syllable><TAB><phonemes>', "
                    f"got {line!r}"
                )
            syllables[fields[0]] = fields[1].split()
    return syllables


def _count_phonemes(phonemes, sequences, max_len):
    """Split *phonemes* into syllable-sized runs.

    Returns ``(counts, unknown)``: the run lengths as strings, and the
    phonemes that matched no dictionary entry (each counted as a run of 1).
    """
    counts = []
    unknown = []
    i = 0
    while i < len(phonemes):
        if phonemes[i] in _STANDALONE_PHONEMES:
            right = i
        else:
            right = _longest_match_end(i, phonemes, sequences, max_len)
            if right is None:
                unknown.append(phonemes[i])
                right = i
        counts.append(str(right - i + 1))
        i = right + 1
    return counts, unknown


@click.command()
@click.option('--csv_path', required=True, help='Path to CSV file')
@click.option('--dictionary', required=True, help='Path to dictionary file')
def add_ph_num(csv_path, dictionary):
    """Add (or refresh) the ph_num column of a transcriptions CSV in place.

    The first CSV row is treated as the header.  The phoneme sequence is read
    from the column named ph_seq, falling back to the second column when the
    header has no such cell.
    """
    sequences, max_len = _build_syllable_index(_load_dictionary(dictionary))

    # Read and write with the same explicit encoding; relying on the platform
    # default for the rewrite can corrupt non-ASCII item names.
    with open(csv_path, 'r', newline='', encoding='utf-8') as csvfile:
        rows = list(csv.reader(csvfile))
    if not rows:
        raise click.ClickException(
            f"'{csv_path}' is empty; expected a header row."
        )

    header = rows[0]
    if _PH_SEQ_COLUMN in header:
        ph_seq_index = header.index(_PH_SEQ_COLUMN)
    elif len(header) > _FALLBACK_PH_SEQ_INDEX:
        ph_seq_index = _FALLBACK_PH_SEQ_INDEX
    else:
        raise click.ClickException(
            f"'{csv_path}' has no '{_PH_SEQ_COLUMN}' column."
        )

    if _PH_NUM_COLUMN in header:
        # Re-running the tool overwrites the column instead of duplicating it.
        ph_num_index = header.index(_PH_NUM_COLUMN)
    else:
        ph_num_index = len(header)
        header.append(_PH_NUM_COLUMN)

    for row_no, row in enumerate(rows[1:], start=2):
        if not row:
            # csv.reader yields [] for blank lines; leave them untouched.
            continue
        if len(row) <= ph_seq_index:
            raise click.ClickException(
                f"'{csv_path}' row {row_no}: missing '{_PH_SEQ_COLUMN}' value."
            )
        counts, unknown = _count_phonemes(
            row[ph_seq_index].split(), sequences, max_len
        )
        if unknown:
            missing = ' '.join(unknown)
            click.echo(
                f"Warning: row {row_no}: phoneme(s) not found in dictionary, "
                f"each counted as 1: {missing}",
                err=True,
            )
        value = ' '.join(counts)
        if ph_num_index < len(row):
            row[ph_num_index] = value
        else:
            # Pad ragged rows so the value lands under the ph_num header.
            row.extend([''] * (ph_num_index - len(row)))
            row.append(value)

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(rows)


if __name__ == '__main__':
    add_ph_num()