openvpi · komisteng · Feb 23, 2024
diff --git a/variance-temp-solution/add_ph_num.py b/variance-temp-solution/add_ph_num.py
@@ -1,80 +1,69 @@
+import click
 import csv
-import pathlib
 
-import click
+def find_ph_num(i, phonemes_split, dict):
+    """Return inclusive (left, right) of the longest dictionary entry starting at i."""
+    entries = {tuple(v) for v in dict.values()}  # hashable, so lookups are O(1)
+    longest = max(map(len, entries), default=1)  # nothing can match beyond this
+    right = i  # an unmatched phoneme falls back to a one-phoneme span
+    for j in range(i, min(i + longest, len(phonemes_split))):
+        if tuple(phonemes_split[i:j + 1]) in entries:
+            right = j
+    return i, right
+
+
+
+@click.command()
+@click.option('--csv_path',required = True, help='Path to CSV file')
+@click.option('--dictionary',required = True, help='Path to dictionary file')
+def add_ph_num(csv_path,dictionary):
+    """Append a ph_num column to the transcription CSV at csv_path, in place.
+
+    ph_seq is read from column 1 (AP and SP count as one); all I/O is UTF-8.
+    """
+    ph_seq_index = 1  # position of the ph_seq column in every row
+    with open(csv_path, mode='r', newline='', encoding='utf-8') as csvfile:
+        phonemes_tmp = []
+        csv_reader = csv.reader(csvfile)
 
+        for row in csv_reader:
+            phonemes_tmp.append(row[ph_seq_index])
+
+    ph_dict = {}  # syllable -> list of its phonemes
+    with open(dictionary, 'r', encoding='utf-8') as f:
+        for line in f:
+            fields = line.strip().split('\t')
+            if len(fields) < 2:
+                continue  # blank or tab-less line: no entry to record
+            ph_dict[fields[0]] = fields[1].split(' ')
 
-@click.command(help='Add ph_num attribute into transcriptions.csv')
-@click.argument('transcription', metavar='TRANSCRIPTIONS')
-@click.option('--dictionary', metavar='DICTIONARY')
-@click.option('--vowels', metavar='FILE')
-@click.option('--consonants', metavar='FILE')
-def add_ph_num(
-        transcription: str,
-        dictionary: str = None,
-        vowels: str = None,
-        consonants: str = None
-):
-    assert dictionary is not None or (vowels is not None and consonants is not None), \
-        'Either dictionary file or vowels and consonants file should be specified.'
-    if dictionary is not None:
-        dictionary = pathlib.Path(dictionary).resolve()
-        vowels = {'SP', 'AP'}
-        consonants = set()
-        with open(dictionary, 'r', encoding='utf8') as f:
-            rules = f.readlines()
-        for r in rules:
-            syllable, phonemes = r.split('\t')
-            phonemes = phonemes.split()
-            assert len(phonemes) <= 2, 'We only support two-phase dictionaries for automatically adding ph_num.'
-            if len(phonemes) == 1:
-                vowels.add(phonemes[0])
+    ph_num = []
+    for phonemes in phonemes_tmp:
+        tmp = []
+        phonemes_split = phonemes.split(' ')
+        i = 0
+        while i < len(phonemes_split):
+            if phonemes_split[i] in ("AP", "SP"):
+                tmp.append("1")
+                i += 1
             else:
-                consonants.add(phonemes[0])
-                vowels.add(phonemes[1])
-    else:
-        vowels_path = pathlib.Path(vowels).resolve()
-        consonants_path = pathlib.Path(consonants).resolve()
-        vowels = {'SP', 'AP'}
-        consonants = set()
-        with open(vowels_path, 'r', encoding='utf8') as f:
-            vowels.update(f.read().split())
-        with open(consonants_path, 'r', encoding='utf8') as f:
-            consonants.update(f.read().split())
-        overlapped = vowels.intersection(consonants)
-        assert len(vowels.intersection(consonants)) == 0, \
-            'Vowel set and consonant set overlapped. The following phonemes ' \
-            'appear both as vowels and as consonants:\n' \
-            f'{sorted(overlapped)}'
+                left, right = find_ph_num(i, phonemes_split, ph_dict)
+                tmp.append(str(right - left + 1))
+                i = right + 1
 
-    transcription = pathlib.Path(transcription).resolve()
-    items: list[dict] = []
-    with open(transcription, 'r', encoding='utf8') as f:
-        reader = csv.DictReader(f)
-        for item in reader:
-            items.append(item)
+        ph_num.append(tmp)
 
-    for item in items:
-        item: dict
-        ph_seq = item['ph_seq'].split()
-        for ph in ph_seq:
-            assert ph in vowels or ph in consonants, \
-                f'Invalid phoneme symbol \'{ph}\' in \'{item["name"]}\'.'
-        ph_num = []
-        i = 0
-        while i < len(ph_seq):
-            j = i + 1
-            while j < len(ph_seq) and ph_seq[j] in consonants:
-                j += 1
-            ph_num.append(str(j - i))
-            i = j
-        item['ph_num'] = ' '.join(ph_num)
+    ph_num_str = [' '.join(counts) for counts in ph_num]
 
-    with open(transcription, 'w', encoding='utf8', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num'])
-        writer.writeheader()
-        writer.writerows(items)
+    ph_num_str[0] = "ph_num"  # row 0 is the CSV header, so it gets the column name
 
+    with open(csv_path, 'r', newline='', encoding='utf-8') as csvfile:
+        rows = list(csv.reader(csvfile))
+    for i, value in enumerate(ph_num_str):
+        rows[i].append(value)
+    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerows(rows)
 
 if __name__ == '__main__':
     add_ph_num()