From d6c00b9bf34406426559e82356483988fd5a816a Mon Sep 17 00:00:00 2001
From: komisteng <komisteng@gmail.com>
Date: Fri, 23 Feb 2024 12:10:32 +0800
Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E5=86=99=E8=84=9A=E6=9C=AC=EF=BC=8C?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=A0=B9=E6=8D=AE=E5=AD=97=E5=85=B8=E6=8E=A8?=
 =?UTF-8?q?=E6=96=AD=E9=9F=B3=E8=8A=82=E6=95=B0=E9=87=8F=EF=BC=8C=E5=9C=A8?=
 =?UTF-8?q?=E4=B8=AD=E6=96=87=E4=B8=89=E6=AE=B5=E5=BC=8F=E8=AF=8D=E5=85=B8?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95=E6=88=90=E5=8A=9F=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 variance-temp-solution/add_ph_num.py | 125 ++++++++++++---------------
 1 file changed, 57 insertions(+), 68 deletions(-)

diff --git a/variance-temp-solution/add_ph_num.py b/variance-temp-solution/add_ph_num.py
index 72d75e6..072ab3e 100644
--- a/variance-temp-solution/add_ph_num.py
+++ b/variance-temp-solution/add_ph_num.py
@@ -1,80 +1,69 @@
+import click
 import csv
-import pathlib
 
-import click
+def find_ph_num(i, phonemes_split, dict):
+    """Return ``(i, right)``: the inclusive span of the longest value of ``dict``
+    (a phoneme list) at index ``i`` of ``phonemes_split``; ``(i, i)`` if none match."""
+    # Hash the values once; spans longer than the longest value cannot match.
+    entries = {tuple(value) for value in dict.values()}
+    longest = min(max(map(len, entries), default=0), len(phonemes_split) - i)
+    for length in range(longest, 0, -1):  # try the longest candidate first
+        if tuple(phonemes_split[i:i + length]) in entries:
+            return i, i + length - 1
+    return i, i
+
+
+@click.command()
+@click.option('--csv_path',required = True, help='Path to CSV file')
+@click.option('--dictionary',required = True, help='Path to dictionary file')
+def add_ph_num(csv_path,dictionary):
+    """Append a ``ph_num`` column to the CSV at ``csv_path``, rewriting it in place.
+
+    Column 1 of every row after the first is split on spaces into phonemes.
+    "AP" and "SP" each count as 1; any other position is grouped by
+    ``find_ph_num`` against ``dictionary`` (lines of ``key<TAB>ph1 ph2 ...``).
+    The first row receives the literal title "ph_num".
+    """
+    ph_seq_index = 1
+    # Read the CSV once, as UTF-8, so the write-back below round-trips the text.
+    with open(csv_path, mode='r', newline='', encoding='utf-8') as csvfile:
+        rows = list(csv.reader(csvfile))
 
+    ph_dict = {}
+    with open(dictionary, 'r', encoding='utf-8') as f:
+        for line in f:
+            fields = line.strip().split('\t')
+            # Blank lines and lines without a tab carry no phoneme list.
+            if len(fields) >= 2:
+                ph_dict[fields[0]] = fields[1].split(' ')
 
-@click.command(help='Add ph_num attribute into transcriptions.csv')
-@click.argument('transcription', metavar='TRANSCRIPTIONS')
-@click.option('--dictionary', metavar='DICTIONARY')
-@click.option('--vowels', metavar='FILE')
-@click.option('--consonants', metavar='FILE')
-def add_ph_num(
-        transcription: str,
-        dictionary: str = None,
-        vowels: str = None,
-        consonants: str = None
-):
-    assert dictionary is not None or (vowels is not None and consonants is not None), \
-        'Either dictionary file or vowels and consonants file should be specified.'
-    if dictionary is not None:
-        dictionary = pathlib.Path(dictionary).resolve()
-        vowels = {'SP', 'AP'}
-        consonants = set()
-        with open(dictionary, 'r', encoding='utf8') as f:
-            rules = f.readlines()
-        for r in rules:
-            syllable, phonemes = r.split('\t')
-            phonemes = phonemes.split()
-            assert len(phonemes) <= 2, 'We only support two-phase dictionaries for automatically adding ph_num.'
-            if len(phonemes) == 1:
-                vowels.add(phonemes[0])
+    for row in rows[1:]:
+        # Rows too short to hold the phoneme column (e.g. blank lines) stay as-is.
+        if len(row) <= ph_seq_index:
+            continue
+        tmp = []
+        phonemes_split = row[ph_seq_index].split(' ')
+        i = 0
+        while i < len(phonemes_split):
+            if phonemes_split[i] == "AP" or phonemes_split[i] == "SP":
+                tmp.append("1")
+                i += 1
             else:
-                consonants.add(phonemes[0])
-                vowels.add(phonemes[1])
-    else:
-        vowels_path = pathlib.Path(vowels).resolve()
-        consonants_path = pathlib.Path(consonants).resolve()
-        vowels = {'SP', 'AP'}
-        consonants = set()
-        with open(vowels_path, 'r', encoding='utf8') as f:
-            vowels.update(f.read().split())
-        with open(consonants_path, 'r', encoding='utf8') as f:
-            consonants.update(f.read().split())
-        overlapped = vowels.intersection(consonants)
-        assert len(vowels.intersection(consonants)) == 0, \
-            'Vowel set and consonant set overlapped. The following phonemes ' \
-            'appear both as vowels and as consonants:\n' \
-            f'{sorted(overlapped)}'
+                left, right = find_ph_num(i, phonemes_split, ph_dict)
+                tmp.append(str(right - left + 1))
+                i = right + 1
 
-    transcription = pathlib.Path(transcription).resolve()
-    items: list[dict] = []
-    with open(transcription, 'r', encoding='utf8') as f:
-        reader = csv.DictReader(f)
-        for item in reader:
-            items.append(item)
+        row.append(' '.join(tmp))
 
-    for item in items:
-        item: dict
-        ph_seq = item['ph_seq'].split()
-        for ph in ph_seq:
-            assert ph in vowels or ph in consonants, \
-                f'Invalid phoneme symbol \'{ph}\' in \'{item["name"]}\'.'
-        ph_num = []
-        i = 0
-        while i < len(ph_seq):
-            j = i + 1
-            while j < len(ph_seq) and ph_seq[j] in consonants:
-                j += 1
-            ph_num.append(str(j - i))
-            i = j
-        item['ph_num'] = ' '.join(ph_num)
+    if not rows:
+        return  # empty CSV: there is no first row to title
 
-    with open(transcription, 'w', encoding='utf8', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num'])
-        writer.writeheader()
-        writer.writerows(items)
+    # The first row gets the column title instead of a count.
+    rows[0].append("ph_num")
 
+    # Same encoding as the read, instead of the platform default.
+    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
+        csv.writer(csvfile).writerows(rows)
 
 if __name__ == '__main__':
     add_ph_num()