Reorder 廣韻字頭 and correct some 字 & 釋義

- All entries are reordered according to 澤存堂本 - This solves a long-standing issue with both 韻典 and 廣韻字音表's data: Both data tables combine 字頭 from 廣韻全字表 and 釋義 from 宋本廣韻データ. However, 廣韻全字表 is based on 巾箱本 while 宋本廣韻データ is based on 澤存堂本, which creates mismatches. - Entries missing in poem's data are added back - This includes characters only representable with IDS, and several additions from 廣韻校本 - More errors in 字頭 & 釋義 are corrected - These were discovered when the new 字序表 was being made, and are still WIP.
nk2028 · Jan 23, 2025 · 25e212c · 25e212c
1 parent bf6c1c5
commit 25e212c
Show file tree

Hide file tree

Showing 4 changed files with 25,401 additions and 25,360 deletions.
diff --git a/README.md b/README.md
@@ -3,12 +3,30 @@
 A database of the Qieyun phonological system.
 
 - 韻書
-    - 王一：`王一.csv` (not completed)
-    - 王三：`王三.csv` (小韻內部待校)
-    - 廣韻（澤存堂本）：`廣韻.csv`
+  - 王一：`王一.csv` (not completed)
+  - 王三：`王三.csv` (小韻內部待校)
+  - 廣韻 (澤存堂本, with corrections from 廣韻校本)：`廣韻.csv`
 - 韻圖
-    - 韻鏡（嘉吉本）：`韻鏡（嘉吉本）.csv` (not completed)
-    - 韻鏡（古逸叢書本）：`韻鏡（古逸叢書本）.csv`
+  - 韻鏡（嘉吉本）：`韻鏡（嘉吉本）.csv` (not completed)
+  - 韻鏡（古逸叢書本）：`韻鏡（古逸叢書本）.csv`
 - 反切音韻地位
-    - 王三：`王三反切音韻地位表.csv` (rev. Ayaka & unt)
-    - 廣韻：`廣韻反切音韻地位表.csv` (beta)
+  - 王三：`王三反切音韻地位表.csv` (rev. Ayaka & unt)
+  - 廣韻：`廣韻反切音韻地位表.csv` (beta)
+
+## About fields in 韻書/廣韻.csv
+
+- 小韻號: May contain -a/-b/-c if a 小韻 has multiple 音韻地位s
+- 小韻字號: May contain -a1, -a2 etc for entries not present in 澤存堂本 but added back according to 廣韻校本
+- 反切: May contain annotations:
+  - 脫字: `[徒]候` (小韻 #3067 豆)
+  - 訛字: `士<七>演` (小韻 #1625 淺)
+  - 改用其他來源的音韻地位: `姊宜⦉規⦊` (小韻 #133 厜)
+  - 替換成近似等價字，反切結果改變: `符咸(䒦)` (小韻 #1155 凡)
+  - 替換成音近字，反切結果改變: `式之(脂)` (小韻 #157 尸)
+  - 替換成等價字，反切結果不變: `甫⦅府⦆妄` (小韻 #2918 放)
+  - 替換成同音字，反切結果不變: `呼東⦅紅⦆` (小韻 #32 烘)
+  - 複合使用: `以沼⦅小⦆<水>` (小韻 #1692a 鷕)
+- 字頭當刪: if nonempty, indicates this entry in 澤存堂本 is erroneous and should be removed according to 廣韻校本
+- 釋義參照: 
+  - `上` if 釋義 refers to the entry above ("同上", "俗", "古文" etc.)
+  - `下` if it shares 釋義 with the entry below ("並上同", "並古文" etc.)
diff --git a/build.py b/build.py
@@ -104,7 +104,7 @@ class 廣韻Row:
     音韻地位: str
     反切: str
     字頭: str
-    # 字頭當刪: str  # TODO
+    字頭當刪: str
     釋義: str
     釋義參照: str
 
@@ -166,6 +166,7 @@ def main():
                 釋義參照 = ''
 
         # 修正
+        字頭當刪 = ''
         if (patch := patches.get(字序_key)) is not None:
             assert patch.原字頭 == 字頭, (
                 f'patching 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 字 is "{字頭}"'
@@ -174,8 +175,12 @@ def main():
             if patch.校正字頭 and patch.校正字頭 != '～':
                 corrected = patch.校正字頭
                 if corrected.startswith('['):
-                    # TODO 暫忽略當刪字
-                    corrected = corrected[-2] if corrected[-2] != '-' else corrected[1]
+                    if corrected[-2] == '-':
+                        字頭當刪 = patch.當刪說明 or '當刪'
+                        corrected = corrected[1]
+                    else:
+                        assert not patch.當刪說明
+                        corrected = corrected[-2]
                 字頭 = corrected
 
             if patch.校正釋義 or patch.原釋義:
@@ -189,7 +194,15 @@ def main():
                     f'patching 釋義參照 on 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 釋義參照 is "{釋義參照}"'
                 )
                 釋義參照 = patch.校正釋義參照
-        # TODO 當刪字
+        elif 字序_data[字序_key].sbgy_字.endswith('/-]'):
+            字頭當刪 = '當刪'
+
+        字_check = 字序_data[字序_key].字
+        if 字_check.startswith('['):
+            字_check = 字_check[-2] if 字_check[-2] != '-' else 字_check[1]
+        assert 字頭 == 字_check, (
+            f'字頭 mismatch between 字序表 and patched data: "{字_check}" != "{字頭}"'
+        )
 
         # 小韻號
         if 原書小韻號 in has_細分:
@@ -223,7 +236,7 @@ def main():
             釋義 = 釋義.replace(poem_反切 + '切', 反切原貌 + '切')
 
         廣韻_data[字序_key] = 廣韻Row(
-            小韻號, 小韻字號, 韻目原貌, 音韻地位, 反切, 字頭, 釋義, 釋義參照
+            小韻號, 小韻字號, 韻目原貌, 音韻地位, 反切, 字頭, 字頭當刪, 釋義, 釋義參照
         )
 
     for 小韻號, cov in 小韻細分_coverage.items():

diff --git a/check.py b/check.py
@@ -28,29 +28,39 @@ def contains_ascii(s: str):
     with open('韻書/廣韻.csv') as f:
         assert (
             next(f).rstrip('\n')
-            == '小韻號,小韻內字序,韻目原貌,音韻地位,反切,字頭,釋義,釋義補充'
+            == '小韻號,小韻字號,韻目原貌,音韻地位,反切,字頭,字頭當刪,釋義,釋義參照'
         )
         for line in f:
             (
                 小韻號,
-                小韻內字序,
+                小韻字號,
                 韻目原貌,
                 音韻地位描述,
                 反切,
                 字頭,
+                字頭當刪,
                 釋義,
-                釋義補充,
+                釋義參照,
             ) = line.rstrip('\n').split(',')
 
-            assert (
-                PATTERN_描述.fullmatch(音韻地位描述) is not None
-            ), f'invalid 音韻地位: {音韻地位描述}'
+            assert re.fullmatch(r'\d+[abc]?', 小韻號), f'invalid 小韻號: {小韻號}'
+            assert re.fullmatch(r'\d+(a\d+)?', 小韻字號), (
+                f'invalid 小韻字號: {小韻字號}'
+            )
+            assert len(韻目原貌) == 1, f'invalid 韻目原𩩕: {韻目原貌}'
+            assert len(字頭) == 1 or re.match(r'[\u2ff0-\u2fff\u303e\u31ef]', 字頭), (
+                f'invalid 字頭: {字頭}'
+            )
+
+            assert PATTERN_描述.fullmatch(音韻地位描述) is not None, (
+                f'invalid 音韻地位: {音韻地位描述}'
+            )
 
             if 反切:
                 assert PATTERN_反切.fullmatch(反切) is not None, f'invalid 反切: {反切}'
-            assert len(字頭) == 1, 'The length of 字頭 should be 1'
 
-            assert 釋義 + 釋義補充, '釋義 and 釋義補充 should not be both empty'
-            assert not contains_ascii(
-                釋義
-            ), '釋義 should not contain any ASCII characters'
+            assert 釋義 + 釋義參照, '釋義 and 釋義參照 should not be both empty'
+            assert not contains_ascii(釋義), (
+                '釋義 should not contain any ASCII characters'
+            )
+            assert 釋義參照 in ('', '上', '下'), '釋義參照 should be "上" or "下"'