Fixes file formatting, mainly replacing single quotes (') with double quotes (")

catusf · Dec 18, 2024 · 25c9b86
1 parent fe5f9b8
commit 25c9b86
Show file tree

Hide file tree

Showing 19 changed files with 1,010 additions and 1,201 deletions.
diff --git a/.ruff.toml b/.ruff.toml
@@ -1,6 +1,6 @@
 target-version = "py313"
 line-length = 160
-indent-width = 2
+indent-width = 4
 extend-exclude = [".vscode", ".idea", "__pycache__", ".python-version", ".ruff.toml", "ruff.toml", "setup.cfg", "pyproject.toml"]
 
 [format]

diff --git a/bin/add_pinyin.py b/bin/add_pinyin.py
@@ -5,6 +5,7 @@
 """
 
 import argparse
+
 import pinyin
 
 
@@ -28,9 +29,7 @@ def main() -> None:
     if not outfile:
         print(f"Couldnot open output file {outputfile}")
 
-    with open(
-        inputfile, "r", encoding="utf-8", errors="replace"
-    ) as infile:  # , errors='replace'
+    with open(inputfile, "r", encoding="utf-8", errors="replace") as infile:  # , errors='replace'
         count = 0
         count_issues = 0
 

diff --git a/bin/collect_compound_chinese_words.py b/bin/collect_compound_chinese_words.py
@@ -3,29 +3,28 @@
 import re
 import pinyin
 
+
 def collect_compound_words():
+    DEFINITION_FILE = "./dict/Tu-dien-ThienChuu-TranVanChanh.tab"
+    PLECO_FLASH_FILE = "./dict/Tu-dien-ThienChuu-TranVanChanh.pleco"
+    with open(DEFINITION_FILE, "r", encoding="utf-8") as f:
+        lines = f.readlines()
 
-    DEFINITION_FILE = './dict/Tu-dien-ThienChuu-TranVanChanh.tab'
-    PLECO_FLASH_FILE = './dict/Tu-dien-ThienChuu-TranVanChanh.pleco'
-    with open(DEFINITION_FILE, 'r', encoding='utf-8') as f:
-        lines=f.readlines()
-
     if not lines:
-        exit('Emply text files')
+        exit("Emply text files")
 
-
-    CHINESE_PATTERN = r'([一-龥]+)'
-    CIRCLED_NUMBERS = r'[①-⑳]'
-    SQUARE_BRACKETS = '[\[\]]'
+    CHINESE_PATTERN = r"([一-龥]+)"
+    CIRCLED_NUMBERS = r"[①-⑳]"
+    SQUARE_BRACKETS = "[\[\]]"
 
     new_words = []
 
     for i, line in enumerate(lines):
-        items = line.split('\t')
+        items = line.split("\t")
         new_items = []
 
-        if (len(items) != 2):
-            exit(f'Line {i+1} has wrong number of tabs: {line}')
+        if len(items) != 2:
+            exit(f"Line {i+1} has wrong number of tabs: {line}")
 
         headword = items[0]
         definition = items[1]
@@ -40,8 +39,8 @@ def collect_compound_words():
         definitions = re.split(CIRCLED_NUMBERS, viet_pronound[2].strip())
 
         new_defs = [x.strip() for x in definitions if x.strip()]
-        
-        new_words.append(f'{headword}\t{pinyin.get(headword)}\t{definition}')
+
+        new_words.append(f"{headword}\t{pinyin.get(headword)}\t{definition}")
 
         for item in new_defs:
             matches = re.findall(CHINESE_PATTERN, item)
@@ -50,14 +49,14 @@ def collect_compound_words():
                 continue
 
             for match in matches:
-
                 if match != headword:
-                    new_words.append(f'{match}\t{pinyin.get(match)}\t{item} Mục từ chính {headword} ({viet_pronound[1]}).\n')
-        
+                    new_words.append(f"{match}\t{pinyin.get(match)}\t{item} Mục từ chính {headword} ({viet_pronound[1]}).\n")
+
         pass
-    print(f'Original words: {len(lines)}\nNew words: {len(new_words)}')
+    print(f"Original words: {len(lines)}\nNew words: {len(new_words)}")
 
-    with open(PLECO_FLASH_FILE, 'w', encoding='utf-8') as f:
+    with open(PLECO_FLASH_FILE, "w", encoding="utf-8") as f:
         f.writelines(new_words)
 
+
 collect_compound_words()
diff --git a/bin/compare_headwords.py b/bin/compare_headwords.py
@@ -1,30 +1,29 @@
-
-file_names = ['ext-dict/star_anhvietanh.tab', 'ext-dict/SPDict-Anh-Viet-Anh.tab']
+file_names = ["ext-dict/star_anhvietanh.tab", "ext-dict/SPDict-Anh-Viet-Anh.tab"]
 word_sets = []
 
-print(f'Set A: {file_names[0]} - Set B: {file_names[1]}')
+print(f"Set A: {file_names[0]} - Set B: {file_names[1]}")
 for filename in file_names:
-    with open(filename, 'r', encoding='utf-8') as f:
-        lines = f.read().split('\n')
+    with open(filename, "r", encoding="utf-8") as f:
+        lines = f.read().split("\n")
         f.close()
 
         words = []
 
         for l in lines:
-            items = l.split('\t')
-#            if items[0].isalpha():
+            items = l.split("\t")
+            #            if items[0].isalpha():
             words.append(items[0])
-                
+
         word_sets.append(set(words))
 
-print(f'No. words: set A: {len(word_sets[0])} set B: {len(word_sets[1])}')
+print(f"No. words: set A: {len(word_sets[0])} set B: {len(word_sets[1])}")
 
-print(f'No. intersected words: {len(word_sets[0].intersection(word_sets[1]))}')
+print(f"No. intersected words: {len(word_sets[0].intersection(word_sets[1]))}")
 
-print(f'No. different words in A not in B: {len(word_sets[0].difference(word_sets[1]))}') # \n{word_sets[0].difference(word_sets[1])}
+print(f"No. different words in A not in B: {len(word_sets[0].difference(word_sets[1]))}")  # \n{word_sets[0].difference(word_sets[1])}
 
-print(f'No. different words in B not in A: {len(word_sets[1].difference(word_sets[0]))}')
+print(f"No. different words in B not in A: {len(word_sets[1].difference(word_sets[0]))}")
 
-print(f'No. symmetricly different words in B not in A: {len(word_sets[1].symmetric_difference(word_sets[0]))}')
+print(f"No. symmetricly different words in B not in A: {len(word_sets[1].symmetric_difference(word_sets[0]))}")
 
-#print(f'No. different words in B not in A: {word_sets[1].difference(word_sets[0])}')
+# print(f'No. different words in B not in A: {word_sets[1].difference(word_sets[0])}')