Skip to content

Commit

Permalink
Fixes file formatting, mainly replacing single quotes (') with double quotes (")
Browse files (browse the repository at this point in the history)
  • Loading branch information
catusphan committed Dec 18, 2024
1 parent fe5f9b8 commit 25c9b86
Show file tree
Hide file tree
Showing 19 changed files with 1,010 additions and 1,201 deletions.
2 changes: 1 addition & 1 deletion .ruff.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
target-version = "py313"
line-length = 160
indent-width = 2
indent-width = 4
extend-exclude = [".vscode", ".idea", "__pycache__", ".python-version", ".ruff.toml", "ruff.toml", "setup.cfg", "pyproject.toml"]

[format]
Expand Down
5 changes: 2 additions & 3 deletions bin/add_pinyin.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import argparse

import pinyin


Expand All @@ -28,9 +29,7 @@ def main() -> None:
if not outfile:
print(f"Couldnot open output file {outputfile}")

with open(
inputfile, "r", encoding="utf-8", errors="replace"
) as infile: # , errors='replace'
with open(inputfile, "r", encoding="utf-8", errors="replace") as infile: # , errors='replace'
count = 0
count_issues = 0

Expand Down
39 changes: 19 additions & 20 deletions bin/collect_compound_chinese_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,28 @@
import re
import pinyin


def collect_compound_words():
DEFINITION_FILE = "./dict/Tu-dien-ThienChuu-TranVanChanh.tab"
PLECO_FLASH_FILE = "./dict/Tu-dien-ThienChuu-TranVanChanh.pleco"
with open(DEFINITION_FILE, "r", encoding="utf-8") as f:
lines = f.readlines()

DEFINITION_FILE = './dict/Tu-dien-ThienChuu-TranVanChanh.tab'
PLECO_FLASH_FILE = './dict/Tu-dien-ThienChuu-TranVanChanh.pleco'
with open(DEFINITION_FILE, 'r', encoding='utf-8') as f:
lines=f.readlines()

if not lines:
exit('Emply text files')
exit("Emply text files")


CHINESE_PATTERN = r'([一-龥]+)'
CIRCLED_NUMBERS = r'[①-⑳]'
SQUARE_BRACKETS = '[\[\]]'
CHINESE_PATTERN = r"([一-龥]+)"
CIRCLED_NUMBERS = r"[①-⑳]"
SQUARE_BRACKETS = "[\[\]]"

new_words = []

for i, line in enumerate(lines):
items = line.split('\t')
items = line.split("\t")
new_items = []

if (len(items) != 2):
exit(f'Line {i+1} has wrong number of tabs: {line}')
if len(items) != 2:
exit(f"Line {i+1} has wrong number of tabs: {line}")

headword = items[0]
definition = items[1]
Expand All @@ -40,8 +39,8 @@ def collect_compound_words():
definitions = re.split(CIRCLED_NUMBERS, viet_pronound[2].strip())

new_defs = [x.strip() for x in definitions if x.strip()]
new_words.append(f'{headword}\t{pinyin.get(headword)}\t{definition}')

new_words.append(f"{headword}\t{pinyin.get(headword)}\t{definition}")

for item in new_defs:
matches = re.findall(CHINESE_PATTERN, item)
Expand All @@ -50,14 +49,14 @@ def collect_compound_words():
continue

for match in matches:

if match != headword:
new_words.append(f'{match}\t{pinyin.get(match)}\t{item} Mục từ chính {headword} ({viet_pronound[1]}).\n')
new_words.append(f"{match}\t{pinyin.get(match)}\t{item} Mục từ chính {headword} ({viet_pronound[1]}).\n")

pass
print(f'Original words: {len(lines)}\nNew words: {len(new_words)}')
print(f"Original words: {len(lines)}\nNew words: {len(new_words)}")

with open(PLECO_FLASH_FILE, 'w', encoding='utf-8') as f:
with open(PLECO_FLASH_FILE, "w", encoding="utf-8") as f:
f.writelines(new_words)


collect_compound_words()
27 changes: 13 additions & 14 deletions bin/compare_headwords.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,29 @@

file_names = ['ext-dict/star_anhvietanh.tab', 'ext-dict/SPDict-Anh-Viet-Anh.tab']
file_names = ["ext-dict/star_anhvietanh.tab", "ext-dict/SPDict-Anh-Viet-Anh.tab"]
word_sets = []

print(f'Set A: {file_names[0]} - Set B: {file_names[1]}')
print(f"Set A: {file_names[0]} - Set B: {file_names[1]}")
for filename in file_names:
with open(filename, 'r', encoding='utf-8') as f:
lines = f.read().split('\n')
with open(filename, "r", encoding="utf-8") as f:
lines = f.read().split("\n")
f.close()

words = []

for l in lines:
items = l.split('\t')
# if items[0].isalpha():
items = l.split("\t")
# if items[0].isalpha():
words.append(items[0])

word_sets.append(set(words))

print(f'No. words: set A: {len(word_sets[0])} set B: {len(word_sets[1])}')
print(f"No. words: set A: {len(word_sets[0])} set B: {len(word_sets[1])}")

print(f'No. intersected words: {len(word_sets[0].intersection(word_sets[1]))}')
print(f"No. intersected words: {len(word_sets[0].intersection(word_sets[1]))}")

print(f'No. different words in A not in B: {len(word_sets[0].difference(word_sets[1]))}') # \n{word_sets[0].difference(word_sets[1])}
print(f"No. different words in A not in B: {len(word_sets[0].difference(word_sets[1]))}") # \n{word_sets[0].difference(word_sets[1])}

print(f'No. different words in B not in A: {len(word_sets[1].difference(word_sets[0]))}')
print(f"No. different words in B not in A: {len(word_sets[1].difference(word_sets[0]))}")

print(f'No. symmetricly different words in B not in A: {len(word_sets[1].symmetric_difference(word_sets[0]))}')
print(f"No. symmetricly different words in B not in A: {len(word_sets[1].symmetric_difference(word_sets[0]))}")

#print(f'No. different words in B not in A: {word_sets[1].difference(word_sets[0])}')
# print(f'No. different words in B not in A: {word_sets[1].difference(word_sets[0])}')
Loading

0 comments on commit 25c9b86

Please sign in to comment.