Skip to content

Commit

Permalink
Fixes reporting
Browse files Browse the repository at this point in the history
  • Loading branch information
catusphan committed Dec 18, 2024
1 parent f37f1a2 commit 588f038
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 106 deletions.
51 changes: 51 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,54 @@ ext-dict/ChineseThesaurus-mid.txt
ext-dict/ChineseThesaurus-small.dfo
ext-dict/ChineseThesaurus-small.tab
ext-dict/ChineseThesaurus-small.txt
ext-dict/HanziHeroPleco.dfo
ext-dict/HanziHeroPleco.tab
ext-dict/HanziHeroPleco.txt
ext-dict/Ngu-vung-Danh-tu-Thien-hoc.dfo
ext-dict/Ngu-vung-Danh-tu-Thien-hoc.tab
ext-dict/opencc_pleco.txt
ext-dict/Phat-Quang-Dai-tu-dien-Han-ngu.dfo
ext-dict/Phat-Quang-Dai-tu-dien-Han-ngu.tab
ext-dict/radical_name_pleco.dfo
ext-dict/radical_name_pleco.tab
ext-dict/radical_name_pleco.txt
ext-dict/Rong-mo-tam-hon.dfo
ext-dict/Rong-mo-tam-hon.tab
ext-dict/TrungViet-big.dfo
ext-dict/TrungViet-big.tab.bz2
ext-dict/TrungViet-big.txt.bz2
ext-dict/TrungViet-mid.dfo
ext-dict/TrungViet-mid.tab
ext-dict/TrungViet-mid.txt
ext-dict/TrungViet-small.dfo
ext-dict/TrungViet-small.tab
ext-dict/TrungViet-small.txt
ext-dict/Tu-dien-Dao-Uyen.dfo
ext-dict/Tu-dien-Dao-Uyen.tab
ext-dict/Tu-dien-Phat-hoc-Anh-Han-Viet.dfo
ext-dict/Tu-dien-Phat-hoc-Anh-Han-Viet.tab
ext-dict/Tu-dien-Phat-hoc-Tinh-tuyen.dfo
ext-dict/Tu-dien-Phat-hoc-Tinh-tuyen.tab
ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Dong-Loai.dfo
ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Dong-Loai.tab
ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Minh-Thong.dfo
ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Minh-Thong.tab
ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Thien-Phuc.dfo
ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Thien-Phuc.tab
ext-dict/Tu-dien-Phat-Quang.dfo
ext-dict/Tu-dien-Phat-Quang.tab
ext-dict/Tu-dien-ThienChuu-TranVanChanh.dfo
ext-dict/Tu-dien-ThienChuu-TranVanChanh.pleco
ext-dict/Tu-dien-ThienChuu-TranVanChanh.tab
ext-dict/Tu-dien-ThienChuu+TranVanChanh_v1.0.pqb
ext-dict/Tu-dien-Tong-hop-Phat-hoc.dfo
ext-dict/Tu-dien-Tong-hop-Phat-hoc.json
ext-dict/Tu-dien-Tong-hop-Phat-hoc.tab
ext-dict/TudienAnhVietAnh.dfo
ext-dict/TudienAnhVietAnh.tab
ext-dict/TudienAnhVietBeta-Inflections.txt
ext-dict/TudienAnhVietBeta.dfo
ext-dict/TudienAnhVietBeta.tab
ext-dict/TudienThienChuu.dfo
ext-dict/TudienThienChuu.tab
ext-dict/TudienThienChuu.txt
15 changes: 3 additions & 12 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,15 @@
"version": "0.2.0",
"configurations": [
{
<<<<<<< HEAD
"name": "Python Debugger: Current File with Arguments",
=======
"name": "Python: Current File",
>>>>>>> 376004ac6181baa40ae9042204a57630f6e2a9ba
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"args": [
<<<<<<< HEAD
"--extensions=mobi,epub",
"--read-only=no"
=======
"--input_folder=./dict",
"--output_folder=./output",
"--extension=tab",
>>>>>>> 376004ac6181baa40ae9042204a57630f6e2a9ba
"-d=ext-dict",
"-o=ext-output",
"-r=no"
]
}
]
Expand Down
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sample:
uv run python ./bin/convert_all.py --input_folder=dict --output_folder=output --extension=tab --filter=Hero
uv run python ./bin/convert_all.py --input_folder=dict --output_folder=output --extension=tab --filter=C
echo "Release sample"

all:
Expand All @@ -16,3 +16,6 @@ dict_new_stats:

dict_stats:
uv run python ./bin/dict_summary.py --dict_dir=dict --read_only=yes

dict_ext_stats:
uv run python ./bin/dict_summary.py --dict_dir=ext-dict --read_only=yes
6 changes: 3 additions & 3 deletions bin/convert_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,13 @@ def escape_forbidden_chars(text, forbidden_chars=r" (){}[]$*?^|<>\\"):
def main() -> None:
"""Main entry point""" # noqa: D401
parser = argparse.ArgumentParser(description="Convert all dictionaries in a folder")
parser.add_argument("-i", "--input_folder", required=True, help="Input folder containing .tsv and .dfo files")
parser.add_argument("-o", "--output_folder", required=True, help="Output folder containing dictionary files")
parser.add_argument("-i", "--input_folder", default="dict", help="Input folder containing .tsv and .dfo files")
parser.add_argument("-o", "--output_folder", default="output", help="Output folder containing dictionary files")
parser.add_argument("-e", "--extension", default="tab", help="Filename extention for input dictionary files. Default is .tab")
parser.add_argument("-m", "--metadata", default="dfo", help="Filename extention for input metadata for dictionary. Default is .dfo")
parser.add_argument("-f", "--filter", help="Filter only dictionary entries with matching keys (seperated by comma)")

args, array = parser.parse_known_args()
args = parser.parse_args()

input_folder = escape_forbidden_chars(args.input_folder)
output_folder = escape_forbidden_chars(args.output_folder)
Expand Down
191 changes: 101 additions & 90 deletions bin/dict_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,14 @@ def count_lines_in_tab(tab_path):


SUPPORTED_EXTENSIONS = {
"dictd.zip": "DictD",
"dsl.dz": "Lingvo (DSL)",
"epub": "EPUB",
"kobo.zip": "Kobo",
"mobi": "Kindle (.mobi)",
"stardict.zip": "StartDict",
"yomitan.zip": "Yomitan",
"dictd.zip": {"dir": "dictd", "name": "DictD"},
"dsl.dz": {"dir": "lingvo", "name": "Lingvo (DSL)"},
"epub": {"dir": "epub", "name": "EPUB"},
"kobo.zip": {"dir": "kobo", "name": "Kobo"},
"mobi": {"dir": "kindle", "name": "Kindle (.mobi)"},
"stardict.zip": {"dir": "stardict", "name": "StartDict"},
"yomitan.zip": {"dir": "yomitan", "name": "Yomitan"},
# "mdict.zip": {"dir": "mdict", "name": "MDict"},
}

COLUMNS = {
Expand Down Expand Up @@ -98,102 +99,112 @@ def generate_summary(dict_dir, output_dir):
num_dict_found = 0

for filename in os.listdir(dict_dir):
if filename.endswith(".dfo"):
filebase = filename[:-4]
dfo_path = os.path.join(dict_dir, filename)
tab_path = os.path.join(dict_dir, filebase + ".tab")

# Parse the .dfo file
metadata = parse_dfo_file(dfo_path)

# Count lines in the corresponding .tab file
num_definitions = count_lines_in_tab(tab_path)

# Generate the download URL for the main file
# main_download_url = f"https://github.com/catusf/tudien/releases/tag/{TAG_DOWNLOAD}/all-kindle.zip"

# Get the additional downloadable files
download_urls = get_downloadable_files(filebase, DOWNLOAD_TAG, dict_dir)

# Get full language names in Vietnamese
source_full_name = langcodes.Language.get(metadata["Source"]).display_name("vi")
# language_names.get(metadata['Source'], f"Unknown ({metadata['Source']})")
target_full_name = langcodes.Language.get(metadata["Target"]).display_name("vi")
# language_names.get(metadata['Target'], f"Unknown ({metadata['Target']})")

# Append the data to the list
data.append(
{
"Number": len(data) + 1, # Add numbering
"Name": metadata["Name"],
"Description": metadata["Description"],
"Source": f"{source_full_name} ({metadata['Source']})", # Full language name in Vietnamese
"Target": f"{target_full_name} ({metadata['Target']})", # Full language name in Vietnamese
"Owner/Editor": metadata["Owner/Editor"],
"Version": metadata["Version"],
"Definitions": num_definitions,
"Download": download_urls,
}
)

num_dict_found += 1

# Save the list of dictionaries as a JSON file
json_path = os.path.join(dict_dir, "dict_summary.json")
with open(json_path, "w", encoding="utf-8") as json_file:
json.dump(data, json_file, ensure_ascii=False, indent=4)
if not filename.endswith(".dfo"):
continue
filebase = filename[:-4]
dfo_path = os.path.join(dict_dir, filename)
tab_path = os.path.join(dict_dir, filebase + ".tab")

# Parse the .dfo file
metadata = parse_dfo_file(dfo_path)

# Count lines in the corresponding .tab file
num_definitions = count_lines_in_tab(tab_path)

# Generate the download URL for the main file
# main_download_url = f"https://github.com/catusf/tudien/releases/tag/{TAG_DOWNLOAD}/all-kindle.zip"

# Get the additional downloadable files
download_urls = get_downloadable_files(filebase, DOWNLOAD_TAG, dict_dir)

for ext in SUPPORTED_EXTENSIONS:
needed_files.append(os.path.join(output_dir, filebase + "." + ext))

# Get full language names in Vietnamese
source_full_name = langcodes.Language.get(metadata["Source"]).display_name("vi")
# language_names.get(metadata['Source'], f"Unknown ({metadata['Source']})")
target_full_name = langcodes.Language.get(metadata["Target"]).display_name("vi")
# language_names.get(metadata['Target'], f"Unknown ({metadata['Target']})")

# Append the data to the list
data.append(
{
"Number": len(data) + 1, # Add numbering
"Name": metadata["Name"],
"Description": metadata["Description"],
"Source": f"{source_full_name} ({metadata['Source']})", # Full language name in Vietnamese
"Target": f"{target_full_name} ({metadata['Target']})", # Full language name in Vietnamese
"Owner/Editor": metadata["Owner/Editor"],
"Version": metadata["Version"],
"Definitions": num_definitions,
"Download": download_urls,
}
)

num_dict_found += 1

for ext in SUPPORTED_EXTENSIONS:
item = SUPPORTED_EXTENSIONS[ext]
needed_files.append(os.path.join(output_dir, f"all-{item['dir']}.zip"))

# Save the list of dictionaries as a JSON file
json_path = os.path.join(dict_dir, "dict_summary.json")
with open(json_path, "w", encoding="utf-8") as json_file:
json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"Data file writtend to '{json_path}'.")
print(f"Data file writtend to '{json_path}'.")

existing_files = sorted(glob.glob(os.path.join(output_dir, "*.*")))
existing_files = sorted(glob.glob(os.path.join(output_dir, "*.*")))

missing_files = sorted(set(needed_files) - set(existing_files))
missing_files = sorted(set(needed_files) - set(existing_files))

print("JSON file 'dict_summary.json' has been generated.")
print("JSON file 'dict_summary.json' has been generated.")

files_status = "# Status report\n\n"
files_status += "## Counts\n\n"
files_per_format = len(SUPPORTED_EXTENSIONS)
existing_dicts = (len(existing_files) - files_per_format) / files_per_format
missing_dicts = len(missing_files) / files_per_format
if existing_dicts < 0:
existing_dicts = 0
missing_dicts -= 1
files_status = "# Status report\n\n"
files_status += "## Counts\n\n"
files_per_format = len(SUPPORTED_EXTENSIONS)
existing_dicts = (len(existing_files) - files_per_format) / files_per_format
missing_dicts = len(missing_files) / files_per_format

mismatched_dicts = num_dict_found - (existing_dicts + missing_dicts)
if existing_dicts < 0:
existing_dicts = 0
missing_dicts -= 1

assert num_dict_found == len(data)
# assert files_per_format * (num_dict_found + 1) == len(needed_files)
mismatched_dicts = num_dict_found - (existing_dicts + missing_dicts)

files_status += f"- There are **{len(data)}** dict files.\n\n"
files_status += f"- Total NEEDED files: **{len(needed_files)}**\n\n"
files_status += f"- Total EXISTING files: **{len(existing_files)}** "
files_status += f"- or **{existing_dicts:.1f}** dictionaries. "
if len(existing_files) % files_per_format != 0:
files_status += "ABNORMAL NUMBER of files. Some dict has **missing format(s)**. Check missing files list for details.\n\n"
else:
files_status += "The number of files looks NORMAL.\n\n"
assert num_dict_found == len(data)
# assert files_per_format * (num_dict_found + 1) == len(needed_files)

files_status += f"- There are **{len(data)}** dict files.\n\n"
files_status += f"- Total NEEDED files: **{len(needed_files)}**\n\n"
files_status += f"- Total GENERATED files: **{len(existing_files)}** "
files_status += f"- or **{existing_dicts:.1f}** dictionary sets. "

if len(missing_files) or len(existing_files) % files_per_format != 0:
files_status += "ABNORMAL NUMBER of files. Some dict has **missing format(s)**. Check missing files list for details.\n\n"
else:
files_status += "The number of files looks NORMAL.\n\n"

files_status += f"- Total MISSING files: {len(missing_files)}** "
files_status += f"(or **{missing_dicts:.1f}** dictionaries which is {'CORRECT' if mismatched_dicts == 0 else 'IN-CORRECT'})\n\n"
files_status += f"- Total MISSING files: {len(missing_files)}** "
files_status += f"(or **{missing_dicts:.1f}** dictionaries which is {'CORRECT' if mismatched_dicts == 0 else 'IN-CORRECT'})\n\n"

files_status_details = "# Errors\n"
files_status_details = "# Errors\n"

files_status_details += f"## Missing files list\n\n"
for item in missing_files:
files_status_details += f"\t{item}\n"
files_status_details += f"## Missing files list\n\n"
for item in missing_files:
files_status_details += f"\t{item}\n"

print(files_status_details)
print(files_status)
print(files_status_details)
print(files_status)

return data, files_status, files_status_details
return data, files_status, files_status_details


def generate_markdown_table(data, files_status, files_status_details, extensions, columns):
"""Generate a markdown table from the data."""
print(f"Generating report for {len(data)} dictionaries for {extensions}")

types = [SUPPORTED_EXTENSIONS[ext] for ext in extensions]
types = [SUPPORTED_EXTENSIONS[ext]["name"] for ext in extensions]
header = "| Number | Name | " # " Description | Source | Target | Owner/Editor | Definitions | " + " | ".join(types)]
seperator = "| --- | --- | " # " --- | --- | --- | --- | --- |" + " --- |" * len(extensions)]

Expand Down Expand Up @@ -264,12 +275,12 @@ def main():

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Generate a dictionary summary.")
parser.add_argument("--dict_dir", type=str, nargs="?", default="dict", help="The directory containing the dictionary files (default is 'dict').")
parser.add_argument("--outfile", type=str, nargs="?", default="dict_summary.md", help="The output report file name (default is 'dict_summary.md').")
parser.add_argument("--output_dir", type=str, nargs="?", default="output", help="The output dir for all the dict results.")
parser.add_argument("--extensions", type=str, nargs="?", default=None, help="The extensions that need included in the report. None means all.")
parser.add_argument("--columns", type=str, nargs="?", default=None, help="The columns that will be kept (Other than the download links).")
parser.add_argument("--read_only", choices=["yes", "no"], default="no", required=False, help="Read data or create it.")
parser.add_argument("-d", "--dict_dir", default="dict", help="The directory containing the dictionary files (default is 'dict').")
parser.add_argument("-f", "--outfile", default="dict_summary.md", help="The output report file name (default is 'dict_summary.md').")
parser.add_argument("-o", "--output_dir", default="output", help="The output dir for all the dict results.")
parser.add_argument("-e", "--extensions", default=None, help="The extensions that need included in the report. None means all.")
parser.add_argument("-c", "--columns", default=None, help="The columns that will be kept (Other than the download links).")
parser.add_argument("-r", "--read_only", choices=["yes", "no"], default="no", required=False, help="Read data or create it.")

args = parser.parse_args()

Expand Down

0 comments on commit 588f038

Please sign in to comment.