Fixes reporting

catusf · Dec 18, 2024 · 588f038 · 588f038
1 parent f37f1a2
commit 588f038
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 106 deletions.
diff --git a/.gitignore b/.gitignore
@@ -78,3 +78,54 @@ ext-dict/ChineseThesaurus-mid.txt
 ext-dict/ChineseThesaurus-small.dfo
 ext-dict/ChineseThesaurus-small.tab
 ext-dict/ChineseThesaurus-small.txt
+ext-dict/HanziHeroPleco.dfo
+ext-dict/HanziHeroPleco.tab
+ext-dict/HanziHeroPleco.txt
+ext-dict/Ngu-vung-Danh-tu-Thien-hoc.dfo
+ext-dict/Ngu-vung-Danh-tu-Thien-hoc.tab
+ext-dict/opencc_pleco.txt
+ext-dict/Phat-Quang-Dai-tu-dien-Han-ngu.dfo
+ext-dict/Phat-Quang-Dai-tu-dien-Han-ngu.tab
+ext-dict/radical_name_pleco.dfo
+ext-dict/radical_name_pleco.tab
+ext-dict/radical_name_pleco.txt
+ext-dict/Rong-mo-tam-hon.dfo
+ext-dict/Rong-mo-tam-hon.tab
+ext-dict/TrungViet-big.dfo
+ext-dict/TrungViet-big.tab.bz2
+ext-dict/TrungViet-big.txt.bz2
+ext-dict/TrungViet-mid.dfo
+ext-dict/TrungViet-mid.tab
+ext-dict/TrungViet-mid.txt
+ext-dict/TrungViet-small.dfo
+ext-dict/TrungViet-small.tab
+ext-dict/TrungViet-small.txt
+ext-dict/Tu-dien-Dao-Uyen.dfo
+ext-dict/Tu-dien-Dao-Uyen.tab
+ext-dict/Tu-dien-Phat-hoc-Anh-Han-Viet.dfo
+ext-dict/Tu-dien-Phat-hoc-Anh-Han-Viet.tab
+ext-dict/Tu-dien-Phat-hoc-Tinh-tuyen.dfo
+ext-dict/Tu-dien-Phat-hoc-Tinh-tuyen.tab
+ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Dong-Loai.dfo
+ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Dong-Loai.tab
+ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Minh-Thong.dfo
+ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Minh-Thong.tab
+ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Thien-Phuc.dfo
+ext-dict/Tu-dien-Phat-hoc-Viet-Anh-Thien-Phuc.tab
+ext-dict/Tu-dien-Phat-Quang.dfo
+ext-dict/Tu-dien-Phat-Quang.tab
+ext-dict/Tu-dien-ThienChuu-TranVanChanh.dfo
+ext-dict/Tu-dien-ThienChuu-TranVanChanh.pleco
+ext-dict/Tu-dien-ThienChuu-TranVanChanh.tab
+ext-dict/Tu-dien-ThienChuu+TranVanChanh_v1.0.pqb
+ext-dict/Tu-dien-Tong-hop-Phat-hoc.dfo
+ext-dict/Tu-dien-Tong-hop-Phat-hoc.json
+ext-dict/Tu-dien-Tong-hop-Phat-hoc.tab
+ext-dict/TudienAnhVietAnh.dfo
+ext-dict/TudienAnhVietAnh.tab
+ext-dict/TudienAnhVietBeta-Inflections.txt
+ext-dict/TudienAnhVietBeta.dfo
+ext-dict/TudienAnhVietBeta.tab
+ext-dict/TudienThienChuu.dfo
+ext-dict/TudienThienChuu.tab
+ext-dict/TudienThienChuu.txt
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -5,24 +5,15 @@
     "version": "0.2.0",
     "configurations": [
         {
-<<<<<<< HEAD
             "name": "Python Debugger: Current File with Arguments",
-=======
-            "name": "Python: Current File",
->>>>>>> 376004ac6181baa40ae9042204a57630f6e2a9ba
             "type": "debugpy",
             "request": "launch",
             "program": "${file}",
             "console": "integratedTerminal",
             "args": [
-<<<<<<< HEAD
-                "--extensions=mobi,epub",
-                "--read-only=no"
-=======
-                "--input_folder=./dict",
-                "--output_folder=./output",
-                "--extension=tab",
->>>>>>> 376004ac6181baa40ae9042204a57630f6e2a9ba
+                "-d=ext-dict",
+                "-o=ext-output",
+                "-r=no"
             ]
         }
     ]

diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 sample:
-	uv run python ./bin/convert_all.py --input_folder=dict --output_folder=output --extension=tab --filter=Hero
+	uv run python ./bin/convert_all.py --input_folder=dict --output_folder=output --extension=tab --filter=C
 	echo "Release sample"
 
 all:
@@ -16,3 +16,6 @@ dict_new_stats:
 
 dict_stats:
 	uv run python ./bin/dict_summary.py --dict_dir=dict --read_only=yes
+
+dict_ext_stats:
+	uv run python ./bin/dict_summary.py --dict_dir=ext-dict --read_only=yes
diff --git a/bin/convert_all.py b/bin/convert_all.py
@@ -126,13 +126,13 @@ def escape_forbidden_chars(text, forbidden_chars=r" (){}[]$*?^|<>\\"):
 def main() -> None:
     """Main entry point"""  # noqa: D401
     parser = argparse.ArgumentParser(description="Convert all dictionaries in a folder")
-    parser.add_argument("-i", "--input_folder", required=True, help="Input folder containing .tsv and .dfo files")
-    parser.add_argument("-o", "--output_folder", required=True, help="Output folder containing dictionary files")
+    parser.add_argument("-i", "--input_folder", default="dict", help="Input folder containing .tsv and .dfo files")
+    parser.add_argument("-o", "--output_folder", default="output", help="Output folder containing dictionary files")
     parser.add_argument("-e", "--extension", default="tab", help="Filename extention for input dictionary files. Default is .tab")
     parser.add_argument("-m", "--metadata", default="dfo", help="Filename extention for input metadata for dictionary. Default is .dfo")
     parser.add_argument("-f", "--filter", help="Filter only dictionary entries with matching keys (seperated by comma)")
 
-    args, array = parser.parse_known_args()
+    args = parser.parse_args()
 
     input_folder = escape_forbidden_chars(args.input_folder)
     output_folder = escape_forbidden_chars(args.output_folder)

diff --git a/bin/dict_summary.py b/bin/dict_summary.py
@@ -56,13 +56,14 @@ def count_lines_in_tab(tab_path):
 
 
 SUPPORTED_EXTENSIONS = {
-    "dictd.zip": "DictD",
-    "dsl.dz": "Lingvo (DSL)",
-    "epub": "EPUB",
-    "kobo.zip": "Kobo",
-    "mobi": "Kindle (.mobi)",
-    "stardict.zip": "StartDict",
-    "yomitan.zip": "Yomitan",
+    "dictd.zip": {"dir": "dictd", "name": "DictD"},
+    "dsl.dz": {"dir": "lingvo", "name": "Lingvo (DSL)"},
+    "epub": {"dir": "epub", "name": "EPUB"},
+    "kobo.zip": {"dir": "kobo", "name": "Kobo"},
+    "mobi": {"dir": "kindle", "name": "Kindle (.mobi)"},
+    "stardict.zip": {"dir": "stardict", "name": "StartDict"},
+    "yomitan.zip": {"dir": "yomitan", "name": "Yomitan"},
+    # "mdict.zip": {"dir": "mdict", "name": "MDict"},
 }
 
 COLUMNS = {
@@ -98,102 +99,112 @@ def generate_summary(dict_dir, output_dir):
     num_dict_found = 0
 
     for filename in os.listdir(dict_dir):
-        if filename.endswith(".dfo"):
-            filebase = filename[:-4]
-            dfo_path = os.path.join(dict_dir, filename)
-            tab_path = os.path.join(dict_dir, filebase + ".tab")
-
-            # Parse the .dfo file
-            metadata = parse_dfo_file(dfo_path)
-
-            # Count lines in the corresponding .tab file
-            num_definitions = count_lines_in_tab(tab_path)
-
-            # Generate the download URL for the main file
-            # main_download_url = f"https://github.com/catusf/tudien/releases/tag/{TAG_DOWNLOAD}/all-kindle.zip"
-
-            # Get the additional downloadable files
-            download_urls = get_downloadable_files(filebase, DOWNLOAD_TAG, dict_dir)
-
-            # Get full language names in Vietnamese
-            source_full_name = langcodes.Language.get(metadata["Source"]).display_name("vi")
-            # language_names.get(metadata['Source'], f"Unknown ({metadata['Source']})")
-            target_full_name = langcodes.Language.get(metadata["Target"]).display_name("vi")
-            # language_names.get(metadata['Target'], f"Unknown ({metadata['Target']})")
-
-            # Append the data to the list
-            data.append(
-                {
-                    "Number": len(data) + 1,  # Add numbering
-                    "Name": metadata["Name"],
-                    "Description": metadata["Description"],
-                    "Source": f"{source_full_name} ({metadata['Source']})",  # Full language name in Vietnamese
-                    "Target": f"{target_full_name} ({metadata['Target']})",  # Full language name in Vietnamese
-                    "Owner/Editor": metadata["Owner/Editor"],
-                    "Version": metadata["Version"],
-                    "Definitions": num_definitions,
-                    "Download": download_urls,
-                }
-            )
-
-            num_dict_found += 1
-
-        # Save the list of dictionaries as a JSON file
-        json_path = os.path.join(dict_dir, "dict_summary.json")
-        with open(json_path, "w", encoding="utf-8") as json_file:
-            json.dump(data, json_file, ensure_ascii=False, indent=4)
+        if not filename.endswith(".dfo"):
+            continue
+        filebase = filename[:-4]
+        dfo_path = os.path.join(dict_dir, filename)
+        tab_path = os.path.join(dict_dir, filebase + ".tab")
+
+        # Parse the .dfo file
+        metadata = parse_dfo_file(dfo_path)
+
+        # Count lines in the corresponding .tab file
+        num_definitions = count_lines_in_tab(tab_path)
+
+        # Generate the download URL for the main file
+        # main_download_url = f"https://github.com/catusf/tudien/releases/tag/{TAG_DOWNLOAD}/all-kindle.zip"
+
+        # Get the additional downloadable files
+        download_urls = get_downloadable_files(filebase, DOWNLOAD_TAG, dict_dir)
+
+        for ext in SUPPORTED_EXTENSIONS:
+            needed_files.append(os.path.join(output_dir, filebase + "." + ext))
+
+        # Get full language names in Vietnamese
+        source_full_name = langcodes.Language.get(metadata["Source"]).display_name("vi")
+        # language_names.get(metadata['Source'], f"Unknown ({metadata['Source']})")
+        target_full_name = langcodes.Language.get(metadata["Target"]).display_name("vi")
+        # language_names.get(metadata['Target'], f"Unknown ({metadata['Target']})")
+
+        # Append the data to the list
+        data.append(
+            {
+                "Number": len(data) + 1,  # Add numbering
+                "Name": metadata["Name"],
+                "Description": metadata["Description"],
+                "Source": f"{source_full_name} ({metadata['Source']})",  # Full language name in Vietnamese
+                "Target": f"{target_full_name} ({metadata['Target']})",  # Full language name in Vietnamese
+                "Owner/Editor": metadata["Owner/Editor"],
+                "Version": metadata["Version"],
+                "Definitions": num_definitions,
+                "Download": download_urls,
+            }
+        )
+
+        num_dict_found += 1
+
+    for ext in SUPPORTED_EXTENSIONS:
+        item = SUPPORTED_EXTENSIONS[ext]
+        needed_files.append(os.path.join(output_dir, f"all-{item['dir']}.zip"))
+
+    # Save the list of dictionaries as a JSON file
+    json_path = os.path.join(dict_dir, "dict_summary.json")
+    with open(json_path, "w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, ensure_ascii=False, indent=4)
 
-            print(f"Data file writtend to '{json_path}'.")
+        print(f"Data file writtend to '{json_path}'.")
 
-        existing_files = sorted(glob.glob(os.path.join(output_dir, "*.*")))
+    existing_files = sorted(glob.glob(os.path.join(output_dir, "*.*")))
 
-        missing_files = sorted(set(needed_files) - set(existing_files))
+    missing_files = sorted(set(needed_files) - set(existing_files))
 
-        print("JSON file 'dict_summary.json' has been generated.")
+    print("JSON file 'dict_summary.json' has been generated.")
 
-        files_status = "# Status report\n\n"
-        files_status += "## Counts\n\n"
-        files_per_format = len(SUPPORTED_EXTENSIONS)
-        existing_dicts = (len(existing_files) - files_per_format) / files_per_format
-        missing_dicts = len(missing_files) / files_per_format
-        if existing_dicts < 0:
-            existing_dicts = 0
-            missing_dicts -= 1
+    files_status = "# Status report\n\n"
+    files_status += "## Counts\n\n"
+    files_per_format = len(SUPPORTED_EXTENSIONS)
+    existing_dicts = (len(existing_files) - files_per_format) / files_per_format
+    missing_dicts = len(missing_files) / files_per_format
 
-        mismatched_dicts = num_dict_found - (existing_dicts + missing_dicts)
+    if existing_dicts < 0:
+        existing_dicts = 0
+        missing_dicts -= 1
 
-        assert num_dict_found == len(data)
-        # assert files_per_format * (num_dict_found + 1) == len(needed_files)
+    mismatched_dicts = num_dict_found - (existing_dicts + missing_dicts)
 
-        files_status += f"- There are **{len(data)}** dict files.\n\n"
-        files_status += f"- Total NEEDED files: **{len(needed_files)}**\n\n"
-        files_status += f"- Total EXISTING files: **{len(existing_files)}** "
-        files_status += f"- or **{existing_dicts:.1f}** dictionaries. "
-        if len(existing_files) % files_per_format != 0:
-            files_status += "ABNORMAL NUMBER of files. Some dict has **missing format(s)**. Check missing files list for details.\n\n"
-        else:
-            files_status += "The number of files looks NORMAL.\n\n"
+    assert num_dict_found == len(data)
+    # assert files_per_format * (num_dict_found + 1) == len(needed_files)
+
+    files_status += f"- There are **{len(data)}** dict files.\n\n"
+    files_status += f"- Total NEEDED files: **{len(needed_files)}**\n\n"
+    files_status += f"- Total GENERATED files: **{len(existing_files)}** "
+    files_status += f"- or **{existing_dicts:.1f}** dictionary sets. "
+
+    if len(missing_files) or len(existing_files) % files_per_format != 0:
+        files_status += "ABNORMAL NUMBER of files. Some dict has **missing format(s)**. Check missing files list for details.\n\n"
+    else:
+        files_status += "The number of files looks NORMAL.\n\n"
 
-        files_status += f"- Total MISSING files: {len(missing_files)}** "
-        files_status += f"(or **{missing_dicts:.1f}** dictionaries which is {'CORRECT' if mismatched_dicts == 0 else 'IN-CORRECT'})\n\n"
+    files_status += f"- Total MISSING files: {len(missing_files)}** "
+    files_status += f"(or **{missing_dicts:.1f}** dictionaries which is {'CORRECT' if mismatched_dicts == 0 else 'IN-CORRECT'})\n\n"
 
-        files_status_details = "# Errors\n"
+    files_status_details = "# Errors\n"
 
-        files_status_details += f"## Missing files list\n\n"
-        for item in missing_files:
-            files_status_details += f"\t{item}\n"
+    files_status_details += f"## Missing files list\n\n"
+    for item in missing_files:
+        files_status_details += f"\t{item}\n"
 
-        print(files_status_details)
-        print(files_status)
+    print(files_status_details)
+    print(files_status)
 
-        return data, files_status, files_status_details
+    return data, files_status, files_status_details
 
 
 def generate_markdown_table(data, files_status, files_status_details, extensions, columns):
     """Generate a markdown table from the data."""
     print(f"Generating report for {len(data)} dictionaries for {extensions}")
 
-    types = [SUPPORTED_EXTENSIONS[ext] for ext in extensions]
+    types = [SUPPORTED_EXTENSIONS[ext]["name"] for ext in extensions]
     header = "| Number | Name | "  # " Description | Source | Target | Owner/Editor | Definitions | " + " | ".join(types)]
     seperator = "| --- | --- | "  # " --- | --- | --- | --- | --- |" + " --- |" * len(extensions)]
 
@@ -264,12 +275,12 @@ def main():
 
     # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Generate a dictionary summary.")
-    parser.add_argument("--dict_dir", type=str, nargs="?", default="dict", help="The directory containing the dictionary files (default is 'dict').")
-    parser.add_argument("--outfile", type=str, nargs="?", default="dict_summary.md", help="The output report file name (default is 'dict_summary.md').")
-    parser.add_argument("--output_dir", type=str, nargs="?", default="output", help="The output dir for all the dict results.")
-    parser.add_argument("--extensions", type=str, nargs="?", default=None, help="The extensions that need included in the report. None means all.")
-    parser.add_argument("--columns", type=str, nargs="?", default=None, help="The columns that will be kept (Other than the download links).")
-    parser.add_argument("--read_only", choices=["yes", "no"], default="no", required=False, help="Read data or create it.")
+    parser.add_argument("-d", "--dict_dir", default="dict", help="The directory containing the dictionary files (default is 'dict').")
+    parser.add_argument("-f", "--outfile", default="dict_summary.md", help="The output report file name (default is 'dict_summary.md').")
+    parser.add_argument("-o", "--output_dir", default="output", help="The output dir for all the dict results.")
+    parser.add_argument("-e", "--extensions", default=None, help="The extensions that need included in the report. None means all.")
+    parser.add_argument("-c", "--columns", default=None, help="The columns that will be kept (Other than the download links).")
+    parser.add_argument("-r", "--read_only", choices=["yes", "no"], default="no", required=False, help="Read data or create it.")
 
     args = parser.parse_args()