From 8e15953fd5c060850adc691aab407cfafec2b1f4 Mon Sep 17 00:00:00 2001
From: NSoiffer <NSoiffer@users.noreply.github.com>
Date: Sun, 18 Aug 2024 03:54:29 +0100
Subject: [PATCH] Some work for Lambda code [not useful yet]

---
 PythonScripts/euro-braille.py | 70 +++++++++++++++++++++++++++++++----
 1 file changed, 62 insertions(+), 8 deletions(-)

diff --git a/PythonScripts/euro-braille.py b/PythonScripts/euro-braille.py
index 4052e4ae..db21c18c 100644
--- a/PythonScripts/euro-braille.py
+++ b/PythonScripts/euro-braille.py
@@ -24,7 +24,7 @@ def create_unicode_from_latex_symbols_html(out_file: str):
             unicode_list = list(map(lambda x: x.find('p').contents[0].split(' ')[0], foo))
             combined = sorted(zip(unicode_list, latex_list))
             for unicode, latex in combined:
-                write_line(unicode, latex, "", out_stream)
+                write_line(unicode, latex, "", False, out_stream)
 
 
 LATEX_COMMENT = """\
@@ -92,12 +92,15 @@ def get_unicode_standard_symbols() -> dict[str, list[str]]:
 UNICODE_CH_PATTERN = re.compile(r' - "(.)"')
 
 
-def get_unicode_yaml_chars() -> set[str]:
+def get_unicode_yaml_chars(file: str, include_ascii: bool) -> set[str]:
+    """Returns a set of all the chars in 'file' (full path).
+       If 'include_ascii' is False, ASCII chars are excluded.
+    """
     answer = set()
-    with open("../Rules/Languages/en/unicode.yaml", "r", encoding='utf8') as unicode_stream:
+    with open(file, "r", encoding='utf8') as unicode_stream:
         for line in unicode_stream.readlines():
             matched = UNICODE_CH_PATTERN.match(line)
-            if matched and ord(matched.group(1)) > 127:
+            if matched and (include_ascii or ord(matched.group(1)) > 127):
                 answer.add(matched.group(1))
         for ch in range(ord('Α'), ord('Ω')):   # these are a range in unicode.yaml, so the pattern doesn't match
             answer.add(chr(ch))
@@ -105,7 +108,7 @@ def get_unicode_yaml_chars() -> set[str]:
 
 
 # The chars in unicode.yaml (others go into unicode-full.yaml)
-UNICODE_CHARS_SHORT = get_unicode_yaml_chars()
+UNICODE_CHARS_SHORT = get_unicode_yaml_chars("../Rules/Languages/en/unicode.yaml", False)
 
 
 def get_short_dict() -> dict[str, str]:
@@ -168,7 +171,8 @@ def extract_latex(in_file):
                 # add in ASCII and the Greek block
                 is_in_common_char_blocks = code < 0x7F or (0x0370 <= code and code <= 0x03fF)
                 stream = short_stream if ch in UNICODE_CHARS_SHORT or is_in_common_char_blocks else full_stream
-                # use the standard name unless the char is in the override dict (if it and the standard name is an option, write it first)
+                # use the standard name unless the char is in the override dict
+                #  if it and the standard name is an option, write it first
                 if ch in standard_names:
                     latex_names = standard_names[ch]
                     is_overridden = ch in overrides
@@ -404,7 +408,57 @@ def create_ascii_math(out_file: str):
             write_line(chr(0x2064), '', '', False, out_stream)
 
 
+def print_lambda_list():
+    """Print (sorted, one per line) the chars found for UEB's unicode.yaml, ASCII included; for lambda conversion."""
+    ueb_chars = get_unicode_yaml_chars("../Rules/Braille/UEB/unicode.yaml", True)
+    print("\n".join(sorted(ueb_chars)))
+
+
+def missing_unicode_chars() -> None:
+    """Write to 'missing_chars.yaml' the math chars in unicode.xml that aren't in MathCAT's English unicode files.
+       A char counts as "math" if its 'unicodedata' has a 'mathclass' other than A (alphabetic) or G (glyph).
+       Prints the sizes of the sets involved; exits with status 1 if unicode.xml has no 'charlist' element.
+    """
+    unicode_xml = r"c:\dev\mathml-refresh\xml-entities\unicode.xml"
+    tree = ET.parse(unicode_xml)
+    root: ET.Element = tree.getroot()
+    print(f"Root='{root}'")
+    all_char_elements = root.find("charlist")
+    if all_char_elements is None:
+        print(f"Didn't find 'charlist' element in {unicode_xml}!")
+        raise SystemExit(1)     # same exit status as exit(1), without relying on the 'site' module's helper
+    # gather the chars from unicode.xml that have a (non-alphabetic, non-glyph) math class
+    all_unicode_math_chars = set()
+    for char_element in all_char_elements:
+        unicode_data = char_element.find("unicodedata")
+        if unicode_data is None:
+            continue
+        mathclass = unicode_data.get("mathclass", default="none")
+        if mathclass in ("none", "A", "G"):  # no math class, or Alphabetic and Glyph classes
+            continue
+        ch = char_element.get("id")
+        if ch is None:
+            print('char_element.get("id") is None!')
+            continue
+        ch = convert_to_char(ch)
+        if len(ch) > 1:     # skip ids whose conversion is longer than a single char
+            continue
+        all_unicode_math_chars.add(ch)
+    print(f"#all_unicode_math_chars = {len(all_unicode_math_chars)}")
+    mathcat_chars = get_unicode_yaml_chars("../Rules/Languages/en/unicode.yaml", True) \
+        .union(get_unicode_yaml_chars("../Rules/Languages/en/unicode-full.yaml", True))
+    print(f"#mathcat_chars = {len(mathcat_chars)}")
+    missing_chars = all_unicode_math_chars.difference(mathcat_chars)
+    print(f"#missing_chars = {len(missing_chars)}")
+
+    with open("missing_chars.yaml", 'w', encoding='utf8') as out_stream:
+        for ch in sorted(missing_chars):
+            write_line(ch, '', '', False, out_stream)
+
+
 # create_unicode_from_list_of_symbols_html("euro-symbols2.yaml")
 # create_greek_letters("greek-letters.yaml")
-# extract_latex("c:\\dev\\mathml-refresh\\xml-entities\\unicode.xml")
-create_ascii_math("ascii-math-unicode.yaml")
+# extract_latex(r"c:\dev\mathml-refresh\xml-entities\unicode.xml")
+# create_ascii_math("ascii-math-unicode.yaml")
+# print_lambda_list()
+missing_unicode_chars()