# PythonScripts/nemeth-convert.py

# convert CSV file (saved from Murray Sargent's word file) for Unicode -> Nemeth (and UEB) mappings
#  this file doesn't include digits and letters, so they are added separately
# FIX: this table seems incomplete when compared with SRE
# FIX: add SRE json contents from speech-rule-engine\mathmaps\nemeth when there is a translation
# FIX:   it would be good to add in the description, but that involves look up into the Unicode file
import csv
def write_nemeth_yaml(in_file, out_file):
    """Convert the Unicode -> Nemeth CSV table into a YAML file.

    The output consists of, in order: a YAML document marker, the digit and
    Latin letter entries (the CSV does not contain them), one entry per CSV
    row, and hand-written entries for space, non-breaking space, comma, and
    the invisible operators.

    Args:
        in_file: path of the UTF-8 CSV file; each row is
            code point, char, full name, Nemeth, UEB.
        out_file: path of the UTF-8 YAML file to write (opened in 'w' mode,
            so an existing file is replaced).
    """
    # Open the input first so that a missing or unreadable CSV does not
    # truncate an existing output file.  newline="" is what the csv module
    # asks for when it is handed a file object.
    with open(in_file, encoding="utf8", newline="") as csv_file:
        with open(out_file, 'w', encoding="utf8") as out_stream:
            csv_reader = csv.reader(csv_file, delimiter=',')
            out_stream.write("---\n")
            write_letters_and_digits(out_stream)
            # entries are a list of numeric code point, char, full name, Nemeth, UEB
            for entry in csv_reader:
                if not entry:
                    # a blank line in the CSV is read as an empty row
                    continue
                write_yaml_line(out_stream, unicode_char(entry), nemeth(entry),
                                code_point(entry), unicode_name(entry))

            # add space and non-breaking space; both map to the blank
            # braille cell (U+2800).  Escapes are used because these
            # characters cannot be told apart when written literally.
            write_yaml_line(out_stream, " ", "\u2800", "0020", "space")
            write_yaml_line(out_stream, "\u00a0", "\u2800", "00A0", "non-breaking space")

            # comma needs a context-dependent rule
            write_comma_line(out_stream)

            # add invisible chars inserted by canonicalization; they produce
            # no braille.  Written as escapes so they are visible in the
            # source and so the code point in the comment is plain hex digits.
            write_yaml_line(out_stream, "\u2061", "", "2061", "invisible function apply")
            write_yaml_line(out_stream, "\u2062", "", "2062", "invisible times")
            write_yaml_line(out_stream, "\u2063", "", "2063", "invisible separator")
            write_yaml_line(out_stream, "\u2064", "", "2064", "invisible plus")


def code_point(list):
    """Return the code point field (column 0) of a CSV row."""
    code_point_column = 0
    return list[code_point_column]

def unicode_char(list):
    """Return the character field (column 1) of a CSV row, escaped for quoting.

    A double quote or a backslash is returned with a backslash in front of
    it; every other value is returned unchanged.
    """
    char = list[1]
    needs_escape = char in ('"', '\\')
    return "\\" + char if needs_escape else char

def unicode_name(list):
    """Return the full name field (column 2) of a CSV row."""
    name_column = 2
    return list[name_column]

def nemeth(list):
    """Return the Nemeth field (column 3) of a CSV row."""
    nemeth_column = 3
    return list[nemeth_column]

def ueb(list):
    """Return the UEB field (column 4) of a CSV row."""
    ueb_column = 4
    return list[ueb_column]

def write_yaml_line(out_stream, char, nemeth, hex, unicode_name):
    """Write one YAML mapping entry followed by an explanatory comment.

    Args:
        out_stream: writable text stream receiving the line.
        char: the key of the entry (already escaped for double quotes).
        nemeth: the braille text placed in the entry's ``t:`` value.
        hex: code point digits shown after ``0x`` in the comment.
        unicode_name: name shown in parentheses in the comment.
    """
    entry = f' - "{char}": [t: "{nemeth}"]'
    comment = f'# 0x{hex} ({unicode_name})'
    # the entry is padded to a fixed width so that the comments line up
    out_stream.write(f'{entry:32}{comment}\n')

def write_letters_and_digits(out_stream):
    """Write the entries for 0-9, a-z and A-Z, in that order, to `out_stream`.

    Each table below holds one braille cell per character, in code point
    order starting at the first character of its range.
    """
    digit_cells = "⠴⠂⠆⠒⠲⠢⠖⠶⠦⠔"
    letter_cells = "⠁⠃⠉⠙⠑⠋⠛⠓⠊⠚⠅⠇⠍⠝⠕⠏⠟⠗⠎⠞⠥⠧⠺⠭⠽⠵"
    capital_indicator = "⠠"

    tables = (
        ('0', [cell for cell in digit_cells]),
        ('a', [cell for cell in letter_cells]),
        # a capital letter is the capital indicator followed by the small letter
        ('A', [capital_indicator + cell for cell in letter_cells]),
    )
    for first_char, cells in tables:
        write_range(out_stream, cells, first_char)


def write_range(out_stream, list, first_char):
    """Write one entry per item of `list` for consecutive characters.

    The item at position 0 is paired with `first_char`, the item at position
    1 with the next code point, and so on.  The comment of each entry shows
    the code point in lowercase hex without padding and an empty name.
    """
    for offset, braille in enumerate(list):
        code = ord(first_char) + offset
        write_yaml_line(out_stream, chr(code), braille, format(code, "x"), "")

def write_comma_line(out_stream):
    """Write the multi-line YAML entry for the comma to `out_stream`.

    The comma cannot be a simple one-line mapping: the entry contains a test
    on the parent element (msub, msup or msubsup) that selects between two
    different braille outputs.
    """
    # first line is padded like the one-line entries so the comment lines up
    header = ' - ",":'.ljust(32) + '# 0x002C (Comma)\n'
    rule_lines = (
        header,
        '     - test:\n',
        '         if: "parent::*[self::m:msub or self::m:msup or self::m:msubsup]"\n',
        '         then: [t: "⠪"]\n',
        '         else: [t: "⠂"]\n',
    )
    for line in rule_lines:
        out_stream.write(line)


# Run the conversion only when this file is executed as a script, so that
# importing it (e.g. to reuse the helpers) does not read or write any files.
if __name__ == "__main__":
    write_nemeth_yaml("nemeth.csv", "unicode.yaml")