Skip to content

Commit

Permalink
Added full set of names based on unicode.xml
Browse files Browse the repository at this point in the history
Changed from emitting dots to using text based on feedback

Renamed "EuroBraille" to "LaTeX"

Still need to look through name list some more.
  • Loading branch information
NSoiffer committed Mar 6, 2024
1 parent 3162691 commit b341e84
Show file tree
Hide file tree
Showing 12 changed files with 899 additions and 589 deletions.
114 changes: 114 additions & 0 deletions PythonScripts/euro-braille-short.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
<= \le kleiner gleich
<< \ll viel kleiner
>= \ge groesser gleich
>> \gg viel groesser
\8 \infty unendlich
\apx \approx etwa
\be \begin{equation}
\bea \begin{eqnarray}
\beas \begin{eqnarray*}
\bs \backslash Backslash
\cd \cdots mittige Dots
\da \downarrow einfacher Pfeil nach unten
\dd \ddots diagonale Dots
\Do \o kleines daenisches 'o'
\ee \end{equation}
\eea \end{eqnarray}
\eeas \end{eqnarray*}
\eqv \equiv aequivalent
\es \emptyset leere Menge
\ex \exists es existiert
\f \frac Bruch
\fa \forall fuer alle
\fmf Rahmen um einfache Formel
\fsf Rahmen um mehrzeilige Formel
\inn \int\nolimits Integral nolimits
\inl \int\limits Integral limits
\l< \langle Winkelklammer links
\la \leftarrow einfacher Pfeil nach links
\ld \ldots line Dots
\lgla \longleftarrow langer Pfeil nach links
\lglra \longleftrightarrow langer Pfeil nach links und rechts
\lgmt \longmapsto langer folgt-aus-Pfeil
\lgra \longrightarrow langer Pfeil nach rechts
\lra \leftrightarrow einfacher Pfeil nach links und rechts
\Lra \Leftrightarrow doppelter Pfeil nach links und rechts
\lt \leadsto fuehrt-zu-Pfeil
\mt \mapsto folgt-aus-Pfeil
\n \not Negation
\o+ \oplus
\o- \ominus
\o. \odot
\o/ \oslash
\oin \oint\nolimits Kreisintegral nolimits
\oil \oint\limits Kreisintegral limits
\ol \overline Ueberstrich
\ox \otimes
\pn \prod\nolimits Produkt nolimits
\pl \prod\limits Produkt limits
\Pl \l kleines polnisches 'l'
\pll \parallel parallel
\r> \rangle Winkelklammer rechts
\ra \rightarrow einfacher Pfeil nach rechts
\Ra \Rightarrow doppelter Pfeil nach rechts
\s \sqrt Wurzel
\sbs \subset Untermenge
\sbse \subseteq Untermenge und gleich
\sps \supset Obermenge
\spse \supseteq Obermenge und gleich
\sun \sum\nolimits Summe nolimits
\sul \sum\limits Summe limits
\tri \triangle Dreieck
\ua \uparrow einfacher Pfeil nach oben
\uda \updownarrow einfacher Pfeil nach unten und oben
\ul \underline Unterstrich
\vd \vdots vertikale Dots
\x \times
~a \alpha
~b \beta
~g \gamma
~d \delta
~e \epsilon
~z \zeta
~j \eta
~h \theta
~i \iota
~k \kappa
~l \lambda
~m \mu
~n \nu
~x \xi
~o \o
~p \pi
~r \rho
~s \sigma
~t \tau
~u \upsilon
~f \phi
~c \chi
~y \psi
~w \omega
~A \Alpha
~B \Beta
~G \Gamma
~D \Delta
~E \Epsilon
~Z \Zeta
~J \Eta
~H \Theta
~I \Iota
~K \Kappa
~L \Lambda
~M \Mu
~N \Nu
~X \Xi
~O \O
~P \Pi
~R \Rho
~S \Sigma
~T \Tau
~U \Upsilon
~F \Phi
~C \Chi
~Y \Psi
~W \Omega
172 changes: 131 additions & 41 deletions PythonScripts/euro-braille.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from html_table_extractor.extractor import Extractor
from typing import TextIO
import sys
from string import ascii_uppercase, ascii_lowercase
import xml.etree.ElementTree as ET
import re
sys.stdout.reconfigure(encoding='utf-8')
Expand Down Expand Up @@ -79,7 +80,28 @@ def get_unicode_yaml_chars() -> set[str]:
UNICODE_CHARS_SHORT = get_unicode_yaml_chars()


def get_short_dict() -> dict[str, str]:
    """Build a map from LaTeX names to their short names.

    Reads ``euro-braille-short.csv`` from the current working directory.
    Every line is split on whitespace; the first field is taken as the short
    name and the second field as the LaTeX name. Any further fields (the
    free-text description) are ignored.

    Lines with fewer than two fields (blank lines, or a short name with
    nothing after it) are skipped rather than raising ``IndexError``.

    After the file has been read, an entry ``\\mathbb{X}`` -> ``\\X`` is added
    for every ASCII letter ``X``; these override same-named file entries.

    Returns:
        A dict keyed by LaTeX name whose values are the short names. When a
        LaTeX name occurs on several lines, the last occurrence wins.
    """
    answer: dict[str, str] = {}
    with open("euro-braille-short.csv", "r", encoding='utf8') as stream:
        for line in stream:
            # split() with no argument drops empty fields and the trailing
            # newline, so the fields need no further filtering or stripping
            fields = line.split()
            if len(fields) < 2:
                continue  # nothing usable on this line
            short_name, latex_name = fields[0], fields[1]
            answer[latex_name] = short_name

    # blackboard-bold letters are not listed in the file
    for ch in ascii_lowercase + ascii_uppercase:
        answer[f'\\mathbb{{{ch}}}'] = f'\\{ch}'
    return answer


def extract_latex(in_file):
short_names = get_short_dict()
overrides = {
"*": "*", "{": "\\{", "}": "\\}", "|": "|",
"°": "°", "ϵ": "\\epsilon", "≠": "\\not=", # varepsilon
"′": "'", "″": "''", "‴": "'''",
"△": "\\triangle", "→": "\\to",
}

tree = ET.parse(in_file)
root = tree.getroot()
all_chars = root.find("charlist")
Expand All @@ -90,35 +112,69 @@ def extract_latex(in_file):
full_stream.write("---\n")
for char in all_chars:
ch = convert_to_char(char.get("id"))
if len(ch) == 1 and ord(ch) < 128:
if len(ch) > 1:
continue
code = ord(ch)
if code < 0x20:
continue
if ch in overrides:
latex_name = overrides[ch]
write_line(ch, latex_name, short_names.get(latex_name, ''), short_stream)
continue

# add in ASCII and the Greek block
stream = short_stream if ch in UNICODE_CHARS_SHORT or code < 0x7F or (0x0370 <= code and code <= 0x03fF) else full_stream

stream = short_stream if ch in UNICODE_CHARS_SHORT else full_stream
latex = char.find("latex")
var_latex = char.find("varlatex")
ams_latex = char.find("ams")
math_latex = char.find("mathlatex")
# if latex is None and not(var_latex is None and math_latex is None):
# print(f"No latex for ch: {ch}/{char.get('id')}" +
# "" if var_latex is None else f"var_latex={var_latex.text}" +
# "" if math_latex is None else f"math_latex={math_latex.text}"
# )
# continue

names_seen = []
for latex_name in [latex, var_latex, ams_latex, math_latex]:
# I wish there was a simple way to choose the names.
# Based on what David Carlisle (who maintains unicode.xml) recommends,
# 'math_latex' is the preferred field except for the alphabets (I only exclude Greek and math alphanumerics)
# For those, math_latex is more technically correct but not what most latex users are accustomed to
names_seen: list[str] = []
for style in ["mathlatex", "latex", "varlatex", "ams"]:
latex_name = char.find(style)
if latex_name is None:
continue
latex_name = latex_name.text.strip()
if latex_name in names_seen:
latex_name:str = latex_name.text.strip()
# the fontencoding char won't happen and the \unicode (two ellipsis entries) have short names for the latex style
if latex_name.startswith('{\\fontencoding{') or latex_name.startswith('\\unicode'):
continue
if latex_name.startswith('\\up') and "\\" + latex_name[3:] in names_seen: # "\upiota", etc, is skipped
if not latex_name.startswith('\\') and not latex_name.startswith('{') and code >= 0x7F:
latex_name = '\\' + latex_name # some are missing the initial \
if latex_name.startswith('\\mathchar'):
continue # seems to happen once -- not sure what that is about
if style == 'mathlatex':
if code < 0x7F:
continue # use the latex names
if 0x0370 <= code and code <= 0x03fF:
continue # Greek block
if 0x1D400 <= code and code <= 0x1D7FF:
continue # alphanumerics
if latex_name.startswith('\\Bbb'): # some blackboard chars (ℝ, etc) not in math alphanumerics
continue
if latex_name.startswith('\\mbox'):
continue # the alternative name avoids that and so is better
if latex_name.lower().find('theta') != -1:
latex_name = latex_name.replace("text", "") # don't care about upright theta
elif ch == '$':
latex_name = '\\$'
elif ch == '\\':
latex_name = '\\backslash' # avoid '\textbackslash'
elif latex_name.startswith("\\mitBbb"):
latex_name = latex_name.replace("\\mitBbb", "") # exponential e, etc
if latex_name in names_seen:
continue
if len(names_seen) > 0:
stream.write('# ') # alternative name
write_line(ch, latex_name, stream)
write_line(ch, latex_name, short_names.get(latex_name, ''), stream)
names_seen.append(latex_name)

# write the invisible chars out
short_stream.write('\n # invisible chars\n')
write_line(chr(0x2061), '', '', short_stream)
write_line(chr(0x2062), '', '', short_stream)
write_line(chr(0x2063), '', '', short_stream)
write_line(chr(0x2064), '', '', short_stream)


def convert_to_char(str: str) -> str:
# str is 'Uddddd' or 'Uddddd-ddddd'
Expand All @@ -127,29 +183,14 @@ def convert_to_char(str: str) -> str:
for char_str in str.split("-"):
# FIX: need to add backslash is str becomes ""
ch = chr(int(char_str, base=16))
if (ch == '"' or ch == '\\'):
answer += "\\"
# if (ch == '"' or ch == '\\'):
# answer += "\\"
answer += ch

return answer


def create_greek_letters(out_file: str):
# the HTML file has rowspans in it -- hence the use of table extractor
with open("greek-letters.txt", encoding='utf8') as in_stream:
with open(out_file, 'w', encoding='utf8') as out_stream:
all_entries = []
lines = in_stream.readlines()
for line in lines:
parts = line.split('\t')
if parts[1].startswith('\\'): # ignore 'A', etc., which don't have latex commands
all_entries.append((parts[0].strip(), parts[1].strip()))
all_entries = sorted(all_entries)
for unicode, latex in all_entries:
write_line(unicode, latex, out_stream)


def write_line(ch: str, latex: str, out_stream: TextIO):
def write_line(ch: str, latex: str, short: str, out_stream: TextIO):
def hex_string(ch: str) -> str:
comment = ''
if ch == '\\\\' or ch == '\\"':
Expand All @@ -160,21 +201,70 @@ def hex_string(ch: str) -> str:
comment = "0" + ch[1:]
return comment

if ord(ch) < 0x7F and len(latex) <= 1:
return # probably an ASCII char

if ch == '"':
ch = '\\"'
elif ch == '\\':
ch = '\\\\'
elif ch == '\\127':
ch = '\\x7F'
space = '' if ch.startswith('\\') and not(ch.endswith('}')) else ' '
braille = ascii_to_euro_braille(latex + space)
first_part = f' - "{ch}": [t: "{braille}"]'
elif ch == "°":
latex = "°" # special case in their code
short_space = '𝐖' if short.startswith('\\') and not short.endswith('}') and len(short) > 2 else ''
long_space = '𝐖' if latex.startswith('\\') and not latex.endswith('}') and len(latex) > 2 else ''
try:
out_stream.write('{:40}# {} ({})\n'.format(first_part, hex_string(ch), latex))
# write untranslated text
latex = latex.replace('\\', '\\\\').replace('"', '\\"')
short = short.replace('\\', '\\\\').replace('"', '\\"')
if short == '':
first_part_char = f' - "{ch}": [t: "{latex + long_space}"]'
out_stream.write(f'{first_part_char:<40} # {hex_string(ch)}\n')
else:
first_part_char = f' - "{ch}":'
first_part_short = f' then: [t: "{short + short_space}"]'
first_part_long = f' else: [t: "{latex + long_space}"]'
out_stream.write(f'{first_part_char:<40} # {hex_string(ch)}\n')
out_stream.write(' - test:\n')
out_stream.write(' if: "$LaTeX_UseShortName"\n')
out_stream.write(f'{first_part_short}\n')
out_stream.write(f'{first_part_long}\n') # not sure why, but this gives better alignment
# write the translated dots
# braille = ascii_to_euro_braille(latex + space)
# if short == '':
# first_part_char = f' - "{ch}": [t: "{braille}"]'
# out_stream.write(f'{first_part_char:<40} # {hex_string(ch)} ({latex})\n')
# else:
# short_braille = ascii_to_euro_braille(short+space) # fix spacing
# first_part_char = f' - "{ch}":'
# first_part_short = f' else: [t: "{short_braille}"]'
# first_part_long = f' then: [t: "{braille}"]'
# out_stream.write(f'{first_part_char:<40} # {hex_string(ch)}\n')
# out_stream.write(' - test:\n')
# out_stream.write(' if: "$LaTeX_UseShortName=\'True\'"\n')
# out_stream.write(f'{first_part_long:<34} # {latex}\n') # not sure why, but this gives better alignment
# out_stream.write(f'{first_part_short:<36} # {short}\n')
except:
print(f"failed to write a line for ch='{ch}/{hex_string(ch)}'")


def create_greek_letters(out_file: str):
    """Write an entry to *out_file* for each Greek letter that has a LaTeX command.

    Reads ``greek-letters.txt`` from the current working directory. Each line
    is split on tabs; the first field and the second field are paired, and
    only rows whose second field starts with a backslash are kept. The kept
    rows are sorted and each one is emitted through ``write_line``.

    Args:
        out_file: path of the file to create (or overwrite).
    """
    all_entries = []
    with open("greek-letters.txt", encoding='utf8') as in_stream:
        for line in in_stream:
            parts = line.split('\t')
            # ignore 'A', etc., which don't have latex commands;
            # lines without a second tab-separated field are skipped as well
            if len(parts) > 1 and parts[1].startswith('\\'):
                all_entries.append((parts[0].strip(), parts[1].strip()))
    all_entries.sort()

    with open(out_file, 'w', encoding='utf8') as out_stream:
        for char, latex in all_entries:
            # write_line takes (ch, latex, short, out_stream); this input has
            # no short-name column, so pass an empty short name
            write_line(char, latex, '', out_stream)




# create_unicode_from_list_of_symbols_html("euro-symbols2.yaml")
# create_greek_letters("greek-letters.yaml")
Expand Down
Loading

0 comments on commit b341e84

Please sign in to comment.