forked from acl-org/acl-anthology
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtexmath.py
233 lines (213 loc) · 9.05 KB
/
texmath.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# -*- coding: utf-8 -*-
#
# Copyright 2019 Marcel Bollmann <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from lxml import etree
import csv
import logging as log
import os
from TexSoup import TexSoup
from TexSoup.data import TexCmd, TexText, TexGroup
# Commands rendered as named functions (emitted inside a
# "tex-math-function" span by TexMath._parse_command).
FUNCTION_NAMES = ("lim", "log")

# TeX commands that translate directly into one HTML element, given as
# (tag name, attribute dict).  Entries are grouped by the tag they produce;
# every key gets its own attribute dict so none is shared between keys.
TEX_TO_HTML = {
    # upright / roman text
    **{
        cmd: ("span", {"class": "font-weight-normal"})
        for cmd in ("mathrm", "textrm", "text")
    },
    # bold
    **{cmd: ("strong", {}) for cmd in ("mathbf", "textbf", "boldsymbol")},
    # italics / emphasis
    **{cmd: ("em", {}) for cmd in ("mathit", "textit", "emph")},
    # one-off mappings
    "textsc": ("span", {"style": "font-variant: small-caps;"}),
    "texttt": ("span", {"class": "text-monospace"}),
    "textsubscript": ("sub", {}),
    "textsuperscript": ("sup", {}),
}

# Old-style font switches that are dropped from the output when they carry
# no arguments.
REMOVED_COMMANDS = ("bf", "rm", "it", "sc")
def _append_text(text, trg):
    """Append ``text`` to the end of the textual content of ``trg``.

    If ``trg`` has child elements, the text is added to the tail of the last
    child; otherwise it is added to ``trg.text``.  Existing text in that
    position is kept and extended.  Empty or None ``text`` is a no-op.
    """
    if not text:
        return
    # Pick the slot where trailing text lives for this element.
    if len(trg) > 0:
        node, slot = trg[-1], "tail"
    else:
        node, slot = trg, "text"
    existing = getattr(node, slot)
    setattr(node, slot, text if existing is None else existing + text)
class TexMath:
    """Interpreter and converter for TeX inline math expressions.

    This class uses TexSoup (https://github.com/alvinwan/TexSoup) to parse a TeX
    expression and converts it to valid HTML.  The conversion combines a small
    number of handwritten rules with a mapping of LaTeX math mode commands to
    Unicode symbols (http://milde.users.sourceforge.net/LUCR/Math/).  Parts that
    cannot be interpreted using this simple method are preserved as raw LaTeX.
    """

    def __init__(self, symbolsfile=None):
        """Create a converter and load its command-to-Unicode table.

        Arguments:
            symbolsfile: Path of the symbol table to load.  When None, the
                file "unimathsymbols.txt" in this module's directory is used.
        """
        self.cmd_map = {}
        if symbolsfile is None:
            symbolsfile = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "unimathsymbols.txt"
            )
        self.load_symbols(symbolsfile)

    def load_symbols(self, filename):
        """Add the entries of a "^"-delimited symbol table to ``self.cmd_map``.

        Each data row must have eight columns; column 2 holds the character
        and column 3 the TeX command.  If the last column has the form
        "= \\cmd, ...", that alternative command is mapped to the same
        character.  Commands are stored without their leading backslash.
        Rows starting with "#" and blank lines are skipped.

        Arguments:
            filename: Path of the UTF-8 encoded symbol table.
        """
        with open(filename, "r", encoding="utf8") as f:
            reader = csv.reader(f, delimiter="^")
            for row in reader:
                # csv.reader yields an empty list for blank lines, which
                # must not be indexed.
                if not row or row[0].startswith("#"):  # blank or comment
                    continue
                assert len(row) == 8, "Expect eight-column format"
                char, cmd = row[1], row[2]
                if cmd.startswith("\\"):
                    self.cmd_map[cmd[1:]] = char
                if row[-1].startswith("= ") and ", " in row[-1]:
                    # last column sometimes contains alternative command
                    cmd = row[-1][2:].split(", ")[0]
                    if cmd.startswith("\\"):
                        self.cmd_map[cmd[1:]] = char

    def _parse(self, everything, trg):
        """Parses a list of TeX constituents into an lxml.etree._Element.

        Arguments:
            everything: An iterator over TeX constituents as provided by TexSoup
            trg: The lxml.etree._Element to parse the expression into

        The approach of iterating over the TeX constituents roughly follows
        <https://github.com/alvinwan/TexSoup/blob/master/examples/list_everything.py>.
        """
        sxscript = False  # Tracks whether we're in a subscript/superscript
        for code in everything:
            if isinstance(code, TexCmd):
                # code is a TeX command
                # NOTE(review): a pending sub/superscript is neither applied
                # to the command nor cleared here, so it carries over to a
                # later group -- confirm whether that is intended.
                self._parse_command(code, trg)
            elif isinstance(code, (TexText, str)):
                # code is text; it may end with a dangling "^" or "_"
                sxscript = self._parse_text(code, trg)
            elif isinstance(code, TexGroup):
                # If in subscript/superscript, wrap the entire element in respective tag
                if sxscript:
                    my_trg = etree.Element(sxscript)
                    self._parse(code.contents, my_trg)
                    trg.append(my_trg)
                    sxscript = False
                # Otherwise, just parse it normally
                else:
                    self._parse(code.contents, trg)
            else:
                log.error(f"TeX-math parser got unhandled element: {type(code)}")

    def _parse_command(self, code, trg):
        """Convert a single TeX command and append the result to ``trg``.

        Arguments:
            code: The TexSoup command node (provides ``name`` and ``args``)
            trg: The lxml.etree._Element to append the output to
        """
        args = list(code.args)
        name = str(code.name)
        # Check if the command is in the list of known Unicode mappings
        if name in self.cmd_map:
            _append_text(self.cmd_map[name], trg)
            self._parse(args, trg)
        # Check if command + arguments is in the list of known Unicode mappings
        # (this covers commands like "\mathcal{A}", which have their own entries)
        elif str(code)[1:] in self.cmd_map:
            _append_text(self.cmd_map[str(code)[1:]], trg)
        # Check if command is a known function name (e.g. "log")
        elif name in FUNCTION_NAMES:
            sx = etree.Element("span")
            sx.attrib["class"] = "tex-math-function"
            sx.text = str(name)
            trg.append(sx)
            self._parse(args, trg)
        # Handle fractions
        elif name == "frac":
            self._parse_fraction(args, trg)
        # Handle commands with simple HTML tag substitutions
        elif name in TEX_TO_HTML:
            elem_name, elem_attrib = TEX_TO_HTML[name]
            sx = etree.Element(elem_name, attrib=elem_attrib)
            self._parse(args, sx)
            trg.append(sx)
        # Known, but unsupported formatting tags that will just be removed
        elif name in REMOVED_COMMANDS and not args:
            pass
        # Give up, but preserve element
        else:
            log.warning(f"Unknown TeX-math command: {code}")
            self._append_unparsed(code, trg)

    def _parse_fraction(self, args, trg):
        """Render ``\\frac`` as <sup>numerator</sup>, U+2044, <sub>denominator</sub>.

        If ``args`` does not contain exactly two arguments, a warning is
        logged and the command is preserved as unparsed text instead.

        Arguments:
            args: List of the arguments of the ``\\frac`` command
            trg: The lxml.etree._Element to append the output to
        """
        if len(args) != 2:
            log.warning(f"Couldn't parse \\frac: got {len(args)} arguments, expected 2")
            # Only the argument list is available here (no command node), so
            # rebuild the raw source from str() of each argument.
            # NOTE(review): assumes str(arg) gives the argument's TeX source.
            raw = "\\frac" + "".join(str(arg) for arg in args)
            self._append_raw(raw, trg)
        else:
            # Represent numerator of fraction as superscript
            sx = etree.Element("sup")
            self._parse([args[0]], sx)
            trg.append(sx)
            # Unicode symbol for fraction slash
            _append_text("\u2044", trg)
            # Represent denominator of fraction as subscript
            sx = etree.Element("sub")
            self._parse([args[1]], sx)
            trg.append(sx)

    def _parse_text(self, code, trg):
        """Append a text constituent to ``trg``, handling "^" and "_".

        Arguments:
            code: The text constituent; converted with str()
            trg: The lxml.etree._Element to append the output to

        Returns:
            "sup" or "sub" if the text ends with a "^" or "_" that still
            needs an operand (to be supplied by the next group), otherwise
            False.
        """
        text = str(code)
        # TexSoup doesn't parse any non-alpha command as a command. Ex: \$
        # However it does separate them into their own text part. Ex: 'r\\&dd' -> ['r', '\\&', 'dd']
        # Therefore try to do command mapping replacement of all text beginning with \ and of length 2
        if len(text) == 2 and text[0] == '\\':
            text = self.cmd_map.get(text[1], text)
            _append_text(text, trg)
            return False
        # parse ^ and _ (won't get recognized as separate nodes by TexSoup)
        sxscript = False
        if "^" in text or "_" in text:
            buf = ""
            for char in text:
                if char == "^" or char == "_":
                    # Flush the text collected so far, then remember which
                    # kind of script the next character belongs to
                    _append_text(buf, trg)
                    buf = ""
                    sxscript = "sup" if char == "^" else "sub"
                elif sxscript:
                    # The single character right after ^ or _ is the operand
                    sx = etree.Element(sxscript)
                    sx.text = char
                    trg.append(sx)
                    sxscript = False
                else:
                    buf += char
            text = buf
        # Append as text
        _append_text(text, trg)
        return sxscript

    def _append_raw(self, raw, trg):
        """Append ``raw`` to ``trg`` inside a "tex-math-unparsed" span."""
        pre = etree.Element("span")
        pre.attrib["class"] = "tex-math-unparsed"
        pre.text = raw
        trg.append(pre)

    def _append_unparsed(self, code, trg):
        """Preserve a command that could not be converted as raw LaTeX.

        Arguments:
            code: Command node providing ``name`` and ``args`` attributes
            trg: The lxml.etree._Element to append the output to
        """
        self._append_raw(f"\\{code.name}{code.args}", trg)

    def _element_from_string(self, text):
        """Wrap a TeX string in a <span> element.

        The string is first parsed as XML content, so entities such as
        "&lt;" are decoded.  TeX that is not well-formed XML (e.g. a raw
        "<" or "&") is used verbatim as the element text instead.
        """
        try:
            return etree.fromstring(f"<span>{text}</span>")
        except etree.XMLSyntaxError:
            elem = etree.Element("span")
            elem.text = text
            return elem

    def etree_to_html(self, element):
        """Convert the TeX expression in ``element.text`` to a new element.

        Arguments:
            element: An lxml.etree._Element whose text is the TeX expression

        Returns:
            A new <span class="tex-math"> element holding the converted
            markup, with the tail of ``element`` copied over.
        """
        result = etree.Element("span")
        result.attrib["class"] = "tex-math"
        result.tail = element.tail  # Preserve tail
        # An element without text has nothing to parse.
        if element.text:
            self._parse(TexSoup(element.text).expr.all, result)
        return result

    def to_html(self, element):
        """Converts a TeX math expression to HTML markup.

        Arguments:
            element: Either an lxml.etree._Element or a str

        Returns:
            An lxml.etree._Element if one was given; the serialized markup
            as returned by etree.tostring() if a str was given.

        Raises:
            NotImplementedError: If ``element`` has any other type.
        """
        if isinstance(element, etree._Element):
            return self.etree_to_html(element)
        elif isinstance(element, str):
            value = self.etree_to_html(self._element_from_string(element))
            return etree.tostring(value)
        raise NotImplementedError(f"Cannot convert elements of type {type(element)}")

    def to_unicode(self, element):
        """Converts a TeX math expression to a Unicode string.

        This will perform the same conversions as `to_html()`, but strip out the
        HTML tags afterwards.
        """
        # to_html() serializes str input; convert it to an element first so
        # that the text extraction below always operates on an element.
        if isinstance(element, str):
            element = self._element_from_string(element)
        element = self.to_html(element)
        return etree.tostring(element, encoding="unicode", method="text")