[style] add convert_image_to_latex param
GNEHUY committed Mar 9, 2024
1 parent eb8d633 commit d235439
Showing 2 changed files with 14 additions and 8 deletions.
4 changes: 2 additions & 2 deletions EduNLP/SIF/sif.py

@@ -97,7 +97,7 @@ def to_sif(item, check_formula=True, parser: Parser = None):
 def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str = None, tokenization=True,
-            tokenization_params=None, errors="raise"):
+            tokenization_params=None, convert_image_to_latex=False, errors="raise"):
     r"""
     Default to use linear Tokenizer, change the tokenizer by specifying tokenization_params
@@ -260,7 +260,7 @@ def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str
         "Unknown mode %s, use only 0 or 1 or 2." % mode
     )

-    ret = seg(item, figures, symbol)
+    ret = seg(item, figures, symbol, convert_image_to_latex)

     if tokenization is True:
         ret = tokenize(ret, **(tokenization_params if tokenization_params is not None else {}))
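For orientation, a minimal usage sketch of the new flag at the sif4sci level (not part of the commit). It assumes EduNLP's documented import path "from EduNLP.SIF import sif4sci"; the item text and the symbol value "gm" are illustrative only:

    from EduNLP.SIF import sif4sci

    # Illustrative item; $...$ marks a formula in SIF notation.
    item = r"If $x^2 = 4$, then $x = \pm 2$."

    # convert_image_to_latex defaults to False (the default added in this
    # commit); passing True is forwarded to seg(), whose handling of the
    # flag is defined outside this diff.
    ret = sif4sci(item, symbol="gm", convert_image_to_latex=True)
    print(ret.tokens)  # the result carries a .tokens list, as AstFormulaTokenizer relies on below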
18 changes: 12 additions & 6 deletions EduNLP/Tokenizer/tokenizer.py

@@ -56,7 +56,8 @@ def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
         for item in items:
             yield self._tokenize(item, key=key, **kwargs)

-    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None, **kwargs):
+    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None,
+                  convert_image_to_latex=False, **kwargs):
         """Tokenize one item, return token list

         Parameters
@@ -67,7 +68,8 @@ def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None,
             determine how to get the text of item, by default lambda x: x
         """
         symbol = self.symbol if symbol is None else symbol
-        return tokenize(seg(key(item), symbol=symbol, figures=self.figures),
+        return tokenize(seg(key(item), symbol=symbol, figures=self.figures,
+                            convert_image_to_latex=convert_image_to_latex),
                         **self.tokenization_params, **kwargs).tokens
@@ -191,9 +193,11 @@ def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
         for item in items:
             yield self._tokenize(item, key=key, **kwargs)

-    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None, **kwargs):
+    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None,
+                  convert_image_to_latex=False, **kwargs):
         symbol = self.symbol if symbol is None else symbol
-        return tokenize(seg(key(item), symbol=symbol), **self.tokenization_params, **kwargs).tokens
+        return tokenize(seg(key(item), symbol=symbol, convert_image_to_latex=convert_image_to_latex),
+                        **self.tokenization_params, **kwargs).tokens

 class AstFormulaTokenizer(Tokenizer):

@@ -235,11 +239,13 @@ def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
         for item in items:
             yield self._tokenize(item, key=key, **kwargs)

-    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None, **kwargs):
+    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None,
+                  convert_image_to_latex=False, **kwargs):
         mode = kwargs.pop("mode", 0)
         symbol = self.symbol if symbol is None else symbol
         ret = sif4sci(key(item), figures=self.figures, mode=mode, symbol=symbol,
-                      tokenization_params=self.tokenization_params, errors="ignore", **kwargs)
+                      tokenization_params=self.tokenization_params, convert_image_to_latex=convert_image_to_latex,
+                      errors="ignore", **kwargs)
         ret = [] if ret is None else ret.tokens
         return ret
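Because each __call__ forwards **kwargs to _tokenize, the new flag can also be supplied when invoking a tokenizer directly. A hedged sketch (AstFormulaTokenizer is the class named in this diff; the import path and the no-argument constructor are assumptions):

    from EduNLP.Tokenizer import AstFormulaTokenizer

    tokenizer = AstFormulaTokenizer()  # constructor defaults assumed
    items = [r"If $x + y = 10$ and $x - y = 2$, find $x$."]

    # The keyword travels __call__ -> _tokenize -> sif4sci -> seg.
    for tokens in tokenizer(items, convert_image_to_latex=True):
        print(tokens)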
