diff --git a/EduNLP/SIF/sif.py b/EduNLP/SIF/sif.py
index 9c7bb3d6..70a8404e 100644
--- a/EduNLP/SIF/sif.py
+++ b/EduNLP/SIF/sif.py
@@ -97,7 +97,7 @@ def to_sif(item, check_formula=True, parser: Parser = None):
 
 
 def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str = None, tokenization=True,
-            tokenization_params=None, errors="raise"):
+            tokenization_params=None, convert_image_to_latex=False, errors="raise"):
     r"""
     Default to use linear Tokenizer, change the tokenizer by specifying tokenization_params
 
@@ -260,7 +260,7 @@ def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str
                 "Unknown mode %s, use only 0 or 1 or 2." % mode
             )
 
-    ret = seg(item, figures, symbol)
+    ret = seg(item, figures, symbol, convert_image_to_latex)
 
     if tokenization is True:
         ret = tokenize(ret, **(tokenization_params if tokenization_params is not None else {}))
diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py
index f9e755d7..8bd0215b 100644
--- a/EduNLP/Tokenizer/tokenizer.py
+++ b/EduNLP/Tokenizer/tokenizer.py
@@ -56,7 +56,8 @@ def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
         for item in items:
             yield self._tokenize(item, key=key, **kwargs)
 
-    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None, **kwargs):
+    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None,
+                  convert_image_to_latex=False, **kwargs):
         """Tokenize one item, return token list
 
         Parameters
@@ -67,7 +68,8 @@ def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None,
            determine how to get the text of item, by default lambdax: x
         """
         symbol = self.symbol if symbol is None else symbol
-        return tokenize(seg(key(item), symbol=symbol, figures=self.figures),
+        return tokenize(seg(key(item), symbol=symbol, figures=self.figures,
+                            convert_image_to_latex=convert_image_to_latex),
                         **self.tokenization_params, **kwargs).tokens
 
 
@@ -191,9 +193,11 @@ def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
         for item in items:
             yield self._tokenize(item, key=key, **kwargs)
 
-    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None, **kwargs):
+    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None,
+                  convert_image_to_latex=False, **kwargs):
         symbol = self.symbol if symbol is None else symbol
-        return tokenize(seg(key(item), symbol=symbol), **self.tokenization_params, **kwargs).tokens
+        return tokenize(seg(key(item), symbol=symbol, convert_image_to_latex=convert_image_to_latex),
+                        **self.tokenization_params, **kwargs).tokens
 
 
 class AstFormulaTokenizer(Tokenizer):
@@ -235,11 +239,13 @@ def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
         for item in items:
             yield self._tokenize(item, key=key, **kwargs)
 
-    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None, **kwargs):
+    def _tokenize(self, item: Union[str, dict], key=lambda x: x, symbol: str = None,
+                  convert_image_to_latex=False, **kwargs):
         mode = kwargs.pop("mode", 0)
         symbol = self.symbol if symbol is None else symbol
         ret = sif4sci(key(item), figures=self.figures, mode=mode, symbol=symbol,
-                      tokenization_params=self.tokenization_params, errors="ignore", **kwargs)
+                      tokenization_params=self.tokenization_params, convert_image_to_latex=convert_image_to_latex,
+                      errors="ignore", **kwargs)
         ret = [] if ret is None else ret.tokens
         return ret
 
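
Usage sketch (not part of the patch): how the new convert_image_to_latex flag
can be passed at the two entry points the diff touches, assuming the module
paths shown above and that AstFormulaTokenizer is constructible with its
defaults; the sample item is illustrative.

    from EduNLP.SIF.sif import sif4sci
    from EduNLP.Tokenizer.tokenizer import AstFormulaTokenizer

    # Illustrative SIF-style item with an inline formula.
    item = r"If $x^2 + y = 1$, then the maximum of $y$ is"

    # Direct call: the flag is forwarded to seg(), which performs the
    # segmentation; with errors="ignore" a malformed item yields None.
    ret = sif4sci(item, mode=2, convert_image_to_latex=True, errors="ignore")
    print(ret.tokens if ret is not None else [])

    # Through a tokenizer: __call__ forwards **kwargs to _tokenize, which now
    # accepts convert_image_to_latex and threads it down to sif4sci()/seg().
    tokenizer = AstFormulaTokenizer()
    print(next(tokenizer([item], convert_image_to_latex=True)))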